In [13]:
from bs4 import BeautifulSoup
import requests
import lxml
import time
from selenium import webdriver
import os
import random
import pyprind
from pprint import pprint
import pandas as pd
import os

In [14]:
def get_domain_and_company(node):
    company_node = node.find(class_= "company")
    domain_profile = company_node.select('.heading_4_5.profile')
    company_name = company_node.select('.heading_6.company_name')
    return {'domain_profile': domain_profile[0].text.strip(), 'company_name' : company_name[0].text.strip() }



In [15]:
def get_location(node):
    container_node = node.find(class_='individual_internship_details')
    location_node = container_node.select('.location_link')

    if(len(location_node) >0):
        location = location_node[0].text.strip()
        return {"location" : location}

    return None




In [16]:
def get_job_metadata(node):
    row_nodes = node.select(".other_detail_item")
    info = dict()
    for row in row_nodes:
        head = row.find(class_="item_heading")
        details = row.find(class_="item_body")
        if head != None and details != None:
            info[head.text.strip()] = details.text.strip()
        
    num_applicants= node.find(class_="applications_message")
    if num_applicants != None:
        info["applicants"]= num_applicants
    
    return info





In [17]:
def get_job_details(node):
    job_details_node = node.find(class_='internship_details')
    section_headings = job_details_node.select('.section_heading')
    section_content = job_details_node.select('.text-container, .round_tabs_container')

    n_head= len(section_headings)
    n_content = len(section_content)

    contents = dict()


    for heading, content in zip(section_headings, section_content):
        key = heading.text.strip()
        contents[key] = content.text.strip()

    diff= n_content- n_head

    if diff>0:
        while diff>0:
            contents[key] = section_content[n_content - diff ]
            diff -= 1

    old_key = list(contents.keys())[n_head-1]
    contents['company_details'] = contents[old_key]
    del contents[old_key]

    for key in contents:
        if type(contents[key]) is list:
            contents[key] = ';'.join(contents[key])

    return contents

    

    












In [18]:
ROOT_PATH = './raw/internshala/job_post_pages'
all_pages = os.listdir(ROOT_PATH)

all_pages_data = []
bar = pyprind.ProgBar(len(all_pages))
for page_name in all_pages:
#     try:
    combined_path = ROOT_PATH + '/' + page_name
    data = dict()
    soup = BeautifulSoup(open(combined_path))
    container_node = soup.find(class_=['detail_view'])
    location = get_location(container_node)
    dnc = get_domain_and_company(container_node)
    details = get_job_details(container_node)
    metadata = get_job_metadata(container_node)
    data.update(location)
    data.update(dnc)
    data.update(details)
    data.update(metadata)
    all_pages_data.append(data)
# except Exception as e:
#         print('Exception = ' + str(e))
    bar.update()

print(bar)












KeyboardInterrupt: 

In [None]:
s= set()
for x in all_pages_data:
    for k in x.keys():
        s.add(k)
        
s 

{'About the job',
 'Apply By',
 'CTC (ANNUAL)',
 'Editor’s note',
 'Experience',
 'Number of openings',
 'Other requirements',
 'Perks',
 'Salary',
 'Skill(s) required',
 'Start date',
 'Who can apply',
 'applicants',
 'company_details',
 'company_name',
 'domain_profile',
 'location'}

In [None]:
df = pd.DataFrame(all_pages_data)


In [None]:
df.head()

Unnamed: 0,location,domain_profile,company_name,About the job,Who can apply,Salary,company_details,Start date,CTC (ANNUAL),Experience,Apply By,applicants,Skill(s) required,Perks,Number of openings,Other requirements,Editor’s note
0,Gurgaon,Web Developer,Toolify Private Limited,Company Overview\n\nToolify Private Limited is...,1. Candidates with minimum 4 years of experience.,"Annual CTC: ₹ 8,00,000 - 12,00,000 /year",Information above is Internshala's interpretat...,Starts Immediately,"₹ 8,00,000 - 12,00,000 ...",4-6 years,28 Apr' 24,[0 applicants],,,,,
1,Indore,3d Environment Artist,Mdroid Network Private Limited,We are looking for a passionate and talented 3...,1. Candidates with minimum 1 years of experience.,"Annual CTC: ₹ 2,00,000 /year",We are a character-driven animation studio str...,Starts Immediately,"₹ 2,00,000 \n\n...",1 year,9 Mar' 24,[Be an early applicant],Autodesk Maya,Informal dress code,1.0,,
2,Chennai,PHP Developer,CN Solutions,Company Overview\n\nCN Solutions is a leading ...,1. Candidates with minimum 4 years of experience.,Annual CTC: Competitive salary,Information above is Internshala's interpretat...,Starts Immediately,Competitive salary ...,4-6 years,18 Mar' 24,[0 applicants],,,,,
3,Bangalore,WordPress Developer,Webfluence,Key responsibilities: \n\n1. Design and develo...,,"Annual CTC: ₹ 4,20,000 - 6,00,000 /year",[\n Webfluence is a leading digital mar...,Starts Immediately,"₹ 4,20,000 - 6,00,000 ...",0-5 years,23 Mar' 24,[121 applicants],CSS\nHTML\nJavaScript\nNode.js\nPython\nWordPress,5 days a week,3.0,"1. Bachelor’s degree in computer science, info...",
4,Work from home,Junior Machine Learning Specialist,"HackerPulse (San Francisco, United States)",Hackerpulse is a new and growing company. We h...,1. Candidate must be available to work from 12...,"Annual CTC: $ 15,000 - 35,000 /year",[\n We at HackerPulse help software eng...,Starts Immediately,"$ 15,000 - 35,000 ...",0-2 years,6 Mar' 24,[1000+ applicants],Data Analytics\nData Science\nDeep Learning\nM...,,2.0,1. Degree in Computer Science or related disci...,


In [None]:
df.drop("Editor’s note",axis=1,inplace=True)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573 entries, 0 to 572
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   location            573 non-null    object
 1   domain_profile      573 non-null    object
 2   company_name        573 non-null    object
 3   About the job       573 non-null    object
 4   Who can apply       314 non-null    object
 5   Salary              573 non-null    object
 6   company_details     573 non-null    object
 7   Start date          573 non-null    object
 8   CTC (ANNUAL)        573 non-null    object
 9   Experience          573 non-null    object
 10  Apply By            573 non-null    object
 11  applicants          573 non-null    object
 12  Skill(s) required   446 non-null    object
 13  Perks               276 non-null    object
 14  Number of openings  446 non-null    object
 15  Other requirements  176 non-null    object
dtypes: object(16)
memory usage

In [None]:
df.columns
rena=dict()
for col in df.columns.to_list():
    key = col
    col = col.replace(' ','_').lower()
    rena[key] = col


print(rena)



{'location': 'location', 'domain_profile': 'domain_profile', 'company_name': 'company_name', 'About the job': 'about_the_job', 'Who can apply': 'who_can_apply', 'Salary': 'salary', 'company_details': 'company_details', 'Start date': 'start_date', 'CTC (ANNUAL)': 'ctc_(annual)', 'Experience': 'experience', 'Apply By': 'apply_by', 'applicants': 'applicants', 'Skill(s) required': 'skill(s)_required', 'Perks': 'perks', 'Number of openings': 'number_of_openings', 'Other requirements': 'other_requirements'}


In [None]:
df.rename(columns=rena,inplace=True)


In [None]:
df.head()

NameError: name 'df' is not defined

In [None]:
df.to_csv('internshala-cs-jobs.csv')