### Scraping Naukri's Data Science/Analytics Jobs Data

**Python code using ```BeautifulSoup``` to scrape data**     
The motive is to find the recent demand for data science roles in India, as well as to extract important insights such as salary, experience, tools, etc. related to data science job profiles.

In [8]:
%matplotlib inline
import _pickle as cPickle
import math
import time
from collections import OrderedDict

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
# Notebook display settings: wide frames, up to 100 columns, HTML tables
pd.options.display.width = 500
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True
# Seaborn theme used for the plots: white grid with "poster"-sized elements
sns.set_style("whitegrid")
sns.set_context("poster")

### IMPORTANT
1. We are going to query jobs by different keywords such as Data Scientist, Data Analyst, Business Analyst, Machine Learning, etc.
1. Although there are many pages for each keyword, after 20-25 pages the jobs are no longer really for data scientist (or whichever keyword was searched); they matched only because some portion of the keyword (data-scientist-analyst-business-machine-learning) matched.
1. So for each keyword, instead of getting all the pages of data, I will simply get the first 20-30 pages of data.

In [None]:
# Scrape one keyword's search-result pages and every job page they link to.
# Results of this cell:
#   job_df       - one row per job whose page could be parsed
#   job_links_df - one row per job link visited, with the HTTP response, so
#                  that links we failed to scrape can be identified later
# Rows are collected in plain lists and turned into frames once at the end;
# growing a DataFrame row by row copies it on every step, and
# DataFrame.append no longer exists in pandas 2.0.
job_records = []
link_records = []

# Motive is to get all the data science jobs from "https://www.naukri.com"

# URLs: the trailing "-" lets a result page URL be built as <url><page_number>
data_scientist = "https://www.naukri.com/data-scientist-jobs-"
machine_learning = "https://www.naukri.com/machine-learning-jobs-"
data_analyst = "https://www.naukri.com/data-analyst-jobs-"
# go till page 60
business_analyst = "https://www.naukri.com/business-analyst-jobs-"

# Base URL. The keywords are run one at a time (change this by hand) so that
# the server doesn't receive lots of hits at a time.
base_url = business_analyst

# Seconds to wait for a response; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

base_response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
base_page = base_response.text

# Convert the response to BeautifulSoup object
base_soup = BeautifulSoup(base_page, "html.parser")

# Total number of jobs shown on the first result page (10043 when this ran)
num_jobs = int(base_soup.find("div", { "class" : "count" }).h1.contents[1].getText().split(' ')[-1])
# Each page lists 50 jobs, so total pages
num_pages = int(math.ceil(num_jobs/50.0))

# Scrape in batches so that the server is not overloaded:
# 30 pages for most keywords, 60 for business_analyst, 40 for data_scientist
req_page = 60
# keep changing the start_ind to continue from a later page
start_ind = 1
end_ind = start_ind + req_page

# description labels (other informations about the job)
labels = ['Salary', 'Industry', 'Functional_Area', 'Role_Category', 'Design_Role']
# education requirements
edu_labels = ["UG", "PG", "Doctorate"]


def _parse_job_page(job_soup, labels, edu_labels):
    """Extract one job's fields from its parsed detail page.

    Returns an OrderedDict keyed by output column name (without Job_Link).
    Raises AttributeError when an expected element is missing, because
    ``find`` returns None for it; the caller skips such pages.
    """
    def text_of(tag, attrs):
        return job_soup.find(tag, attrs).getText().strip()

    fields = OrderedDict()
    fields['Job_Title'] = text_of("h1", {"itemprop": "title"})
    fields['Company_Name'] = text_of("a", {"itemprop": "hiringOrganization"})
    fields['Experience'] = text_of("span", {"itemprop": "experienceRequirements"})
    fields['Location'] = text_of("div", {"class": "loc"})
    fields['SalaryI'] = text_of("span", {"class": "sal"})

    # The summary footer holds several spans; pick the ones we care about
    num_openings = ""
    job_post = ""
    for span in job_soup.find("div", {"class": "sumFoot"}).find_all("span"):
        span_text = span.text.strip()
        if "Openings" in span_text:
            num_openings = span_text
        if "Posted" in span_text:
            job_post = span_text
    fields['Num_Openings'] = num_openings
    fields['Job_Post'] = job_post

    fields['Job_Application'] = job_soup.find("span", {"class": "jApplys"}).find("strong").getText().strip()
    fields['Job_View'] = job_soup.find("span", {"class": "jViews"}).find("strong").getText().strip()
    # The description is not stored, but it is still looked up so that pages
    # without one keep being rejected.
    text_of("ul", {"itemprop": "description"})

    # description labels (other informations about the job): values are
    # matched to `labels` by position
    other_info = [x.getText().split(':')[-1].strip()
                  for x in job_soup.find("div", {"class": "jDisc mt20"}).contents
                  if len(str(x).replace(' ', '')) != 0]
    fields['Key_Skills'] = ','.join(job_soup.find("div", {"class": "ksTags"}).getText().split("  "))[1:]
    fields['Skill_Experience'] = text_of("ul", {"class": "listing mt15"})
    fields.update(zip(labels, other_info))

    # education information: "<label>: <requirement>" entries
    education = [x.getText().split(':')
                 for x in job_soup.find("div", {"itemprop": "educationRequirements"}).contents
                 if len(str(x).replace(' ', '')) != 0]
    education_info = {edu_label.strip(): value.strip() for edu_label, value in education}
    for edu_label in edu_labels:
        education_info.setdefault(edu_label, "")
    fields.update(education_info)
    # recruiter information is inside a javascript button : BeautifulSoup is
    # an HTML parser, for this we would need SELENIUM
    return fields


# Get the job links from each result page (each page has 50 job links).
# Pattern in page url is - https://www.naukri.com/data-scientist-jobs-page_number
# To run over every page in a single loop USE : range(1, num_pages+1)
try:
    for page in range(start_ind, end_ind):
        # structuring the page URL
        page_url = base_url + str(page)
        page_response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        page_soup = BeautifulSoup(page_response.text, "html.parser")
        # 50 job links are in the class content, so filtering only content class
        links = [l.get("href") for l in page_soup.find_all("a", {"class":"content"})]
        for job_url in links:
            job_response = requests.get(job_url, timeout=REQUEST_TIMEOUT)
            job_soup = BeautifulSoup(job_response.text, "html.parser")
            # record every link with its response so that we can later check
            # from which links we were not able to get data
            link_records.append(OrderedDict([("Job_Link", job_url), ("Response", str(job_response))]))
            try:
                fields = _parse_job_page(job_soup, labels, edu_labels)
            except AttributeError:
                continue
            record = OrderedDict([('Job_Link', job_url)])
            record.update(fields)
            job_records.append(record)
            print("jobs scraped: " + str(len(job_records)))
            time.sleep(1)
        print("page" + str(page))
finally:
    # Build the frames even if the loop is interrupted or a request fails,
    # so the rows gathered so far are not lost.
    job_df = pd.DataFrame(job_records)
    job_links_df = pd.DataFrame(link_records)

**Note that a job can appear in several searches because of the different keywords, so we need to remove duplicates based on job_url, company_name, job_title, etc.**

In [None]:
# Persist this run's frames as pickle files (Python 3 exposes cPickle as _pickle)
import _pickle as cPickle

# Put the job columns into a fixed order before saving
column_names = [
    'Job_Link', 'Job_Title', 'Company_Name', 'Experience', 'Location',
    'SalaryI', 'Num_Openings', 'Job_Post', 'Job_Application', 'Job_View',
    'Salary', 'Industry', 'Functional_Area', 'Role_Category', 'Design_Role',
    'Key_Skills', 'Skill_Experience', "UG", "PG", "Doctorate",
]
job_df = job_df.reindex(columns=column_names)

# (frame, destination) pairs: the scraped jobs, then the links that were visited
pickle_targets = (
    (job_df, 'data/job_df_business_analyst.pkl'),
    (job_links_df, "data/job_links_df_business_analyst.pkl"),
)
for frame, target in pickle_targets:
    with open(target, 'wb') as f:
        cPickle.dump(frame, f)

# Reading a saved pickle back is left disabled ("Some error here ?").
# NOTE(review): pickle files must be opened in binary mode, so the 'r' below
# probably needs to be 'rb' - confirm before re-enabling.
#with open('job_df.pkl', 'r') as f:
#    job_df = cPickle.load(f)

In [None]:
# Write the same two frames out as UTF-8 CSV files
csv_targets = (
    (job_df, "data/job_df_business_analyst.csv"),
    (job_links_df, "data/job_links_df_business_analyst.csv"),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, encoding='utf8')

#### NOTE:
1. There are many links from which we were not able to get data; we will filter out those links and run the scrape job again on just the un-scraped links.

In [10]:
# The parsed data was saved in several files, so load all the job data

# The first file was saved with a mistake, so it cannot be read as UTF-8 and
# needs a different encoding
df1 = pd.read_csv("data/job_df20.csv", encoding = "ISO-8859-1")
print("df1 shape" + str(df1.shape))
df2 = pd.read_csv("data/job_df40.csv", encoding = "utf-8")
print("df2 shape" + str(df2.shape))
df3 = pd.read_csv("data/job_df_dataAnalyst.csv", encoding = "utf-8")
print("df3 shape" + str(df3.shape))
df4 = pd.read_csv("data/job_df_machine.csv", encoding = "utf-8")
print("df4 shape" + str(df4.shape))
df5 = pd.read_csv("data/job_df_business_analyst.csv", encoding = "utf-8")
print("df5 shape" + str(df5.shape))

# Combine all the df's to get one big job df.
# pd.concat is used instead of chained DataFrame.append calls, which were
# deprecated and then removed in pandas 2.0.
job_df = pd.concat([df1, df2, df3, df4, df5])

# The first column is the "Unnamed: 0" column, dropping that column
job_df.drop(job_df.columns[[0]], axis=1, inplace=True)
print("job_df shape" + str(job_df.shape))

# Dropping the duplicate Job_URL rows from the data frame so that each job comes only once.
# In Job_Link the trailing "?src..." part changes dynamically, so compare the
# links on the part before the first "?" only. Splitting once and taking
# element 0 works whether a link contains zero, one or several "?" characters.
job_df["Actual_Job_Link"] = job_df["Job_Link"].str.split("?", n=1).str[0]
job_df = job_df.drop_duplicates("Actual_Job_Link",keep="first")
print("job_df shape after removing duplicates" + str(job_df.shape))
job_df.head(3)

df1 shape(719, 21)
df2 shape(678, 21)
df3 shape(754, 21)
df4 shape(1072, 21)
df4 shape(1072, 21)
df5 shape(1216, 21)
job_df shape(4439, 20)
job_df shape after removing duplicates(3727, 21)


Unnamed: 0,Job_Link,Job_Title,Company_Name,Experience,Location,SalaryI,Num_Openings,Job_Post,Job_Application,Job_View,Salary,Industry,Functional_Area,Role_Category,Design_Role,Key_Skills,Skill_Experience,UG,PG,Doctorate,Actual_Job_Link
0,https://www.naukri.com/job-listings-Data-Scien...,Data Scientist - Perl/python,niki.ai,3 - 6 yrs,Bengaluru,Not Disclosed by Recruiter,,Posted Just Now,Less than 10,Less than 10,Not Disclosed by Recruiter,IT-Software / Software Services,Analytics & Business Intelligence,Analytics & BI,Data Analyst,"Machine Learning,Python,Data Analysis,Statisti...",Qualifications and Skills :1. B.tech/MS or equ...,B.Tech/B.E. - Any Specialization,"M.Tech - Any Specialization, MS/M.Sc(Science) ...",Doctorate Not Required,https://www.naukri.com/job-listings-Data-Scien...
1,https://www.naukri.com/job-listings-Data-Scien...,Data Scientist,Brillio Technologies Pvt. Ltd,2 - 5 yrs,Bengaluru,Not Disclosed by Recruiter,,Posted 1 day ago,14,166,Not Disclosed by Recruiter,IT-Software / Software Services,"Medical , Healthcare , R&D , ...",R&D,Research Scientist,"Analytics,Data analysis,Python,Visualization,A...",,Any Graduate - Any Specialization,Post Graduation Not Required,,https://www.naukri.com/job-listings-Data-Scien...
2,https://www.naukri.com/job-listings-Senior-Dat...,"Senior Data Scientist, Data Scientist,",Knorex India,3 - 6 yrs,Pune(Hadapsar),"4,25,000 - 9,25,000 P.A.",Openings: 1,Posted Just Now,Less than 10,Less than 10,"INR 4,25,000 - 9,25,000 P.A.",IT-Software / Software Services,IT Software - System Programming,Programming & Design,System Analyst,"python,machine learning,r,algorithms,java,mark...",Please refer to the Job description above,B.Tech/B.E. - Computers,"M.Tech - Computers, Post Graduation Not Required",Doctorate Not Required,https://www.naukri.com/job-listings-Senior-Dat...


In [11]:
# Get all the data for job_links: every link that was visited. Most of them
# were scraped and are available in the job_df data set; for various reasons
# some of them could not be scraped.
jdf1 = pd.read_csv("data/job_links_df20.csv", encoding = "ISO-8859-1")
print("jdf1 shape" + str(jdf1.shape))
jdf2 = pd.read_csv("data/job_links_df40.csv", encoding = "utf-8")
print("jdf2 shape" + str(jdf2.shape))
jdf3 = pd.read_csv("data/job_links_df_dataAnalyst.csv", encoding = "utf-8")
print("jdf3 shape" + str(jdf3.shape))
jdf4 = pd.read_csv("data/job_links_df_machine.csv", encoding = "utf-8")
print("jdf4 shape" + str(jdf4.shape))
jdf5 = pd.read_csv("data/job_links_df_business_analyst.csv", encoding = "utf-8")
print("jdf5 shape" + str(jdf5.shape))

# Combine into one big df.
# pd.concat is used instead of chained DataFrame.append calls, which were
# deprecated and then removed in pandas 2.0.
job_links = pd.concat([jdf1, jdf2, jdf3, jdf4, jdf5])

# The first column is the "Unnamed: 0" column, dropping that column
job_links.drop(job_links.columns[[0]], axis=1, inplace=True)
print("job_links shape" + str(job_links.shape))

# Dropping the duplicate Job_URL rows from the data frame so that each link comes only once.
# In Job_Link the trailing "?src..." part changes dynamically, so compare the
# links on the part before the first "?" only. Splitting once and taking
# element 0 works whether a link contains zero, one or several "?" characters.
job_links["Actual_Job_Link"] = job_links["Job_Link"].str.split("?", n=1).str[0]
job_links = job_links.drop_duplicates("Actual_Job_Link")
print("job_links shape after removing duplicates" + str(job_links.shape))
job_links.head(3)

jdf1 shape(1001, 3)
jdf2 shape(1001, 3)
jdf3 shape(1500, 3)
jdf4 shape(1500, 3)
jdf5 shape(2999, 3)
job_links shape(8001, 2)
job_links shape after removing duplicates(6015, 3)


Unnamed: 0,Job_Link,Response,Actual_Job_Link
0,https://www.naukri.com/job-listings-AI-Scienti...,<Response [200]>,https://www.naukri.com/job-listings-AI-Scienti...
1,https://www.naukri.com/job-listings-Data-Scien...,<Response [200]>,https://www.naukri.com/job-listings-Data-Scien...
2,https://www.naukri.com/job-listings-Data-Scien...,<Response [200]>,https://www.naukri.com/job-listings-Data-Scien...


In [12]:
# There are unique job links for which we have no data. Collect them so they
# can be passed to the scraper again (or the scraping can be adapted for them).

# Links we already have data for
links_with_data = job_df[['Actual_Job_Link']]
# Every unique job link that was visited
all_job_links = job_links[['Actual_Job_Link']]
# Anti-join: an outer merge with an indicator column, keeping only the rows
# that exist in all_job_links (right) and not in links_with_data (left)
out_merge = links_with_data.merge(all_job_links, how='outer', indicator=True)
only_in_all_links = out_merge['_merge'] == 'right_only'
remaining_job_list = out_merge[only_in_all_links]
remaining_job_list.shape

(2928, 2)

**There are 2928 links which we can try to scrape again.**    
*For a few of them we will be able to get data; the rest will fail because of a different page structure.
We are not going to spend time getting that data.*

In [23]:
# Second attempt: scrape the links in remaining_job_list for which we have no
# data yet. Rows are collected in a list and turned into job_rem_df once at
# the end; growing a DataFrame row by row copies it on every step, and
# DataFrame.append no longer exists in pandas 2.0.
rem_records = []

# Seconds to wait for a response; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# description labels (other informations about the job)
labels = ['Salary', 'Industry', 'Functional_Area', 'Role_Category', 'Design_Role']
# education requirements
edu_labels = ["UG", "PG", "Doctorate"]


def _parse_job_page(job_soup, labels, edu_labels):
    """Extract one job's fields from its parsed detail page.

    Returns an OrderedDict keyed by output column name (without Job_Link).
    Raises AttributeError when an expected element is missing, because
    ``find`` returns None for it; the caller skips such pages.
    """
    def text_of(tag, attrs):
        return job_soup.find(tag, attrs).getText().strip()

    fields = OrderedDict()
    fields['Job_Title'] = text_of("h1", {"itemprop": "title"})
    fields['Company_Name'] = text_of("a", {"itemprop": "hiringOrganization"})
    fields['Experience'] = text_of("span", {"itemprop": "experienceRequirements"})
    fields['Location'] = text_of("div", {"class": "loc"})
    fields['SalaryI'] = text_of("span", {"class": "sal"})

    # The summary footer holds several spans; pick the ones we care about
    num_openings = ""
    job_post = ""
    for span in job_soup.find("div", {"class": "sumFoot"}).find_all("span"):
        span_text = span.text.strip()
        if "Openings" in span_text:
            num_openings = span_text
        if "Posted" in span_text:
            job_post = span_text
    fields['Num_Openings'] = num_openings
    fields['Job_Post'] = job_post

    fields['Job_Application'] = job_soup.find("span", {"class": "jApplys"}).find("strong").getText().strip()
    fields['Job_View'] = job_soup.find("span", {"class": "jViews"}).find("strong").getText().strip()
    # The description is not stored, but it is still looked up so that pages
    # without one keep being rejected.
    text_of("ul", {"itemprop": "description"})

    # description labels (other informations about the job): values are
    # matched to `labels` by position
    other_info = [x.getText().split(':')[-1].strip()
                  for x in job_soup.find("div", {"class": "jDisc mt20"}).contents
                  if len(str(x).replace(' ', '')) != 0]
    fields['Key_Skills'] = ','.join(job_soup.find("div", {"class": "ksTags"}).getText().split("  "))[1:]
    fields['Skill_Experience'] = text_of("ul", {"class": "listing mt15"})
    fields.update(zip(labels, other_info))

    # education information: "<label>: <requirement>" entries
    education = [x.getText().split(':')
                 for x in job_soup.find("div", {"itemprop": "educationRequirements"}).contents
                 if len(str(x).replace(' ', '')) != 0]
    education_info = {edu_label.strip(): value.strip() for edu_label, value in education}
    for edu_label in edu_labels:
        education_info.setdefault(edu_label, "")
    fields.update(education_info)
    # recruiter information is inside a javascript button : BeautifulSoup is
    # an HTML parser, for this we would need SELENIUM
    return fields


# Iterate through all the links in remaining_job_list
try:
    for job_url in remaining_job_list["Actual_Job_Link"]:
        job_response = requests.get(job_url, timeout=REQUEST_TIMEOUT)
        job_soup = BeautifulSoup(job_response.text, "html.parser")
        try:
            fields = _parse_job_page(job_soup, labels, edu_labels)
        except AttributeError:
            print("Attribute Error")
            continue
        record = OrderedDict([('Job_Link', job_url)])
        record.update(fields)
        rem_records.append(record)
        print("jobs scraped: " + str(len(rem_records)))
        time.sleep(1)
finally:
    # Build the frame even if the loop is interrupted or a request fails,
    # so the rows gathered so far are not lost.
    job_rem_df = pd.DataFrame(rem_records)
print("Iteration Completed and get data of" + str(job_rem_df.shape))

In [17]:
# These links were requested without the "?..." part, so Job_Link already is
# the actual link; copy it into Actual_Job_Link so that this frame has the
# same columns as the other dataframes
job_rem_df = job_rem_df.assign(Actual_Job_Link=job_rem_df["Job_Link"])
job_rem_df.shape

(20, 21)

In [18]:
# Out of 2928, we got only 20
# Add these rows to job_df. pd.concat is used because DataFrame.append was
# deprecated and then removed in pandas 2.0.
job_df = pd.concat([job_df, job_rem_df])

In [23]:
# Some pages have a different structure, and others raised an attribute error
# in the try block. We will do two things: first handle the missing elements
# with if/else for pages of the same structure, then modify the scrape query
# for pages with a different structure.

# Links from the retry (out of 2928) for which we now have data
rem_links_with_data = job_rem_df[['Actual_Job_Link']]
# All the remaining unique job links that were used for the retry
remaining_job_list = remaining_job_list[['Actual_Job_Link']]
# Anti-join: keep the links that are in remaining_job_list (right) but not in
# rem_links_with_data (left)
ot_merge = rem_links_with_data.merge(remaining_job_list, how='outer', indicator=True)
still_missing = ot_merge['_merge'] == 'right_only'
rem_job_list = ot_merge[still_missing]
rem_job_list.shape

(2908, 2)

In [25]:
# Checkpoint: write the current job_df to a UTF-8 CSV file
job_df.to_csv("updated_job.csv", encoding="utf-8")

In [39]:
# 2928 - 20 = 2908 remaining links.
# Third attempt: instead of skipping a page when one element is missing,
# every missing element gets the placeholder "Error", so each link produces
# a row. Rows are collected in a list and turned into job_rem_df1 once at the
# end; growing a DataFrame row by row copies it on every step, and
# DataFrame.append no longer exists in pandas 2.0.
rem_records1 = []

# Seconds to wait for a response; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# description labels (other informations about the job)
labels = ['Salary', 'Industry', 'Functional_Area', 'Role_Category', 'Design_Role']
# education requirements
edu_labels = ["UG", "PG", "Doctorate"]


def _text_or_error(node):
    """Return the stripped text of a parsed element, or "Error" if it is None."""
    return "Error" if node is None else node.getText().strip()


def _strong_text_or_error(node):
    """Return the text of the <strong> inside `node`, or "Error" if either is missing."""
    return "Error" if node is None else _text_or_error(node.find("strong"))


# Iterate through all the links in rem_job_list
try:
    for job_url in rem_job_list["Actual_Job_Link"]:
        job_response = requests.get(job_url, timeout=REQUEST_TIMEOUT)
        job_soup = BeautifulSoup(job_response.text, "html.parser")

        job_title = _text_or_error(job_soup.find("h1", {"itemprop":"title"}))
        company_name = _text_or_error(job_soup.find("a",{"itemprop":"hiringOrganization"}))
        experience = _text_or_error(job_soup.find("span",{"itemprop":"experienceRequirements"}))
        location = _text_or_error(job_soup.find("div",{"class":"loc"}))
        salary = _text_or_error(job_soup.find("span",{"class":"sal"}))

        # The summary footer holds several spans; pick the ones we care about.
        # find_all always returns a (possibly empty) list, so only the footer
        # itself can be missing.
        summary_footer = job_soup.find("div",{"class":"sumFoot"})
        if summary_footer is None:
            num_openings = "Error"
            job_post = "Error"
        else:
            num_openings = ""
            job_post = ""
            for span in summary_footer.find_all("span"):
                span_text = span.text.strip()
                if "Openings" in span_text:
                    num_openings = span_text
                if "Posted" in span_text:
                    job_post = span_text

        job_application = _strong_text_or_error(job_soup.find("span",{"class":"jApplys"}))
        job_view = _strong_text_or_error(job_soup.find("span",{"class":"jViews"}))
        # NOTE: the description is extracted but not stored in the output row
        job_description = _text_or_error(job_soup.find("ul",{"itemprop":"description"}))

        # description labels (other informations about the job): values are
        # matched to `labels` by position
        details = job_soup.find("div",{"class":"jDisc mt20"})
        if details is None:
            other_info_label = {}
        else:
            other_info = [x.getText().split(':')[-1].strip() for x in details.contents if len(str(x).replace(' ',''))!=0]
            other_info_label = dict(zip(labels, other_info))

        skill_tags = job_soup.find("div",{"class":"ksTags"})
        if skill_tags is None:
            key_skills = "Error"
        else:
            key_skills = ','.join(skill_tags.getText().split("  "))[1:]
        skill_experience = _text_or_error(job_soup.find("ul",{"class":"listing mt15"}))

        # education information: "<label>: <requirement>" entries; labels
        # that are not on the page are filled with ""
        education_block = job_soup.find("div",{"itemprop":"educationRequirements"})
        if education_block is None:
            education_info = {}
        else:
            education = [x.getText().split(':') for x in education_block.contents if len(str(x).replace(' ',''))!=0]
            education_info = {edu_label.strip(): value.strip() for edu_label, value in education}
        for edu_label in edu_labels:
            education_info.setdefault(edu_label, "")
        # recruiter information is inside a javascript button : BeautifulSoup
        # is an HTML parser, for this we would need SELENIUM

        record = OrderedDict([
            ('Job_Link', job_url), ('Job_Title', job_title), ('Company_Name', company_name),
            ('Experience', experience), ('Location', location), ('SalaryI', salary),
            ('Num_Openings', num_openings), ('Job_Post', job_post),
            ('Job_Application', job_application), ('Job_View', job_view),
            ('Key_Skills', key_skills), ('Skill_Experience', skill_experience),
        ])
        record.update(other_info_label)
        record.update(education_info)
        rem_records1.append(record)
        print("links processed: " + str(len(rem_records1)))
        time.sleep(1)
finally:
    # Build the frame even if the loop is interrupted or a request fails,
    # so the rows gathered so far are not lost.
    job_rem_df1 = pd.DataFrame(rem_records1)
print("Iteration Completed and get data of" + str(job_rem_df1.shape))

Iteration Completed and get data of(2908, 20)


In [46]:
# Preview the first three re-scraped rows
job_rem_df1.head(3)

Unnamed: 0,Company_Name,Design_Role,Doctorate,Experience,Functional_Area,Industry,Job_Application,Job_Link,Job_Post,Job_Title,Job_View,Key_Skills,Location,Num_Openings,PG,Role_Category,Salary,SalaryI,Skill_Experience,UG,Actual_Job_Link
0,Wissen Infotech Pvt. Ltd.,"Permanent Job, Full Time",,5 - 8 yrs,Other,IT-Software / Software Services,64,https://www.naukri.com/job-listings-AI-Scienti...,Posted 8 days ago,AI Scientist,583,Data Scientist,Bengaluru(Marathahalli),,,Other,Not Disclosed by Recruiter,Not Disclosed by Recruiter,Please refer to the Job description above,,https://www.naukri.com/job-listings-AI-Scienti...
1,Error,,,Error,,,44,https://www.naukri.com/job-listings-Senior-Dat...,Posted: 1 day ago,Error,28,Error,Error,,,,,Not Disclosed by Recruiter,Error,,https://www.naukri.com/job-listings-Senior-Dat...
2,Red Hat India Pvt Ltd,Data Analyst,,7 - 12 yrs,Analytics & Business Intelligence,IT-Software / Software Services,Less than 10,https://www.naukri.com/job-listings-Business-D...,Posted 2 days ago,Business Data Scientist _ Redhat_pune,192,"sql queries,amazon aws,redshift,data modeling,...",Pune,,,Analytics & BI,"INR 8,00,000 - 13,00,000 P.A.","8,00,000 - 13,00,000 P.A.",Please refer to the Job description above,,https://www.naukri.com/job-listings-Business-D...


In [41]:
# Add Actual_Job_Link column
job_rem_df1["Actual_Job_Link"] = job_rem_df1["Job_Link"]

# Now we have a 2908 row df, but in many rows the fields hold only the
# "Error" placeholder or are empty. Company_Name decides which rows are
# usable; the mask is computed once and used for both halves.
is_valid = job_rem_df1["Company_Name"] != "Error"

# Filter the valid data
valid_job = job_rem_df1[is_valid]
print("Size of newly scrap job " + str(valid_job.shape))

# Add the valid rows to job_df. pd.concat is used because DataFrame.append
# was deprecated and then removed in pandas 2.0.
job_df = pd.concat([job_df, valid_job])

job_df.to_csv("complete_job_profiles.csv")

# Pages with a different structure
invalid_jobs = job_rem_df1[~is_valid]
invalid_jobs.to_csv("invalid_jobs.csv")
dif_links = invalid_jobs["Actual_Job_Link"].to_frame()
dif_links.shape

Size of newly scrap job (2351, 21)


(557, 1)

In [45]:
# Preview the first three rows that have a real company name
valid_job.head(3)

Unnamed: 0,Company_Name,Design_Role,Doctorate,Experience,Functional_Area,Industry,Job_Application,Job_Link,Job_Post,Job_Title,Job_View,Key_Skills,Location,Num_Openings,PG,Role_Category,Salary,SalaryI,Skill_Experience,UG,Actual_Job_Link
0,Wissen Infotech Pvt. Ltd.,"Permanent Job, Full Time",,5 - 8 yrs,Other,IT-Software / Software Services,64,https://www.naukri.com/job-listings-AI-Scienti...,Posted 8 days ago,AI Scientist,583,Data Scientist,Bengaluru(Marathahalli),,,Other,Not Disclosed by Recruiter,Not Disclosed by Recruiter,Please refer to the Job description above,,https://www.naukri.com/job-listings-AI-Scienti...
2,Red Hat India Pvt Ltd,Data Analyst,,7 - 12 yrs,Analytics & Business Intelligence,IT-Software / Software Services,Less than 10,https://www.naukri.com/job-listings-Business-D...,Posted 2 days ago,Business Data Scientist _ Redhat_pune,192,"sql queries,amazon aws,redshift,data modeling,...",Pune,,,Analytics & BI,"INR 8,00,000 - 13,00,000 P.A.","8,00,000 - 13,00,000 P.A.",Please refer to the Job description above,,https://www.naukri.com/job-listings-Business-D...
3,Adecco India Private Limited,Data Analyst,,3 - 5 yrs,Analytics & Business Intelligence,IT-Software / Software Services,742,https://www.naukri.com/job-listings-Data-Scien...,Posted 2 days ago,Data Scientist,1320,"SAS SQL,R,Python,Excel,Analytics,Data Manipula...",Pune,,,Analytics & BI,Not Disclosed by Recruiter,Not Disclosed by Recruiter,,,https://www.naukri.com/job-listings-Data-Scien...


### Remaining 557 links (pages with a different structure): we will leave these
#### Iterate through the new list (dif_links), using the same if/else extraction:
#### every missing element gets the placeholder "Error" instead of raising
#### Initialize a new df
job_dl_df = pd.DataFrame()

#### description labels (other information about the job)
labels = ['Salary', 'Industry', 'Functional_Area', 'Role_Category', 'Design_Role']
#### education requirements
edu_labels = ["UG", "PG", "Doctorate"]

#### Iterate through all the links in dif_links
for index, row in dif_links.iterrows():
    job_url = row["Actual_Job_Link"]
    job_response = requests.get(job_url)
    job_page = job_response.text
    job_soup = BeautifulSoup(job_page, "html.parser")    
    # Each field: "Error" when the element is not on the page, else its stripped text
    if job_soup.find("h1", {"itemprop":"title"}) is None:
        job_title = "Error"
    else:
        job_title = job_soup.find("h1", {"itemprop":"title"}).getText().strip()
    if job_soup.find("a",{"itemprop":"hiringOrganization"}) is None:
        company_name = "Error"
    else:
        company_name = job_soup.find("a",{"itemprop":"hiringOrganization"}).getText().strip()
    if job_soup.find("span",{"itemprop":"experienceRequirements"}) is None:
        experience = "Error"
    else:
        experience = job_soup.find("span",{"itemprop":"experienceRequirements"}).getText().strip()
    if job_soup.find("div",{"class":"loc"}) is None:
        location = "Error"
    else:
        location = job_soup.find("div",{"class":"loc"}).getText().strip()
    if job_soup.find("span",{"class":"sal"}) is None:
        salary = "Error"
    else:
        salary = job_soup.find("span",{"class":"sal"}).getText().strip()
    # The summary footer holds several spans; pick the "Openings" and "Posted" ones
    num_openings = ""
    job_post = ""
    if job_soup.find("div",{"class":"sumFoot"}) is None:
        num_openings = "Error"
        job_post = "Error"
    else:
        # NOTE(review): find_all returns a list rather than None, so this
        # branch is presumably never taken - confirm
        if job_soup.find("div",{"class":"sumFoot"}).find_all("span") is None:
            num_openings = "Error"
            job_post = "Error"
        else:
            for x in job_soup.find("div",{"class":"sumFoot"}).find_all("span"):
                if "Openings" in x.text.strip():
                    num_openings = x.text.strip()
                if "Posted" in x.text.strip():
                    job_post = x.text.strip()
    # Application and view counts are the <strong> text inside their spans
    if job_soup.find("span",{"class":"jApplys"}) is None:
        job_application = "Error"
    else:
        if job_soup.find("span",{"class":"jApplys"}).find("strong") is None:
            job_application = "Error"
        else:
            job_application = job_soup.find("span",{"class":"jApplys"}).find("strong").getText().strip()
    if job_soup.find("span",{"class":"jViews"}) is None:
        job_view = "Error"
    else:
        if job_soup.find("span",{"class":"jViews"}).find("strong") is None:
            job_view = "Error"
        else:
            job_view = job_soup.find("span",{"class":"jViews"}).find("strong").getText().strip()
    # NOTE: the description is extracted but not stored in the output row
    if job_soup.find("ul",{"itemprop":"description"}) is None:
        job_description = "Error"
    else:
        job_description = job_soup.find("ul",{"itemprop":"description"}).getText().strip()
    # description labels (other information about the job): the values are
    # matched to `labels` by position
    if job_soup.find("div",{"class":"jDisc mt20"}) is None:
        other_info_label = {}
    else:
        other_info = [x.getText().split(':')[-1].strip() for x in job_soup.find("div",{"class":"jDisc mt20"}).contents if len(str(x).replace(' ',''))!=0]
        other_info_label = {labels: other_info for labels, other_info in zip(labels, other_info)}
    if job_soup.find("div",{"class":"ksTags"}) is None:
        key_skills = "Error"
    else:
        key_skills = ','.join(job_soup.find("div",{"class":"ksTags"}).getText().split("  "))[1:]
    if job_soup.find("ul",{"class":"listing mt15"}) is None:
        skill_experience = "Error"
    else:
        skill_experience = job_soup.find("ul",{"class":"listing mt15"}).getText().strip()
    # education information: "<label>: <requirement>" entries; labels that
    # are not on the page are filled with ""
    if job_soup.find("div",{"itemprop":"educationRequirements"}) is None:
        education_info = {}
        for l in edu_labels:
            education_info[l] = ""
    else:
        education = [x.getText().split(':') for x in job_soup.find("div",{"itemprop":"educationRequirements"}).contents if len(str(x).replace(' ',''))!=0]
        education_info = {edu_label.strip(): education.strip() for edu_label, education in education}
        for l in edu_labels:
            if l not in education_info.keys():
                education_info[l] = ""
    # recruiter information is inside a javascript button : BeautifulSoup is
    # an HTML parser, for this we would need SELENIUM
    df = OrderedDict({'Job_Link':job_url, 'Job_Title':job_title, 'Company_Name':company_name, 'Experience':experience, 'Location':location, 
                      'SalaryI':salary, 'Num_Openings':num_openings, 'Job_Post':job_post, 'Job_Application':job_application, 'Job_View':job_view, 
                      'Key_Skills':key_skills, 'Skill_Experience':skill_experience})
    df.update(other_info_label)
    df.update(education_info)
    job_dl_df = job_dl_df.append(df,ignore_index=True)
    print(job_dl_df.shape)
    # Pause between requests
    time.sleep(1)
print("Iteration Completed and get data of" + str(job_dl_df.shape))

In [50]:
# Report how many job rows were collected in total
total_jobs = len(job_df)
print("Total jobs we scrapped is " + str(total_jobs))

Total jobs we scrapped is 6098


In [56]:
# Save all the job profiles as a pickle file as well as a CSV file
with open("data/complete_job_profiles.pkl", "wb") as f:
    cPickle.dump(job_df, f)

# The CSV goes next to the pickle in data/. The path must not start with a
# space: " data/..." names a different directory and the write fails if that
# directory does not exist.
job_df.to_csv("data/complete_job_profiles.csv")

**The next task will be cleaning the data for further analysis:**    
> We will perform that task in another notebook.