# **Fetch using PySpark**

In [1]:
import requests
import os
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from bs4 import BeautifulSoup

**Hit a Google search for data analyst jobs, open the LinkedIn result in new tab, copy the URL and follow next steps:**

In [2]:
job_url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
souped = requests.get(job_url)  # '.get()' returns HTML content in jumbled form
job_html = BeautifulSoup(souped.content, "html.parser")  # formatting the HTML content

# Fetching the job titles (job positions)
job_list = []
job_titles = job_html.find_all('h3', {'class' : 'base-search-card__title'}) # finding all <H3> html tags of specified class
for i in job_titles: job_list.append(i.getText().strip()) # 'getText()' returns text content of <H3> tag; 'strip()' crops leading/lagging spaces

# Fetching name of the company offering each job
job_company = []
company_name = job_html.find_all('h4', {'class' : 'base-search-card__subtitle'})
for i in company_name: job_company.append(i.getText().strip())

# Fetch job location
job_location = []
location = job_html.find_all('span', {'class' : 'job-search-card__location'})
for i in location: job_location.append(i.getText().strip())

# Recency of job posting
job_recency = []
recency1 = job_html.find_all('time', {'class' : 'job-search-card__listdate--new'})
recency2 = job_html.find_all('time', {'class' : 'job-search-card__listdate'})
for i in recency1: job_recency.append(i.getText().strip())
for i in recency2: job_recency.append(i.getText().strip())


In [3]:
# Create a spark session

spark = SparkSession.builder.appName('LinkedInJobFetch').getOrCreate()

In [4]:
data = {'Job_Title' : job_list,
        'Company' : job_company,
        'Location' : job_location,
        'Job_Recency' : job_recency }

col_headings = list(data.keys())    # This will be used later to assign col headings

spark_data = list(zip(*data.values()))
spark_df = spark.createDataFrame(spark_data)  # Creating Spark dataframe

# spark_df.show(n = spark_df.count(), truncate=False)

csv_path = '/content/spark_linkedin_jobs'  # Saving file in Colab's local storage in a dedicated folder

if os.path.exists(csv_path):   # If path exists
    spark_df.write.csv(csv_path, mode = "append", header = False)  # Append new data EXCLUDING header row
    print(f"Data APPENDED to {csv_path}")
else:
    spark_df.write.csv(csv_path, mode = "overwrite", header = False)  # At first-time write, INCLUDE header columns
    print(f"Data SAVED to {csv_path}")  # Printed ONLY ON FIRST RUN of this cell; next run onward, 'print' in above 'if' runs


Data SAVED to /content/spark_linkedin_jobs


In [5]:
from pyspark.sql.types import StructType, StructField, StringType

# Define column headers' schema
col_schema = StructType([
    StructField(col_headings[0], StringType(), True),
    StructField(col_headings[1], StringType(), True),
    StructField(col_headings[2], StringType(), True),
    StructField(col_headings[3], StringType(), True) ])

# Reading the content of 'spark_linkedin_jobs.csv'
r = spark.read.csv('/content/spark_linkedin_jobs', schema = col_schema, multiLine = True)

r.dropDuplicates() # Remove duplicate rows

r.show(n = r.count(), truncate=False)  # 'r.count()' returns the total no. of rows


+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Job_Title                                  |Company                               |Location                       |Job_Recency |
+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Data Analyst, Finance & Strategy, MarComms |Netflix                               |Los Angeles, CA                |17 hours ago|
|Data Analyst                               |KFC                                   |Plano, TX                      |8 hours ago |
|Analyst, Insights & Analytics              |LoopMe                                |New York, NY                   |19 hours ago|
|Data Analyst                               |CULT GAIA                             |Los Angeles Metropolitan Area  |1 week ago  |
|Data Analyst                               |Zest AI                               |Burban

In [6]:
r.count()

60

**The first run is done (until the above cell). Now let us run the above code again to get next results of jobs and see what changes. We'll run all BUT the spark session creation cell above, again:**


In [7]:
job_url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
souped = requests.get(job_url)  # '.get()' returns HTML content in jumbled form
job_html = BeautifulSoup(souped.content, "html.parser")  # formatting the HTML content

#Fetching the job titles (job positions)
job_list = []
job_titles = job_html.find_all('h3', {'class' : 'base-search-card__title'}) # finding all <H3> html tags of specified class
for i in job_titles: job_list.append(i.getText().strip()) # 'getText()' returns text content of <H3> tag; 'strip()' crops leading/lagging spaces

#Fetching name of the company offering each job
job_company = []
company_name = job_html.find_all('h4', {'class' : 'base-search-card__subtitle'})
for i in company_name: job_company.append(i.getText().strip())

#Fetch job location
job_location = []
location = job_html.find_all('span', {'class' : 'job-search-card__location'})
for i in location: job_location.append(i.getText().strip())

#Recency of job posting
job_recency = []
recency1 = job_html.find_all('time', {'class' : 'job-search-card__listdate--new'})
recency2 = job_html.find_all('time', {'class' : 'job-search-card__listdate'})
for i in recency1: job_recency.append(i.getText().strip())
for i in recency2: job_recency.append(i.getText().strip())


**Pay attention to the next cell's run; note which** ```print``` **runs here (tally with that in first run above).**

In [8]:
csv_path = '/content/spark_linkedin_jobs'  # Saving file in Colab's local storage in a dedicated folder

data.clear() # Clearing any previous content of dictionary 'data'

data = {'Job_Title' : job_list,
        'Company' : job_company,
        'Location' : job_location,
        'Job_Recency' : job_recency }

spark_data = list(zip(*data.values()))
spark_df = spark.createDataFrame(spark_data)  # Creating Spark dataframe

# spark_df.show(n = spark_df.count(), truncate=False)

if os.path.exists(csv_path):   # If path exists
    spark_df.write.csv(csv_path, mode = "append", header = False)  # Append new data EXCLUDING header columns
    print(f"Data APPENDED to {csv_path}")
else:
    spark_df.write.csv(csv_path, mode = "overwrite", header = False)  # At first-time write, INCLUDE header columns
    print(f"Data SAVED to {csv_path}")  # Printed ONLY ON FIRST RUN of this cell; next run onward, 'print' in above 'if' runs


Data APPENDED to /content/spark_linkedin_jobs


**Reading the content again:**

In [9]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define schema
col_schema = StructType([
    StructField(col_headings[0], StringType(), True),
    StructField(col_headings[1], StringType(), True),
    StructField(col_headings[2], StringType(), True),
    StructField(col_headings[3], StringType(), True)
])

# Reading the content of 'spark_linkedin_jobs.csv'
r = spark.read.csv('/content/spark_linkedin_jobs', schema = col_schema, multiLine = True)

# r.toDF(*col_headings)
r.dropDuplicates()  # Remove duplicate rows

r.show(n = r.count(), truncate=False)  # 'r.count()' returns the total no. of rows


+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Job_Title                                  |Company                               |Location                       |Job_Recency |
+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Data Analyst, Finance & Strategy, MarComms |Netflix                               |Los Angeles, CA                |17 hours ago|
|Data Analyst                               |KFC                                   |Plano, TX                      |8 hours ago |
|Analyst, Insights & Analytics              |LoopMe                                |New York, NY                   |19 hours ago|
|Data Analyst                               |CULT GAIA                             |Los Angeles Metropolitan Area  |1 week ago  |
|Data Analyst                               |Zest AI                               |Burban

In [10]:
r.count()  # In first-run, count was 60; changed value indicates new data has been added

120

In [11]:
spark.stop() # Stop session to free resources

In [12]:
!rm -rf /content/spark_linkedin_jobs/  # Erase folder as the demonstration task is completed

# **Fetching using pandas**

In [69]:
from bs4 import BeautifulSoup
import requests, pandas as pd

# job_url = 'https://www.linkedin.com/jobs/data-analyst-jobs-pune/?currentJobId=3783699912&originalSubdomain=in'
job_url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
souped = requests.get(job_url)

job_html = BeautifulSoup(souped.content, "html.parser")

#Fetch job titles (job positions)
job_titles = job_html.find_all('h3', {'class' : 'base-search-card__title'})
job_list = []
for i in job_titles: job_list.append(i.getText().strip())

#Fetch name of the company offering each job
company_name = job_html.find_all('h4', {'class' : 'base-search-card__subtitle'})
job_company = []
for i in company_name: job_company.append(i.getText().strip())

#Fetch job location
location = job_html.find_all('span', {'class' : 'job-search-card__location'})
job_location = []
for i in location: job_location.append(i.getText().strip())

#Recency of job posting
recency1 = job_html.find_all('time', {'class' : 'job-search-card__listdate--new'})
recency2 = job_html.find_all('time', {'class' : 'job-search-card__listdate'})
job_recency = []
for i in recency1: job_recency.append(i.getText().strip())
for i in recency2: job_recency.append(i.getText().strip())

#creating dataframe to store fetched job details
data = {'Job_Title' : job_list, 'Company' : job_company, 'Location' : job_location, 'Job_Recency' : job_recency}
pd_df = pd.DataFrame(data)

# Following block sends fetched job details to a CSV file

try:
    existing_df = pd.read_csv('pd_linkedin_jobs.csv')  # Try to read an existing CSV file
    updated_df = pd.concat([existing_df, pd_df], ignore_index = True)   # Append the new data to the existing DataFrame
    updated_df.to_csv('pd_linkedin_jobs.csv', index = False)    # Save the updated DataFrame back to the CSV file
    print(f"Data appended to {'pd_linkedin_jobs.csv'}")
except FileNotFoundError:
    pd_df.to_csv('pd_linkedin_jobs.csv', index = False)   # If the CSV file does not exist, create a new one
    print(f"Data saved to a new CSV file: {'pd_linkedin_jobs.csv'}")  # prints when data is fetch first ever time


Data saved to a new CSV file: pd_linkedin_jobs.csv


In [70]:
# Reading the saved 'pd_linkedin_jobs.csv' file
read_data = pd.read_csv('pd_linkedin_jobs.csv')
pd.set_option('display.max_rows', None)
read_data  # o/p showing 180 records as each run fetches 60 records and the above cell was run twice so it's appended same data

Unnamed: 0,Job_Title,Company,Location,Job_Recency
0,Data Analyst,KFC,"Plano, TX",12 hours ago
1,Data Analyst,Zest AI,"Burbank, CA",3 hours ago
2,"Data Analyst, Finance & Strategy, MarComms",Netflix,"Los Angeles, CA",1 hour ago
3,Data Analyst II,The Hershey Company,"Hershey, PA",15 hours ago
4,Data Analyst,Upstart,United States,4 days ago
5,Data Analyst,CULT GAIA,Los Angeles Metropolitan Area,1 week ago
6,Data Analyst,Korn Ferry,"California, United States",1 week ago
7,"Data Analyst, Pricing",Lyft,"Seattle, WA",2 weeks ago
8,Data Analyst,Korn Ferry,"Connecticut, United States",1 week ago
9,Data Analyst,Cloudflare,"Austin, TX",3 days ago


**Now, let us run the code again. We find that the output shows:** ```Data appended to pd_linkedin_jobs.csv
``` **whereas in the previous run (FIRST run) above, output shows** ```Data saved to a new CSV file: pd_linkedin_jobs.csv```.

In [71]:
from bs4 import BeautifulSoup
import requests, pandas as pd

# job_url = 'https://www.linkedin.com/jobs/data-analyst-jobs-pune/?currentJobId=3783699912&originalSubdomain=in'
job_url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
souped = requests.get(job_url)

job_html = BeautifulSoup(souped.content, "html.parser")

#Fetch job titles (job positions)
job_titles = job_html.find_all('h3', {'class' : 'base-search-card__title'})
job_list = []
for i in job_titles: job_list.append(i.getText().strip())

#Fetch name of the company offering each job
company_name = job_html.find_all('h4', {'class' : 'base-search-card__subtitle'})
job_company = []
for i in company_name: job_company.append(i.getText().strip())

#Fetch job location
location = job_html.find_all('span', {'class' : 'job-search-card__location'})
job_location = []
for i in location: job_location.append(i.getText().strip())

#Recency of job posting
recency1 = job_html.find_all('time', {'class' : 'job-search-card__listdate--new'})
recency2 = job_html.find_all('time', {'class' : 'job-search-card__listdate'})
job_recency = []
for i in recency1: job_recency.append(i.getText().strip())
for i in recency2: job_recency.append(i.getText().strip())

#creating dataframe to store fetched job details
data = {'Job_Title' : job_list, 'Company' : job_company, 'Location' : job_location, 'Job_Recency' : job_recency}
pd_df = pd.DataFrame(data)

# Following block sends fetched job details to a CSV file

try:
    existing_df = pd.read_csv('pd_linkedin_jobs.csv')  # Try to read an existing CSV file
    updated_df = pd.concat([existing_df, pd_df], ignore_index = True)   # Append the new data to the existing DataFrame
    updated_df.to_csv('pd_linkedin_jobs.csv', index = False)    # Save the updated DataFrame back to the CSV file
    print(f"Data appended to {'pd_linkedin_jobs.csv'}")
except FileNotFoundError:
    pd_df.to_csv('pd_linkedin_jobs.csv', index = False)   # If the CSV file does not exist, create a new one
    print(f"Data saved to a new CSV file: {'pd_linkedin_jobs.csv'}")  # prints when data is fetch first ever time


Data appended to pd_linkedin_jobs.csv


**Now, if we read the CSV data again, it will show 120 records as we have run the code twice so far and each run is getting 60 records from LinkedIn.**

In [72]:
# Reading the saved 'pd_linkedin_jobs.csv' file
read_data = pd.read_csv('pd_linkedin_jobs.csv')
pd.set_option('display.max_rows', None)
read_data

Unnamed: 0,Job_Title,Company,Location,Job_Recency
0,Data Analyst,KFC,"Plano, TX",12 hours ago
1,Data Analyst,Zest AI,"Burbank, CA",3 hours ago
2,"Data Analyst, Finance & Strategy, MarComms",Netflix,"Los Angeles, CA",1 hour ago
3,Data Analyst II,The Hershey Company,"Hershey, PA",15 hours ago
4,Data Analyst,Upstart,United States,4 days ago
5,Data Analyst,CULT GAIA,Los Angeles Metropolitan Area,1 week ago
6,Data Analyst,Korn Ferry,"California, United States",1 week ago
7,"Data Analyst, Pricing",Lyft,"Seattle, WA",2 weeks ago
8,Data Analyst,Korn Ferry,"Connecticut, United States",1 week ago
9,Data Analyst,Cloudflare,"Austin, TX",3 days ago


# Shell commands for force deleting Colab folders/files


```
!rm -rf /content/folderName/            # -rf for recursive force (delete all folder content & the folder itself)
!rm -f /content/fileName.fileFormat     # -f for single file
```

e.g.
```
# !rm -rf /content/spark_linkedin_jobs.csv/  

# !rm -f /content/pd_linkedin_jobs.csv  
```