# **Fetch using PySpark**

In [None]:
import requests
import os
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from bs4 import BeautifulSoup

**Run a Google search for data analyst jobs, open the LinkedIn result in a new tab, copy its URL, and follow the next steps:**

In [None]:
job_url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

# Download the search page. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() reports an HTTP error instead of silently parsing an error page.
souped = requests.get(job_url, timeout=30)
souped.raise_for_status()
job_html = BeautifulSoup(souped.content, "html.parser")  # parse the raw HTML into a searchable tree

# Locate the tags that hold each job detail (find_all returns matches in page order)
job_titles = job_html.find_all('h3', {'class' : 'base-search-card__title'})  # all <h3> tags of the given class
company_name = job_html.find_all('h4', {'class' : 'base-search-card__subtitle'})
location = job_html.find_all('span', {'class' : 'job-search-card__listdate--new'.replace('listdate--new', 'location')})

# The posting date is looked up under two possible classes. Both are matched in ONE search
# so the results stay in page order; searching each class separately and concatenating the
# two result lists would pair recency values with the wrong jobs.
recency = job_html.find_all('time', {'class' : ['job-search-card__listdate--new', 'job-search-card__listdate']})

# 'getText()' returns a tag's text content; 'strip()' removes leading/trailing whitespace
job_list = [tag.getText().strip() for tag in job_titles]
job_company = [tag.getText().strip() for tag in company_name]
job_location = [tag.getText().strip() for tag in location]
job_recency = [tag.getText().strip() for tag in recency]


In [None]:
# Start the Spark session used by the rest of this notebook (getOrCreate is called on the builder)

spark = (
    SparkSession.builder
    .appName('LinkedInJobFetch')
    .getOrCreate()
)

In [None]:
data = {'Job_Title' : job_list,
        'Company' : job_company,
        'Location' : job_location,
        'Job_Recency' : job_recency }

col_headings = list(data.keys())    # Column names; also used later to build the read schema

# zip() stops at the shortest list, so columns of unequal length would silently lose rows.
# Report the mismatch instead of hiding it.
column_lengths = {name: len(values) for name, values in data.items()}
if len(set(column_lengths.values())) > 1:
    print(f"WARNING: scraped columns differ in length {column_lengths}; rows are truncated to the shortest column")

spark_data = list(zip(*data.values()))   # one tuple per job: (title, company, location, recency)
if not spark_data:
    # createDataFrame cannot infer a schema from an empty list; fail with a clear message instead
    raise ValueError("No job rows were scraped; nothing to write")

spark_df = spark.createDataFrame(spark_data, schema = col_headings)  # Creating Spark dataframe with named columns

# spark_df.show(n = spark_df.count(), truncate=False)

csv_path = '/content/spark_linkedin_jobs'  # Saving file in Colab's local storage in a dedicated folder

# header = False in both branches: the files contain data rows only, column names are supplied on read
if os.path.exists(csv_path):   # If path exists
    spark_df.write.csv(csv_path, mode = "append", header = False)
    print(f"Data APPENDED to {csv_path}")
else:
    spark_df.write.csv(csv_path, mode = "overwrite", header = False)
    print(f"Data SAVED to {csv_path}")  # Printed ONLY ON FIRST RUN of this cell; next run onward, 'print' in above 'if' runs


Data SAVED to /content/spark_linkedin_jobs


**By default, Spark writes CSV output as multiple part files inside the specified folder, e.g.**

```
spark_linkedin_jobs/
├── part-00000-xxxx.csv
├── part-00001-xxxx.csv
├── _SUCCESS

```

**If a single output file is wanted, change** ```spark_df.write.csv``` **to** ```spark_df.coalesce(1).write.csv```.


In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# Define column headers' schema: one nullable string column per heading
col_schema = StructType([StructField(name, StringType(), True) for name in col_headings])

# Reading the content of the 'spark_linkedin_jobs' folder (files were written without a header row,
# so the column names come from the schema above)
r = spark.read.csv('/content/spark_linkedin_jobs', schema = col_schema, multiLine = True)

# dropDuplicates() returns a NEW DataFrame and leaves the original untouched, so the result
# must be assigned back; otherwise the duplicate rows are never removed.
r = r.dropDuplicates()

r.show(n = r.count(), truncate=False)  # 'r.count()' returns the total no. of rows


+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Job_Title                                  |Company                               |Location                       |Job_Recency |
+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Data Analyst, Finance & Strategy, MarComms |Netflix                               |Los Angeles, CA                |17 hours ago|
|Data Analyst                               |KFC                                   |Plano, TX                      |8 hours ago |
|Analyst, Insights & Analytics              |LoopMe                                |New York, NY                   |19 hours ago|
|Data Analyst                               |CULT GAIA                             |Los Angeles Metropolitan Area  |1 week ago  |
|Data Analyst                               |Zest AI                               |Burban

In [None]:
# Number of rows currently in `r` (the data read back from the CSV folder)
r.count()

60

**The first run ends at the cell above. Now let us run the same code again to fetch job results a second time and see what changes. We'll re-run every cell above EXCEPT the Spark session creation cell:**


In [None]:
job_url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

# Download the search page. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() reports an HTTP error instead of silently parsing an error page.
souped = requests.get(job_url, timeout=30)
souped.raise_for_status()
job_html = BeautifulSoup(souped.content, "html.parser")  # parse the raw HTML into a searchable tree

# Locate the tags that hold each job detail (find_all returns matches in page order)
job_titles = job_html.find_all('h3', {'class' : 'base-search-card__title'})  # all <h3> tags of the given class
company_name = job_html.find_all('h4', {'class' : 'base-search-card__subtitle'})
location = job_html.find_all('span', {'class' : 'job-search-card__location'})

# The posting date is looked up under two possible classes. Both are matched in ONE search
# so the results stay in page order; searching each class separately and concatenating the
# two result lists would pair recency values with the wrong jobs.
recency = job_html.find_all('time', {'class' : ['job-search-card__listdate--new', 'job-search-card__listdate']})

# 'getText()' returns a tag's text content; 'strip()' removes leading/trailing whitespace
job_list = [tag.getText().strip() for tag in job_titles]
job_company = [tag.getText().strip() for tag in company_name]
job_location = [tag.getText().strip() for tag in location]
job_recency = [tag.getText().strip() for tag in recency]


**Pay attention to the output of the next cell; note which** ```print``` **statement runs here (compare it with the one from the first run above).**

In [None]:
csv_path = '/content/spark_linkedin_jobs'  # Saving file in Colab's local storage in a dedicated folder

# Rebinding 'data' to a fresh dictionary already discards the previous content, so no
# explicit clear() is needed (and this cell no longer fails if 'data' was never defined).
data = {'Job_Title' : job_list,
        'Company' : job_company,
        'Location' : job_location,
        'Job_Recency' : job_recency }

col_headings = list(data.keys())    # Column names; also used by the read cell to build its schema

# zip() stops at the shortest list, so columns of unequal length would silently lose rows.
# Report the mismatch instead of hiding it.
column_lengths = {name: len(values) for name, values in data.items()}
if len(set(column_lengths.values())) > 1:
    print(f"WARNING: scraped columns differ in length {column_lengths}; rows are truncated to the shortest column")

spark_data = list(zip(*data.values()))   # one tuple per job: (title, company, location, recency)
if not spark_data:
    # createDataFrame cannot infer a schema from an empty list; fail with a clear message instead
    raise ValueError("No job rows were scraped; nothing to write")

spark_df = spark.createDataFrame(spark_data, schema = col_headings)  # Creating Spark dataframe with named columns

# spark_df.show(n = spark_df.count(), truncate=False)

# header = False in both branches: the files contain data rows only, column names are supplied on read
if os.path.exists(csv_path):   # If path exists
    spark_df.write.csv(csv_path, mode = "append", header = False)
    print(f"Data APPENDED to {csv_path}")  # Printed in o/p
else:
    spark_df.write.csv(csv_path, mode = "overwrite", header = False)
    print(f"Data SAVED to {csv_path}")


Data APPENDED to /content/spark_linkedin_jobs


**Reading the content again:**

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define schema for columns: one nullable string column per heading
col_schema = StructType([StructField(name, StringType(), True) for name in col_headings])

# Reading the content of 'spark_linkedin_jobs' (files were written without a header row,
# so the column names come from the schema above)
r = spark.read.csv('/content/spark_linkedin_jobs', schema = col_schema, multiLine = True)

# dropDuplicates() returns a NEW DataFrame and leaves the original untouched, so the result
# must be assigned back; otherwise the duplicate rows are never removed.
r = r.dropDuplicates()

r.show(n = r.count(), truncate=False)  # 'r.count()' returns the total no. of rows


+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Job_Title                                  |Company                               |Location                       |Job_Recency |
+-------------------------------------------+--------------------------------------+-------------------------------+------------+
|Data Analyst, Finance & Strategy, MarComms |Netflix                               |Los Angeles, CA                |17 hours ago|
|Data Analyst                               |KFC                                   |Plano, TX                      |8 hours ago |
|Analyst, Insights & Analytics              |LoopMe                                |New York, NY                   |19 hours ago|
|Data Analyst                               |CULT GAIA                             |Los Angeles Metropolitan Area  |1 week ago  |
|Data Analyst                               |Zest AI                               |Burban

In [None]:
# Row count of `r` after the second write. The first run reported 60, so a different
# number here means the rows read back from the folder have changed.
r.count()

120

In [None]:
# End of the Spark work: stop the session that was created with getOrCreate() earlier
spark.stop() # Stop session to free resources

In [None]:
# Shell command (IPython '!' syntax): recursively force-delete the CSV output folder written by this notebook
!rm -rf /content/spark_linkedin_jobs/  # Erase folder as the demonstration task is completed

# Shell commands for force deleting Colab folders/files


```
!rm -rf /content/folderName/            # -rf for recursive force (delete all folder content & the folder itself)
!rm -f /content/fileName.fileFormat     # -f for single file
```

e.g.
```
# !rm -rf /content/spark_linkedin_jobs.csv/  

# !rm -f /content/pd_linkedin_jobs.csv  
```

### Exporting the current notebook to '/content/' as HTML file

```
Download the current notebook as an .ipynb file.
Re-upload the saved file to the '/content/' folder.
Run the following code:
```

In [None]:
# Convert the uploaded .ipynb file to HTML

import nbformat
from nbconvert import HTMLExporter

# File paths
notebook_path = "/content/PySpark_LinkedInJobsFetch.ipynb"
html_path = "/content/PySpark_LinkedInJobsFetch.html"

# Load the notebook. The encoding is given explicitly (matching the UTF-8 used for the
# HTML output below) so the read does not depend on the platform's default encoding.
with open(notebook_path, encoding="utf-8") as f:
    notebook_content = nbformat.read(f, as_version=4)

# Convert to HTML; from_notebook_node returns (body, resources) and only the body is needed
html_exporter = HTMLExporter()
body, _ = html_exporter.from_notebook_node(notebook_content)

# Save HTML file
with open(html_path, "w", encoding="utf-8") as f:
    f.write(body)

# Display the HTML file path as the cell's output, ready for download
html_path
