In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Initialize SparkSession
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()

# Load the dataset into a DataFrame
df = spark.read.csv("Cleaned_DS_Jobs.csv", header=True, inferSchema=True)

# Show the schema and check for null or malformed entries
df.printSchema()
df.show()

root
 |-- Job Title: string (nullable = true)
 |-- Salary Estimate: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Type of ownership: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- company_age: integer (nullable = true)
 |-- python: integer (nullable = true)
 |-- spark: integer (nullable = true)
 |-- tableau: integer (nullable = true)

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|
+--------------------+---------------+------+-----------------+------------------

In [37]:
# Split the salary_estimate column into two parts using the '-' delimiter
df = df.withColumn("min_salary", split(col("Salary Estimate"), "-").getItem(0).cast("integer")) \
       .withColumn("max_salary", split(col("Salary Estimate"), "-").getItem(1).cast("integer"))


In [38]:
df = df.drop("Salary Estimate")
df.show()

+--------------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|           Job Title|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|
+--------------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|   Sr Data Scientist|   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|
|      Data Scientist|   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services|       VA|         52|     0|    0|      0|       137|       171|
|      Data Sci

In [39]:
df = df.withColumn("average salary",((col("min_salary") + col("max_salary"))/2).cast("float"))

In [40]:
df = df.withColumn("Rating",when(col("Rating") <= 0, 1).otherwise(col("Rating")))

In [41]:
columns_with_null = [c for c in df.columns if df.filter(col(c).isNull()).count()>0]
print(f"Columns with NULL Values: {columns_with_null}")

df = df.fillna({col: 1 for col in columns_with_null})

columns_with_null = [c for c in df.columns if df.filter(col(c).isNull()).count()>0]
print(f"Columns with NULL Values: {columns_with_null}")

Columns with NULL Values: ['Size', 'Type of ownership', 'Industry', 'Sector']
Columns with NULL Values: []


In [43]:
# Group by Job Title and calculate the average salary
job_title_avg_salary = df.groupBy("Job Title").avg("average salary")
job_title_avg_salary.show()

+--------------------+-------------------+
|           Job Title|avg(average salary)|
+--------------------+-------------------+
|Senior Data Scien...|  99.33333333333333|
|Clinical Data Ana...|              164.5|
|Senior Business I...|               90.0|
|Data Analyst/Engi...|              115.5|
|Staff BI and Data...|              107.0|
|Intelligence Data...|              90.75|
|Report Writer-Dat...|               92.5|
|Hydrogen/Tritium ...|              148.0|
|Business Intellig...|             109.25|
|        Data Modeler|              154.0|
|Scientist / Group...|              197.5|
|Senior Research S...|              105.0|
|Software Engineer...|              164.5|
|   Sr Data Scientist|             126.75|
|COMPUTER SCIENTIS...|              271.5|
|Data Scientist/Ma...|              125.5|
|Data Scientist - ...|              120.5|
|  Decision Scientist|               94.5|
|Data Scientist - ...|              97.75|
|Data Scientist / ...|              128.5|
+----------

In [47]:
# Group by Company Size and calculate the average salary
company_size_avg_salary = df.groupBy("Size").avg("average salary")
company_size_avg_salary.show()

+--------------------+-------------------+
|                Size|avg(average salary)|
+--------------------+-------------------+
|5001 to 10000 emp...|  126.6639344262295|
|             Unknown| 143.38235294117646|
| 51 to 200 employees|       127.83203125|
|1001 to 5000 empl...|  121.7548076923077|
|501 to 1000 emplo...| 120.93506493506493|
|                   1|  130.7962962962963|
|201 to 500 employees|  119.1987951807229|
|    10000+ employees| 122.80379746835443|
|   1 to 50 employees|              120.0|
+--------------------+-------------------+



In [50]:
import pandas as pd

pd_frame = df.toPandas()
pd_frame.to_csv("cleaned_jobs.csv", index=False)