In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

# Build (or reuse) the Spark session, asking Spark to fetch the spark-excel
# connector through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("Excel Processing with Spark")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.7")
    .getOrCreate()
)

# Load the workbook through the spark-excel data source with the `header`
# and `inferSchema` options switched on.
file_name = 'raw_data.xlsx'
df_spark = (
    spark.read
    .format("com.crealytics.spark.excel")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file_name)
)



:: loading settings :: url = jar:file:/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
com.crealytics#spark-excel_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-456a0f4b-af5f-4f02-8f1f-7a8828c694c8;1.0
	confs: [default]
	found com.crealytics#spark-excel_2.12;0.13.7 in central
	found org.apache.poi#poi;4.1.2 in central
	found commons-codec#commons-codec;1.13 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.poi#poi-ooxml;4.1.2 in central
	found org.apache.poi#poi-ooxml-schemas;4.1.2 in central
	found org.apache.xmlbeans#xmlbeans;3.1.0 in central
	found com.github.virtuald#curvesapi;1.06 in central
	found com.norbitltd#spoiwo_2.12;1.8.0 in central
	found org.scala-lang.modules#scala-xml_2.12;1.3.0 in central
	found com.github.pjfanning#excel-streaming-reader;2.3.6 in central
	

In [2]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import matplotlib.pyplot as plt

In [3]:
# Filtering Unnecessary Columns
# Collect every column whose name contains 'Unnamed', add 'iso_code', and
# drop them all in one call.
cols_to_drop = [name for name in df_spark.columns if 'Unnamed' in name]
cols_to_drop.append('iso_code')
df_spark = df_spark.drop(*cols_to_drop)
# Check Columns
print("\nColumn names in the dataset:", df_spark.columns, sep="\n")


Column names in the dataset:
['location', 'date', 'total_cases', 'total_deaths', 'stringency_index', 'population', 'gdp_per_capita', 'human_development_index']


In [4]:
# Handling Missing Values
# Replace nulls in each numeric column with that column's mean, then report
# how many nulls remain in every column.
# NOTE(review): only 'int' and 'double' dtypes are treated as numeric here;
# 'bigint' / 'float' / decimal columns would be skipped -- confirm intended.
numeric_columns = [col_name for col_name, dtype in df_spark.dtypes if dtype in ['int', 'double']]

if numeric_columns:
    # Compute every mean in ONE aggregation job rather than one job per column.
    mean_row = df_spark.agg(*[F.mean(F.col(c)) for c in numeric_columns]).collect()[0]
    # A mean of None means the column holds no non-null values, so there is
    # nothing to impute with; such columns are left untouched.
    column_means = {c: m for c, m in zip(numeric_columns, mean_row) if m is not None}
    # Apply all imputations in a single projection, preserving column order.
    df_spark = df_spark.select(*[
        F.when(F.col(c).isNull(), column_means[c]).otherwise(F.col(c)).alias(c)
        if c in column_means else F.col(c)
        for c in df_spark.columns
    ])

# Count the remaining nulls of every column in ONE aggregation job.
report_columns = df_spark.columns
if report_columns:
    # count() ignores nulls, so counting a when() without otherwise() tallies
    # exactly the rows where the condition held.
    null_count_row = df_spark.agg(*[
        F.count(F.when(F.col(c).isNull(), 1)) for c in report_columns
    ]).collect()[0]
    for column, count_missing in zip(report_columns, null_count_row):
        if count_missing > 0:
            print(f"Missing values in '{column}': {count_missing}")
        else:
            print(f"'{column}' has no missing values.")

                                                                                

'location' has no missing values.
'date' has no missing values.
'total_cases' has no missing values.
'total_deaths' has no missing values.
'stringency_index' has no missing values.
'population' has no missing values.
'gdp_per_capita' has no missing values.
'human_development_index' has no missing values.


In [7]:
# Handling Outliers and Extremes
# Clip every numeric column to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] using approximate
# quartiles, then verify that no value lies outside the recorded bounds.
IQR_MULTIPLIER = 1.5          # fence width, in multiples of the IQR
QUANTILE_REL_ERROR = 0.01     # relative error passed to approxQuantile

bounds = {}

# One approxQuantile call for all columns and both quartiles; the result is
# one [Q1, Q3] list per requested column, in the same order.
quartiles_per_column = (
    df_spark.approxQuantile(numeric_columns, [0.25, 0.75], QUANTILE_REL_ERROR)
    if numeric_columns else []
)

for column, column_quartiles in zip(numeric_columns, quartiles_per_column):
    # approxQuantile returns an empty list for a column with no non-null
    # values; indexing it would raise IndexError, so skip such columns.
    if len(column_quartiles) < 2:
        print(f"'{column}' has no non-null values; skipping outlier handling.")
        continue

    Q1, Q3 = column_quartiles
    IQR = Q3 - Q1
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR

    bounds[column] = (lower_bound, upper_bound)

    # Values below/above the fences are replaced by the fence itself; nulls
    # fail both comparisons and fall through unchanged.
    df_spark = df_spark.withColumn(
        column,
        F.when(F.col(column) < lower_bound, lower_bound)
        .when(F.col(column) > upper_bound, upper_bound)
        .otherwise(F.col(column)),
    )

# Verify the clipping: count out-of-bounds values for all columns in ONE job.
clipped_columns = [c for c in numeric_columns if c in bounds]
if clipped_columns:
    outlier_row = df_spark.agg(*[
        F.count(F.when((F.col(c) < bounds[c][0]) | (F.col(c) > bounds[c][1]), 1))
        for c in clipped_columns
    ]).collect()[0]
    for column, outlier_count in zip(clipped_columns, outlier_row):
        print(f"'{column}' has {outlier_count} outliers.")

'total_cases' has 0 outliers.
'total_deaths' has 0 outliers.
'stringency_index' has 0 outliers.
'population' has 0 outliers.
'gdp_per_capita' has 0 outliers.
'human_development_index' has 0 outliers.
