#### Start of the Program
Import Spark Context, Spark Session and Spark Conf

In [None]:
from pyspark import SparkContext, SparkConf

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import expr
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

In [None]:
import os
# Submit arguments that make Spark pull in the PostgreSQL JDBC driver package.
sparkClassPath = '--packages org.postgresql:postgresql:42.2.5 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = sparkClassPath

##### Create a Spark session named "PySpark Assignment"
Because we are running in local mode, the master is set to `local`; on a cluster we would specify YARN or another cluster manager instead.

In [None]:
# Build (or reuse) the session: local master, application name "PySpark Assignment".
spark = (
    SparkSession.builder
    .master("local")
    .appName("PySpark Assignment")
    .getOrCreate()
)

##### Specify the path where the file is Saved
The path must point at the HDFS namenode, which in this setup is reachable at hdfs:8020 (for a namenode running on the local machine this would be localhost:9000 instead).
So the final path has the form -
hdfs://hdfs:8020/folder/filename

In [None]:
# HDFS URI of the confirmed-cases CSV that is read into fileDf.
hdfs_file_path = "hdfs://hdfs:8020/data/time_series_covid19_confirmed_global.csv"

##### Read the file from the location specified above.
Using the Spark session, read the CSV file stored at the HDFS location, setting the option "header" to "true" so that the first row is used as the column names.

In [None]:
# Load the CSV from HDFS; header=true takes the column names from the first row.
fileDf = (
    spark.read
    .option("header", "true")
    .csv(hdfs_file_path)
)

##### Create a temp view
Creating a temp view from the dataframe which can be used in Spark.sql

In [None]:
# Register fileDf as the temporary view "number_of_cases" (replacing any view of that name).
# NOTE(review): no query against this view appears in the visible part of the file.
fileDf.createOrReplaceTempView("number_of_cases")

##### Calculate the number of columns

In [None]:
# Total number of columns in the input frame; used to address columns counted from the end.
numberOfColumns = len(fileDf.columns)

#### List to store the last 14 days column names

In [None]:
# Positional indexes (not names) of the last 14 columns are collected in this list.
arrayOfLast14DaysColumn = []

#### Loop through to get the latest 14 days column

In [None]:
# Positions of the 14 right-most columns, last column first
# (numberOfColumns - 1 down to numberOfColumns - 14).
# The name is rebound instead of appended to, so this cell is idempotent:
# re-running it can never grow the list past 14 entries, which would
# otherwise make the later select() pick every column twice.
arrayOfLast14DaysColumn = [numberOfColumns - offset for offset in range(1, 15)]

#### Get last 14 days data into a dataframe

In [None]:
# Translate the stored positions into column names and project only those columns.
last14DaysDF = fileDf.select(
    *[fileDf.columns[position] for position in arrayOfLast14DaysColumn]
)

#### Showing 5 rows out of the last 14 days dataframe

In [None]:
# Print the first 5 rows of the 14-column projection as a sanity check.
last14DaysDF.show(5)

#### Summing up the values of the last 14 days and saving it into a column named result

In [None]:
# Add a 'result' column holding the row-wise total of all 14 selected columns.
# Python's built-in sum() folds the Column objects together with "+".
resultSumDf = last14DaysDF.withColumn(
    'result',
    sum(last14DaysDF[day_column] for day_column in last14DaysDF.columns),
)

#### Show 5 rows out of the resultSumDf

In [None]:
# Print the first 5 rows, now including the 'result' total column.
resultSumDf.show(5)

#### Getting the location details (latitude and longitude) from the main dataframe

In [None]:
# Keep only the coordinates, renamed to lower-case 'latitude' / 'longitude'.
countryDF = fileDf.select(
    col("Lat").alias("latitude"),
    col("Long").alias("longitude"),
)

In [None]:
# Print the first 5 latitude/longitude rows.
countryDF.show(5)

#### Add a row index to resultSumDf and countryDF, so that we can Join the dataframes

In [None]:
# Attach a 1-based 'row_index' (row_number ordered by monotonically_increasing_id)
# so this frame can be matched to another one by row position.
resultSumDfRowIndex = resultSumDf.withColumn(
    'row_index',
    row_number().over(
        Window.orderBy(monotonically_increasing_id())
    ),
)

In [None]:
# Attach a 1-based 'row_index' (row_number ordered by monotonically_increasing_id)
# to the coordinate frame, for matching by row position.
countryDFRowIndex = countryDF.withColumn(
    'row_index',
    row_number().over(
        Window.orderBy(monotonically_increasing_id())
    ),
)

#### Joining the dataframes to get the final result

In [None]:
# Combine the 14 day columns, their row-wise total and the coordinates in a
# single projection of fileDf, rather than joining two derived frames on a
# synthetic row index. A positional join only lines up when both frames are
# evaluated in the same row order, which Spark does not guarantee
# (monotonically_increasing_id depends on partitioning), and the
# un-partitioned window behind the index forces every row into one partition.
# Resulting column layout is unchanged:
#   <14 day columns>, 'result', 'latitude', 'longitude'
countryDFWithSum = fileDf.select(
    *[fileDf[day_column] for day_column in last14DaysDF.columns],
    sum(fileDf[day_column] for day_column in last14DaysDF.columns).alias("result"),
    col("Lat").alias("latitude"),
    col("Long").alias("longitude"),
)

#### Show 5 rows from the countryDFWithSum

In [None]:
# Print the first 5 rows of the combined frame (day columns, result, latitude, longitude).
countryDFWithSum.show(5)

In [None]:
# Expose the combined frame to Spark SQL under the view name "countryDFWithSum".
countryDFWithSum.createOrReplaceTempView("countryDFWithSum")

#### Group by location (latitude, longitude) to sum up all the results for a particular location

In [None]:
# Total the per-row 'result' values for each (latitude, longitude) pair via the temp view.
# NOTE(review): the alias spelling "total_numner_of_cases" is kept as-is because it
# is the column name written to the outputs.
numberOfCasesPerCountry = spark.sql(
    "select latitude, longitude, sum(result) as total_numner_of_cases "
    "from countryDFWithSum "
    "group by latitude, longitude"
)

#### The directory path where we need to store the result

In [None]:
# HDFS directory that receives the aggregated CSV output.
resultLocation = "hdfs://hdfs:8020/data/result"

#### Store the final DF to the result folder as a CSV file; coalesce is used here to produce one single file instead of partitioned files

In [None]:
# Collapse to one partition so a single CSV part file is produced, then
# overwrite the target directory, writing a header row.
(
    numberOfCasesPerCountry
    .coalesce(1)
    .write
    .mode('overwrite')
    .option("header", "true")
    .csv(resultLocation)
)

#### Adding the index column which can be treated as primary key

In [None]:
# Add a 1-based 'row_index' to the aggregated frame to serve as a surrogate key.
finalDF = numberOfCasesPerCountry.withColumn(
    'row_index',
    row_number().over(
        Window.orderBy(monotonically_increasing_id())
    ),
)

In [None]:
# Print the first 10 rows of the keyed, aggregated frame.
finalDF.show(10)

#### Write the dataframe into the table

In [None]:
# Persist finalDF to the PostgreSQL table "average" over JDBC, replacing any
# existing table contents (mode "overwrite").
# NOTE(review): the connection credentials are hard-coded here.
(
    finalDF.write
    .mode("overwrite")
    .format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/WheaterData")
    .option("dbtable", "average")
    .option("user", "postgres")
    .option("password", "password")
    .option("driver", "org.postgresql.Driver")
    .save()
)

#### Read the data from the table to verify

In [None]:
# Read the "average" table back over JDBC to confirm what was written.
# NOTE(review): the connection credentials are hard-coded here.
number_of_cases_per_country = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/WheaterData")
    .option("dbtable", "average")
    .option("user", "postgres")
    .option("password", "password")
    .option("driver", "org.postgresql.Driver")
    .load()
)

#### Show the result from the table

In [None]:
# Print the first 5 rows read back from the database table.
number_of_cases_per_country.show(5)