In [8]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql import DataFrame
import pandas
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains

In [9]:
spark = SparkSession \
    .builder \
    .appName("Vaccine Analysis") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [10]:
# Reading CSV File
IND_DF = spark.read.csv("hdfs://nameservice1/user/adityabend99edu/IND.csv", header='true', 
                      inferSchema='true')
USA_DF = spark.read.csv("hdfs://nameservice1/user/adityabend99edu/USA.csv", header='true', 
                      inferSchema='true')
AUS_DF = spark.read.csv("hdfs://nameservice1/user/adityabend99edu/AUS.csv", header='true', 
                      inferSchema='true')

In [11]:
AUS_DF = AUS_DF.withColumn("Country", lit("AUS")).withColumnRenamed("Vaccine Type", "VaccinationType").withColumnRenamed("Patient Name", "Name").withColumnRenamed("Date of Vaccination", "VaccinationDate").drop("Date of Birth","Unique ID")
AUS_DF.show()
AUS_DF.printSchema()

+---------+---------------+-------------------+-------+
|     Name|VaccinationType|    VaccinationDate|Country|
+---------+---------------+-------------------+-------+
|     Mike|            LMN|2022-05-11 00:00:00|    AUS|
|Jonnathan|            XYZ|         2021-13-13|    AUS|
| Cristina|            ABC|2022-03-12 00:00:00|    AUS|
+---------+---------------+-------------------+-------+

root
 |-- Name: string (nullable = true)
 |-- VaccinationType: string (nullable = true)
 |-- VaccinationDate: string (nullable = true)
 |-- Country: string (nullable = false)



In [12]:
# Converting to Date Format
AUS_DF = AUS_DF.select(col("Name"), col("VaccinationType"), to_date(col("VaccinationDate"),"yyyy-MM-dd").alias("date"), col("Country"))
AUS_DF.show()
#Testing
#print(test_Data_frame_data(AUS_DF))

+---------+---------------+----------+-------+
|     Name|VaccinationType|      date|Country|
+---------+---------------+----------+-------+
|     Mike|            LMN|2022-05-11|    AUS|
|Jonnathan|            XYZ|      null|    AUS|
| Cristina|            ABC|2022-03-12|    AUS|
+---------+---------------+----------+-------+



In [13]:
USA_DF = USA_DF.withColumn("Country", lit("USA")).drop("ID")
USA_DF = USA_DF.withColumn("VaccinationDate", USA_DF["VaccinationDate"].cast(StringType()))
USA_DF.printSchema()
USA_DF.show()
#Testing
#print(test_Data_frame_data(USA_DF))

root
 |-- Name: string (nullable = true)
 |-- VaccinationType: string (nullable = true)
 |-- VaccinationDate: string (nullable = true)
 |-- Country: string (nullable = false)

+----+---------------+---------------+-------+
|Name|VaccinationType|VaccinationDate|Country|
+----+---------------+---------------+-------+
| Sam|            EFG|        6152022|    USA|
|John|            XYZ|        1052022|    USA|
|Mike|            ABC|       12282021|    USA|
+----+---------------+---------------+-------+



In [14]:
# Converting to Date Format
USA_DF = USA_DF.select(col("Name"), col("VaccinationType"), to_date(col("VaccinationDate"),"MMddyyyy").alias("date"), col("Country"))
USA_DF.show()

+----+---------------+----------+-------+
|Name|VaccinationType|      date|Country|
+----+---------------+----------+-------+
| Sam|            EFG|      null|    USA|
|John|            XYZ|      null|    USA|
|Mike|            ABC|2021-12-28|    USA|
+----+---------------+----------+-------+



In [15]:
IND_DF = IND_DF.drop("DOB","Free or Paid","ID")
IND_DF = IND_DF.withColumn("Country", lit("IND"))
IND_DF.show()

+------+---------------+-------------------+-------+
|  Name|VaccinationType|    VaccinationDate|Country|
+------+---------------+-------------------+-------+
| Vikas|            XYZ|2022-01-01 00:00:00|    IND|
| Rahul|            ABC|2022-03-05 00:00:00|    IND|
|Sameer|            ABC|2022-02-20 00:00:00|    IND|
+------+---------------+-------------------+-------+



In [16]:
# Converting to Date Format
IND_DF = IND_DF.select(col("Name"), col("VaccinationType"), to_date(col("VaccinationDate"),"yyyy-MM-dd").alias("date"), col("Country"))
IND_DF.show()

#Testing
#print(test_Data_frame_data(IND_DF))

+------+---------------+----------+-------+
|  Name|VaccinationType|      date|Country|
+------+---------------+----------+-------+
| Vikas|            XYZ|2022-01-01|    IND|
| Rahul|            ABC|2022-03-05|    IND|
|Sameer|            ABC|2022-02-20|    IND|
+------+---------------+----------+-------+



In [17]:
# Data merging into single source of truth
import datetime


dfs = [AUS_DF,IND_DF,USA_DF]
Combine_df = reduce(DataFrame.unionAll, dfs)


Combine_df.show()

+---------+---------------+----------+-------+
|     Name|VaccinationType|      date|Country|
+---------+---------------+----------+-------+
|     Mike|            LMN|2022-05-11|    AUS|
|Jonnathan|            XYZ|      null|    AUS|
| Cristina|            ABC|2022-03-12|    AUS|
|    Vikas|            XYZ|2022-01-01|    IND|
|    Rahul|            ABC|2022-03-05|    IND|
|   Sameer|            ABC|2022-02-20|    IND|
|      Sam|            EFG|      null|    USA|
|     John|            XYZ|      null|    USA|
|     Mike|            ABC|2021-12-28|    USA|
+---------+---------------+----------+-------+

