In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config("spark.sql.warehouse.dir", f"/user/itv013286/warehouse"). \
        enableHiveSupport(). \
        master('yarn'). \
        getOrCreate()

In [2]:
spark

Consider the Covid19 Dataset

Cases
=====
date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,dataQualityGrade,lastUpdateEt,dateModified,checkTimeEt,death,hospitalized,dateChecked,totalTestsViral,positiveTestsViral,negativeTestsViral,positiveCasesViral,deathConfirmed,deathProbable,fips,positiveIncrease,negativeIncrease,total,totalTestResults,totalTestResultsIncrease,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade

States
=====
state,notes,covid19Site,covid19SiteSecondary,covid19SiteTertiary,twitter,covid19SiteOld,name,fips,pui,pum

#### 1.Find the top10 states with the highest no.of positive cases.


In [3]:
rdd = spark.sparkContext.textFile("/public/trendytech/covid19/cases/covid_dataset_cases.csv")


In [4]:
rdd.take(10)

['20200122,AP,2,0,48,26,15,18,2,38,10,34,B,18,19/05/2022,23,24,29,34,19,45,5,44,42,49,53,0,0,2,2,0,2,0,0,8f8db794931706272489cddd51e917a4a69c8c9b,0,0,0,0,0',
 '20200123,AP,2,0,48,41,2,20,30,40,5,50,B,1,08/11/2022,14,7,33,36,14,18,36,37,45,8,53,0,0,2,2,0,2,0,0,e16af2a6a8f060355ff5ba499a28309a262c0b1e,0,0,0,0,0',
 '20200124,HP,2,0,16,14,5,29,43,22,11,11,D,31,17/05/2022,10,37,11,25,45,25,2,32,30,41,53,0,0,2,2,0,2,0,0,094154f68e74bfc30b977cdee888f9c07be4360e,0,0,0,0,0',
 '20200125,HP,2,0,10,13,41,50,26,19,34,8,D,40,07/10/2022,32,5,33,9,50,31,18,38,7,16,53,0,0,2,2,0,2,0,0,9b52ca94dd2a996822542ea5f17a7363e7ad91cf,0,0,0,0,0',
 '20200126,AS,2,0,15,43,23,45,20,46,15,30,D,31,28/12/2022,22,14,1,29,2,24,15,12,9,10,53,0,0,2,2,0,2,0,0,7acb526e14f20a29cc74a0b32a37328bc6eac6c2,0,0,0,0,0',
 '20200127,HR,2,0,31,46,24,46,41,32,39,20,C,25,06/10/2022,32,16,9,21,14,40,14,49,32,49,53,0,0,2,2,0,2,0,0,90960ca688f971a0b7e4dc14b893f9c5d76d2bb6,0,0,0,0,0',
 '20200128,KA,2,0,35,1,17,33,46,4,29,30,C,10,12/02/2022,4

In [5]:
map_rdd = rdd.map(lambda x : (x.split(",")[1],(int(x.split(",")[2]))))

In [6]:
map_rdd.take(10)

[('AP', 2),
 ('AP', 2),
 ('HP', 2),
 ('HP', 2),
 ('AS', 2),
 ('HR', 2),
 ('KA', 2),
 ('KA', 3),
 ('WA', 3),
 ('WA', 3)]

In [7]:
reduced_rdd = map_rdd.reduceByKey(lambda x,y : x+y).sortBy(lambda x : x[1], ascending = False)

In [8]:
reduced_rdd.take(10)

[('WA', 1701),
 ('GA', 1017),
 ('MH', 730),
 ('MI', 61),
 ('CA', 53),
 ('GJ', 35),
 ('BR', 23),
 ('JH', 13),
 ('CG', 8),
 ('RI', 6)]

#### 2.Find the total count of people in ICU currently

In [17]:
map_rdd = rdd.map(lambda x : int(x.split(",")[7]))

In [18]:
map_rdd.take(10)

[18, 20, 29, 50, 45, 46, 33, 44, 23, 41]

In [19]:
map_rdd.reduce(lambda x,y: x+y)

1344

#### 3.Find the top15 States having maximum no. of recovery

In [27]:
map_rdd1 = rdd.map(lambda x: (x.split(",")[1],(int(x.split(",")[11])))).reduceByKey(lambda x,y:x+y)

In [31]:
sorted_rdd = map_rdd1.sortBy(lambda x: x[1],ascending = False)

In [32]:
sorted_rdd.take(15)

[('WA', 451),
 ('MH', 165),
 ('MI', 101),
 ('GA', 87),
 ('AP', 84),
 ('RI', 72),
 ('BR', 68),
 ('JH', 50),
 ('KA', 43),
 ('AZ', 38),
 ('AS', 30),
 ('GJ', 27),
 ('CA', 23),
 ('HR', 20),
 ('HP', 19)]

#### 4.Find the top 3 States having least no.of deaths

In [34]:
map_rdd = rdd.map(lambda x : (x.split(",")[1],int(x.split(",")[23])))

In [35]:
map_rdd.take(10)

[('AP', 42),
 ('AP', 45),
 ('HP', 30),
 ('HP', 7),
 ('AS', 9),
 ('HR', 32),
 ('KA', 26),
 ('KA', 31),
 ('WA', 5),
 ('WA', 17)]

In [36]:
result = map_rdd.reduceByKey(lambda x,y : x+y).sortBy(lambda x: x[1])

In [37]:
result.take(5)

[('AS', 9), ('JH', 10), ('CG', 31), ('HR', 32), ('HP', 37)]

#### 5.Find the total number of people hospitalized currently.

In [38]:
map_rdd = rdd.map(lambda x :int(x.split(",")[5]))

In [39]:
map_rdd.reduce(lambda x,y :x+y)

1319

#### 6.List the twitter handle and fipscode for the top15 states with the highest number of totalcases.

In [5]:
twitter_rdd = spark.sparkContext.textFile("/public/trendytech/covid19/states/covid_dataset_states.csv")

In [6]:
twitter_rdd.take(20)

['HP,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@HPCovid,https://arcg.is/0brSGj,null,53,,',
 'AS,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@ASCovid,null,null,6,null,null',
 'HR,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@HRCovid,null,null,9,null,null',
 'KA,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@KACovid,null,null,53,null,null',
 'WA,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@WACovid,null,null,44,null,null',
 'CG,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@CGCovid,null,null,53,null,null',
 'BR,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@BRCovid,null,null,53,null,null',
 'JH,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,null,@JHCovid,null,null,53,null,null',
 'GJ,null,https://covid19

In [7]:
map_rdd = rdd.map(lambda x: (x.split(",")[1],int(x.split(",")[28])))

In [8]:
mapped_rdd = twitter_rdd.map(lambda x : (x.split(",")[0],(x.split(",")[5],int(x.split(",")[8]))))

In [9]:
mapped_rdd.take(10)

[('HP', ('@HPCovid', 53)),
 ('AS', ('@ASCovid', 6)),
 ('HR', ('@HRCovid', 9)),
 ('KA', ('@KACovid', 53)),
 ('WA', ('@WACovid', 44)),
 ('CG', ('@CGCovid', 53)),
 ('BR', ('@BRCovid', 53)),
 ('JH', ('@JHCovid', 53)),
 ('GJ', ('@GJCovid', 44)),
 ('MH', ('@MHCovid', 26))]

In [10]:
total_count = map_rdd.reduceByKey(lambda x,y :x+y)

In [11]:
total_count.take(10)

[('AP', 4),
 ('HP', 4),
 ('AS', 2),
 ('CG', 8),
 ('BR', 23),
 ('JH', 13),
 ('GJ', 35),
 ('MH', 730),
 ('RI', 16),
 ('CA', 515)]

In [12]:
join_rdd = total_count.join(mapped_rdd)

In [20]:
result1 = join_rdd.sortBy(lambda x :x[1][0], ascending=False)

In [21]:
result1.take(10)

[('WA', (2100, ('@WACovid', 44))),
 ('GA', (1034, ('@GACovid', 44))),
 ('MH', (730, ('@MHCovid', 26))),
 ('CA', (515, ('@CACovid', 4))),
 ('MI', (61, ('@MICovid', 53))),
 ('GJ', (35, ('@GJCovid', 44))),
 ('AZ', (34, ('@AZCovid', 53))),
 ('BR', (23, ('@BRCovid', 53))),
 ('RI', (16, ('@RICovid', 26))),
 ('JH', (13, ('@JHCovid', 53)))]