In [0]:
import pandas as pd
import numpy as np
import datetime
from geopy.geocoders import Nominatim  # to library to work on Geo
geolocator = Nominatim(user_agent="Covid analysis")

# CoronaVirus Analysis : Data from John Hopkins University 

Coronavirus data collated everyday from Github. Source is given below :- 

https://github.com/CSSEGISandData/COVID-19/raw/ 

Updated till 24/4/2020 

Collating data for different dates and doing some analysis


In [0]:
def date_range(start, end):
    r = (end+datetime.timedelta(days=1)-start).days
    return [start+datetime.timedelta(days=i) for i in range(r)]
 
start = datetime.date(2020,1,22)
end = datetime.date(2020,4,26)
dateList = date_range(start, end)
datelist = []
for i in dateList :
  datelist.append(i.strftime("%m-%d-%Y")) 

In [0]:
# Getting data into list
list_data = []
for i in datelist : 
  try :
    df = pd.read_csv('https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv'.format(i))
    df['date'] = i
    list_data.append(df)
  except : 
    print(i)
    
## concatenating all the list elemets(i.e daily data) into a dataframe
df = pd.concat(list_data , ignore_index= True)

Data Cleansing
    1. Finding duplicate/similar columns
    2. Finding count of null values in duplicate columns
    3. Choosing a column which has less null values and replacing the null values from duplicate column
    4. Dropping duplicate columns 
    

In [0]:
df['Country_Region'] = np.where(df['Country_Region'].isnull() , df['Country/Region'] , df['Country_Region'])

In [0]:
df['Province_State'] = np.where(df['Province_State'].isnull() , df['Province/State'] , df['Province_State'])

In [0]:
df['Latitude'] = np.where(df['Latitude'].isnull() , df['Lat'] , df['Latitude'])
df['Longitude'] = np.where(df['Longitude'].isnull() , df['Long_'] , df['Longitude'])

In [0]:
df['Last_Update'] = np.where(df['Last_Update'].isnull() , df['Last Update'] , df['Last_Update'])

In [0]:
df = df.drop(['Province/State', 'Country/Region', 'Last Update','Lat','Long_'], axis = 'columns' )

Filling or Replacing NULL/NaN values with 0

In [0]:
df[['Confirmed','Deaths','Recovered']] = df[['Confirmed','Deaths','Recovered']].fillna(0)

Getting Deaths per confirmed

In [0]:
df['Death_Per_Confirmed'] = (df['Deaths'] / (df['Confirmed']+0.000000001))*100

In [12]:
country = df.groupby(['Country_Region','date'])[['Confirmed','Deaths','Recovered','Active','Death_Per_Confirmed']]
type(country)  # pandas.core.groupby.generic.DataFrameGroupBy, its a group by object

country = country.sum().reset_index()
country
# type(country) # pandas.core.frame.DataFrame, now has become a dataframe

Unnamed: 0,Country_Region,date,Confirmed,Deaths,Recovered,Active,Death_Per_Confirmed
0,Azerbaijan,02-28-2020,1.0,0.0,0.0,0.0,0.0
1,Afghanistan,02-24-2020,1.0,0.0,0.0,0.0,0.0
2,Afghanistan,02-25-2020,1.0,0.0,0.0,0.0,0.0
3,Afghanistan,02-26-2020,1.0,0.0,0.0,0.0,0.0
4,Afghanistan,02-27-2020,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
10256,occupied Palestinian territory,03-12-2020,0.0,0.0,0.0,0.0,0.0
10257,occupied Palestinian territory,03-14-2020,0.0,0.0,0.0,0.0,0.0
10258,occupied Palestinian territory,03-15-2020,0.0,0.0,0.0,0.0,0.0
10259,occupied Palestinian territory,03-16-2020,0.0,0.0,0.0,0.0,0.0


In [0]:
first_case = country.groupby('Country_Region')['date'].min().to_frame().reset_index().rename(columns = {'date':'first_date'})

In [0]:
country = pd.merge(country , first_case , on = 'Country_Region', how = 'left' )

In [15]:
# country = country.drop(columns = ['first_date_x','first_date_y','first_date'])
country

Unnamed: 0,Country_Region,date,Confirmed,Deaths,Recovered,Active,Death_Per_Confirmed,first_date
0,Azerbaijan,02-28-2020,1.0,0.0,0.0,0.0,0.0,02-28-2020
1,Afghanistan,02-24-2020,1.0,0.0,0.0,0.0,0.0,02-24-2020
2,Afghanistan,02-25-2020,1.0,0.0,0.0,0.0,0.0,02-24-2020
3,Afghanistan,02-26-2020,1.0,0.0,0.0,0.0,0.0,02-24-2020
4,Afghanistan,02-27-2020,1.0,0.0,0.0,0.0,0.0,02-24-2020
...,...,...,...,...,...,...,...,...
10256,occupied Palestinian territory,03-12-2020,0.0,0.0,0.0,0.0,0.0,03-10-2020
10257,occupied Palestinian territory,03-14-2020,0.0,0.0,0.0,0.0,0.0,03-10-2020
10258,occupied Palestinian territory,03-15-2020,0.0,0.0,0.0,0.0,0.0,03-10-2020
10259,occupied Palestinian territory,03-16-2020,0.0,0.0,0.0,0.0,0.0,03-10-2020


In [16]:
print(df.columns)
print(df.isnull().sum())
print(df.query(" Province_State != Province_State "))
print(np.where(df['Country_Region'].isnull()))

Index(['Confirmed', 'Deaths', 'Recovered', 'date', 'Latitude', 'Longitude',
       'FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update',
       'Active', 'Combined_Key', 'Death_Per_Confirmed'],
      dtype='object')
Confirmed                  0
Deaths                     0
Recovered                  0
date                       0
Latitude                4310
Longitude               4310
FIPS                   17602
Admin2                 17128
Province_State          9639
Country_Region             0
Last_Update                0
Active                  7617
Combined_Key            7617
Death_Per_Confirmed        0
dtype: int64
        Confirmed  Deaths  ...        Combined_Key Death_Per_Confirmed
35            2.0     0.0  ...                 NaN            0.000000
36            2.0     0.0  ...                 NaN            0.000000
37            1.0     0.0  ...                 NaN            0.000000
73            1.0     0.0  ...                 NaN            0.00

Creating new dataframe for India only



In [0]:
covid_in = df.query("Country_Region.str.upper() == 'INDIA' ").reset_index()

In [23]:
covid_in.isnull().sum()
np.where(covid_in['Latitude'].notnull())
covid_in.describe()
covid_in.sort_values("Last_Update",ascending = True)
covid_in.query(" Latitude == 'NaN' " ).head(2)
covid_in

Unnamed: 0,index,Confirmed,Deaths,Recovered,date,Latitude,Longitude,FIPS,Admin2,Province_State,Country_Region,Last_Update,Active,Combined_Key,Death_Per_Confirmed,location
0,430,1.0,0.0,0.0,01-30-2020,,,,,,India,1/30/20 16:00,,,0.000000,Unknown
1,491,1.0,0.0,0.0,01-31-2020,,,,,,India,1/31/2020 23:59,,,0.000000,Unknown
2,547,1.0,0.0,0.0,02-01-2020,,,,,,India,1/31/2020 8:15,,,0.000000,Unknown
3,607,2.0,0.0,0.0,02-02-2020,,,,,,India,2020-02-02T06:03:08,,,0.000000,Unknown
4,672,3.0,0.0,0.0,02-03-2020,,,,,,India,2020-02-03T21:43:02,,,0.000000,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,104753,21370.0,681.0,4370.0,04-22-2020,20.593684,78.96288,,,,India,2020-04-22 23:30:32,16319.0,India,3.186710,"20.593684,78.96288"
84,107865,23077.0,721.0,5012.0,04-23-2020,20.593684,78.96288,,,,India,2020-04-24 03:30:31,17344.0,India,3.124323,"20.593684,78.96288"
85,110993,24530.0,780.0,5498.0,04-24-2020,20.593684,78.96288,,,,India,2020-04-25 06:30:33,18252.0,India,3.179780,"20.593684,78.96288"
86,114127,26283.0,825.0,5939.0,04-25-2020,20.593684,78.96288,,,,India,2020-04-26 02:30:31,19519.0,India,3.138911,"20.593684,78.96288"


Adding new column location and  updating it as Unknown if latitude or longitude is not available

In [0]:
covid_in['location'] = np.where((covid_in['Latitude'].isnull() | covid_in['Longitude'].isnull()) , 'Unknown' , covid_in['Latitude'].astype(str)+','+covid_in['Longitude'].astype(str) )

Replacing location coordinates with geographical address using Geopy Library

In [20]:
# covid_in = covid_in.reset_index()
covid_in_nounknown = covid_in[(covid_in["location"]!= "Unknown")].reset_index(drop = True)
covid_in_nounknown.head(2)

Unnamed: 0,index,Confirmed,Deaths,Recovered,date,Latitude,Longitude,FIPS,Admin2,Province_State,Country_Region,Last_Update,Active,Combined_Key,Death_Per_Confirmed,location
0,2895,3.0,0.0,3.0,03-01-2020,21.0,78.0,,,,India,2020-02-16T07:43:02,,,0.0,"21.0,78.0"
1,3016,5.0,0.0,3.0,03-02-2020,21.0,78.0,,,,India,2020-03-02T22:33:09,,,0.0,"21.0,78.0"


In [21]:
n = 0 
for i in covid_in_nounknown['location']:
    # print(geolocator.reverse(i))
    print(i)
    geo = geolocator.reverse(i)
    print(geo)
    covid_in_nounknown.at[n,'location'] = geo
    print(covid_in_nounknown.at[n,'location'] )
    n += 1


21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Teosa, Amravati, Maharashtra, India
Mozari, Teosa, Amravati, Maharashtra, India
21.0,78.0
Mozari, Te

In [30]:
covid_in[['Latitude','Longitude']].head(88)

Unnamed: 0,Latitude,Longitude
0,,
1,,
2,,
3,,
4,,
...,...,...
83,20.593684,78.96288
84,20.593684,78.96288
85,20.593684,78.96288
86,20.593684,78.96288


In [0]:
covid_in_nounknown[[0:1],['location']]

SyntaxError: invalid syntax (<ipython-input-25-9f9371d44770>, line 1)

In [0]:
covid_in_nounknown.at[0,'location'] = 'India'