# COVID-19 Data Analysis

This notebook walks through some manipulations of COVID-19 data, where we extract knowledge in near real time from:


*   COVID-19 data provided by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University, available in the GitHub repository "CSSEGISandData/COVID-19":
https://github.com/CSSEGISandData/COVID-19

*   Data on countries and their capitals, used to fill in missing state values

*   Data extracted using the Twitter API, where we analyse tweets related to COVID-19

In [10]:
#!pip install ipywidgets
#!pip install matplotlib

In [124]:
import os
import re
from urllib.parse import urlencode

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import unidecode
from IPython.display import display

# Read the Geocoding API key from the GOOGLE_API_KEY environment variable so a
# real key never has to be saved inside the notebook; the original placeholder
# is kept as the fallback value when the variable is not set.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', 'xxxxxxxxxxx')

First, we load the data on confirmed / death / recovered cases around the world, rename the country and state columns, and fill in the missing state values using the Google Geocoding API.

In [72]:
def extract_adress(lon,lat,key,data_type='json'):
    """
    This function's purpose is to ask the Google geocode service for the
    human-readable address of a (lat, lon) coordinate pair.
    input: lon, lat : coordinates to look up
         key : API key to access geocode service
         data_type : response format placed in the endpoint URL, json by default
    output : the response body decoded as JSON, or an empty dict when the
             service answers with a status code outside 200-299
    Errors raised by requests (connection failure, timeout) and JSON decoding
    errors are not caught here and propagate to the caller.
    """
    endpoint = f"https://maps.googleapis.com/maps/api/geocode/{data_type}"
    # Format the pair explicitly as "lat,lon" (e.g. latlng=40.714224,-73.961452).
    # Slicing str((lat, lon)) relies on the repr of each element, which is
    # "np.float64(...)" for NumPy scalars under NumPy 2 and corrupts the query.
    params = {"latlng": f"{lat},{lon}", "key": key}
    url = f"{endpoint}?{urlencode(params)}"
    # Bound the wait so one stalled request cannot hang a whole apply() run.
    r = requests.get(url, timeout=10)
    # Every 2xx status is a success, 299 included.
    if not 200 <= r.status_code < 300:
        return {}
    return r.json()


# Matches a leading run of word characters or apostrophes.
english_check = re.compile(r"[\w']+")
# ASCII letters only. The range "A-z" also covers the six characters that sit
# between "Z" and "a" in ASCII ([ \ ] ^ _ `), so it must be spelled "A-Za-z".
word_check = re.compile(r"[A-Za-z]+")
# Pattern for runs of characters that are NOT one of the accepted letters.
regrex_skip_characters= '[^A-Za-zöéèàùüêôîóČšçó]+'

def _has_usable_text(text):
    """Return True when text, with spaces removed, starts with a word character or apostrophe."""
    return english_check.match(text.replace(' ', '')) is not None


def _clean_address_part(part):
    """Replace runs of unsupported characters with one space, collapse spaces, transliterate to ASCII."""
    cleaned = re.sub(regrex_skip_characters, ' ', part)
    cleaned = re.sub(' +', ' ', cleaned)
    return unidecode.unidecode(cleaned)


def get_state(lon,lat,key,country=''):
    """
    This function's purpose is to get the state related to (lat, lon) coordinates,
    or to return the country's name if no state is detected
    input: lon, lat : coordinates passed
         key : API key to access geocode service
         country : value returned when no usable state is found or the lookup fails
    output : state
    """
    try:
        # Use the key given by the caller rather than the module-level constant.
        geocoded = extract_adress(lon, lat, key=key)
        address = geocoded['results'][0]['formatted_address']
        parts = address.split(",")

        if len(parts) < 2:
            # The address holds a single component, so use it as the state.
            state = unidecode.unidecode(address)
            return state if _has_usable_text(state) else country

        # Try the second-to-last component first.
        state = _clean_address_part(parts[-2])
        if _has_usable_text(state):
            return state

        # Then the third-to-last component, when the address has one.
        if len(parts) < 3:
            return country
        state = _clean_address_part(parts[-3])
        return state if _has_usable_text(state) else country
    except Exception:
        # Best effort: any lookup or parsing failure falls back to the country.
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working
        # while this runs once per row inside DataFrame.apply.
        return country

In [73]:
# Download the three global time series (confirmed, deaths, recovered), in that order.
confirmed_data, death_data, recovered_data = (
    pd.read_csv(series_url)
    for series_url in (
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv',
    )
)

In [74]:
# Give the province/state and country/region columns shorter names in all three frames.
confirmed_data, death_data, recovered_data = (
    frame.rename(columns={'Province/State': 'State', 'Country/Region': 'Country'})
    for frame in (confirmed_data, death_data, recovered_data)
)

In [75]:
# Lookups already made, keyed by (lat, lon, country). The three frames are
# geocoded one after another, so a location that appears in more than one of
# them is sent to the API only once and gets the same answer in every frame.
_geocoded_states = {}


def _lookup_state(lat, lon, country):
    """Return get_state's answer for one location, calling it at most once per location."""
    location = (lat, lon, country)
    if location not in _geocoded_states:
        _geocoded_states[location] = get_state(lat=lat, lon=lon, country=country, key=GOOGLE_API_KEY)
    return _geocoded_states[location]


def _states_for_missing_rows(frame):
    """Return a Series holding a geocoded state for every row of frame whose State is null."""
    rows = frame.loc[frame['State'].isnull(), ['Lat', 'Long', 'Country']]
    states = [
        _lookup_state(lat, lon, country)
        for lat, lon, country in rows.itertuples(index=False, name=None)
    ]
    # An explicit object Series stays a Series even when no row has a null State.
    return pd.Series(states, index=rows.index, dtype=object)


# Rows whose State is already known are left as NaN in State_cleaned.
confirmed_data["State_cleaned"] = _states_for_missing_rows(confirmed_data)
death_data["State_cleaned"] = _states_for_missing_rows(death_data)
recovered_data["State_cleaned"] = _states_for_missing_rows(recovered_data)

Second, we merge the dataframes of confirmed / death / recovered cases around the world into a single dataframe, after unpivoting the per-date counts (columns -> rows).



In [80]:
# Unpivot each frame: every date column becomes a (Date, count) row, with the
# location columns kept as identifiers.
confirmed_data = confirmed_data.melt(
    id_vars=['State', 'State_cleaned', 'Country', 'Lat', 'Long'],
    var_name='Date',
    value_name='Confirmed',
)

death_data = death_data.melt(
    id_vars=['State', 'State_cleaned', 'Country', 'Lat', 'Long'],
    var_name='Date',
    value_name='Death',
)

recovered_data = recovered_data.melt(
    id_vars=['State', 'State_cleaned', 'Country', 'Lat', 'Long'],
    var_name='Date',
    value_name='Recovered',
)

In [81]:
# Index every frame by location and date, then join the deaths and recovered
# counts onto the confirmed counts.
combined_data = (
    confirmed_data.set_index(['State','State_cleaned', 'Country', 'Lat', 'Long','Date'])
    .join([
        other.set_index(['State','State_cleaned', 'Country', 'Lat', 'Long','Date'])
        for other in (death_data, recovered_data)
    ])
    .reset_index()
)
# Cast the coordinate and count columns to numbers, and Date to datetimes.
combined_data[['Lat', 'Long','Confirmed','Death','Recovered']] = (
    combined_data[['Lat', 'Long','Confirmed','Death','Recovered']].apply(pd.to_numeric)
)
combined_data['Date'] = pd.to_datetime(combined_data['Date'])

  uniq_tuples = lib.fast_unique_multiple([self._values, other._values], sort=sort)


Some states are missing in our dataset. This is why we try to recover them from the location (lon, lat) using the Google Maps Geocoding API on Google Cloud Platform. If the state is found we use it; otherwise, as a first step, we replace it with the country's name, and later we replace it with the name of the country's capital.

In [9]:
# For each column, report whether it contains at least one missing value.
combined_data.isna().any(axis=0)

State         True
Country      False
Lat          False
Long         False
Date         False
Confirmed    False
Death         True
Recovered     True
dtype: bool

These are the countries whose state values are null and need to be completed:

In [17]:
# Count the rows per country among the rows that have no state.
combined_data.loc[combined_data['State'].isna(), 'Country'].value_counts()

United Arab Emirates    252
Argentina               252
Sri Lanka               252
Qatar                   252
Angola                  252
                       ... 
Albania                 252
Switzerland             252
Grenada                 252
Egypt                   252
Romania                 252
Name: Country, Length: 185, dtype: int64

First, let's make a copy of our dataset so that we won't have to redo the processing that uses the Google Maps API.

In [83]:
# Work on an independent copy so the geocoded frame itself is left untouched.
combined_data_cp = combined_data.copy(deep=True)

Here we merge the states that already exist in the original dataset with the ones we found.

In [84]:
# Keep the original State where it is present and use the geocoded
# State_cleaned only where State is missing. fillna does this for the whole
# column at once instead of calling a Python lambda on every row.
combined_data_cp['State_c'] = combined_data_cp['State'].fillna(combined_data_cp['State_cleaned'])

In [98]:
# Save the selected columns (plus the index) to covid.csv.
combined_data_cp.to_csv(
    "covid.csv",
    sep=',',
    columns=['Country', 'State_c', 'Lat', 'Long', 'Date', 'Confirmed', 'Death', 'Recovered'],
)

The following countries are the ones for which it is difficult to retrieve the state. This is why we have put the country's name instead; later we are going to use the name of its capital instead.

In [118]:
# Count the rows per country where the state is just the country's own name.
combined_data_cp.loc[
    combined_data_cp['Country'] == combined_data_cp['State_c'], 'Country'
].value_counts()

MS Zaandam            253
Thailand              253
Djibouti              253
Lithuania             253
Diamond Princess      253
Bulgaria              253
Iran                  253
Philippines           253
Western Sahara        253
Iraq                  253
Saudi Arabia          253
Maldives              253
Pakistan              253
West Bank and Gaza    253
Burma                 253
Name: Country, dtype: int64

In [104]:
import pandas as pd

# Reload the saved dataset; its first column is the index written by to_csv.
combined_data = pd.read_csv("covid.csv", sep=',', index_col=0)

# Country -> capital table, with column names capitalised like the dataset's.
capital_data = pd.read_csv(
    'https://raw.githubusercontent.com/icyrockcom/country-capitals/master/data/country-list.csv'
).rename(columns={'capital': 'Capital', 'country': 'Country'})

# Right join: keep every dataset row, adding the capital where the country matches.
combined_data_cptl_merged = capital_data[['Country', 'Capital']].merge(
    combined_data, on='Country', how='right'
)

In [114]:
# Rows whose state is only the country's name get the capital instead. When the
# merge found no capital for that country, keep the country name rather than
# writing a missing value.
_state_is_country = combined_data_cptl_merged['State_c'] == combined_data_cptl_merged['Country']
_capital_or_country = combined_data_cptl_merged['Capital'].fillna(combined_data_cptl_merged['State_c'])
combined_data_cptl_merged['State_cleaned'] = combined_data_cptl_merged['State_c'].mask(
    _state_is_country, _capital_or_country
)

# NOTE(review): this overwrites covid.csv with a file that has State_cleaned but
# no State_c column; code that reads covid.csv and expects State_c will not find
# it after this cell has run.
combined_data_cptl_merged[['Country','State_cleaned', 'Lat', 'Long', 'Date', 'Confirmed',
       'Death', 'Recovered']].to_csv("covid.csv",sep=',')

In [120]:
# Preview the first five rows of the merged result.
combined_data_cptl_merged.head(n=5)

Unnamed: 0,Country,Capital,State_c,Lat,Long,Date,Confirmed,Death,Recovered,State_cleaned
0,Afghanistan,Kabul,Nawur,33.93911,67.709953,2020-01-22,0.0,0.0,0.0,Nawur
1,Afghanistan,Kabul,Nawur,33.93911,67.709953,2020-01-23,0.0,0.0,0.0,Nawur
2,Afghanistan,Kabul,Nawur,33.93911,67.709953,2020-01-24,0.0,0.0,0.0,Nawur
3,Afghanistan,Kabul,Nawur,33.93911,67.709953,2020-01-25,0.0,0.0,0.0,Nawur
4,Afghanistan,Kabul,Nawur,33.93911,67.709953,2020-01-26,0.0,0.0,0.0,Nawur
