# ANALYSIS OF NIGERIA'S COVID-19 DATA

## Data Collection

### A - NCDC website scraping
Get relevant COVID-19 data from the NCDC website - https://covid19.ncdc.gov.ng

In [None]:
# Import libraries
import os
import glob
import requests
import numpy as np
import pandas as pd
import datetime as dt
import csv
from bs4 import BeautifulSoup
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')  
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Request the NCDC page containing the required info.
# The timeout keeps the notebook from hanging indefinitely if the site does not respond.
page = requests.get('https://covid19.ncdc.gov.ng', timeout=30)

# Stop on an HTTP error status rather than parsing an error page as if it were the dashboard.
page.raise_for_status()

# Parse `page` content with `BeautifulSoup`
soup = BeautifulSoup(page.content, 'html.parser')

#### Extract summary info about samples tested and COVID-19 cases.
* Save the info as a pandas Series 

In [None]:
# The summary figures were located by inspecting the page source: the headings are
# the `h6.text-white` elements and the values are the `span`s directly inside `h2`.
headings = soup.select('h6.text-white')
figures = soup.select('h2 > span')

# Map each heading's text to its figure, with the thousands separators removed.
covid_summary = {
    heading.text: figure.text.replace(',', '')
    for heading, figure in zip(headings, figures)
}

# Build a named Series and convert its string values to a numeric dtype.
covid_summary_S = pd.to_numeric(pd.Series(covid_summary, name='COVID'))
covid_summary_S

#### Extract data from the `Confirmed Cases by States` table on the NCDC website

In [None]:
# Column names of the table: the text of every `th` element in the parsed page
header_list = [header_cell.text for header_cell in soup.select('th')]

header_list

In [None]:
# Extract table data
table_data = soup.select('tr')[1:]  # skip the first row, which holds the column names
data_list = []

for row in table_data:
    cells = row.select('td')
    # First five cells, in order: state, lab-confirmed cases, cases on admission,
    # discharged, deaths. Surrounding whitespace is stripped from each.
    data_list.append([cells[position].text.strip() for position in range(5)])

# Save the data in `data_list` to a DataFrame.
covid_data_states = pd.DataFrame(data=data_list, columns=header_list)

In [None]:
# Switch to the project folder so that relative file names resolve against it.
# NOTE(review): this absolute path is specific to one machine; adjust it before running elsewhere.
project_dir = "/Users/osala/Downloads/Nigeria-COVID-19-Data-Analysis-Using-Python-main"
os.chdir(project_dir)

# Export the scraped per-state table as csv, without the DataFrame index.
covid_data_states.to_csv("COVID-19 data by state.csv", index=False)

### B - Johns Hopkins Data Repository
* Global Daily Confirmed Cases - Click [Here](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv)
* Global Daily Recovered Cases - Click [Here](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv)
* Global Daily Death Cases - Click [Here](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv)

#### Save COVID-19 data from the GitHub repo links above to a DataFrame and also export as csv.

In [None]:
# All three global time series live in the same folder of the CSSE repository.
jhu_base_url = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
                'csse_covid_19_data/csse_covid_19_time_series/')

confirmed_url = jhu_base_url + 'time_series_covid19_confirmed_global.csv'
recovered_url = jhu_base_url + 'time_series_covid19_recovered_global.csv'
deaths_url = jhu_base_url + 'time_series_covid19_deaths_global.csv'

# Download each series into a DataFrame ...
confirmed_dly_gbl = pd.read_csv(confirmed_url)
recovered_dly_gbl = pd.read_csv(recovered_url)
deaths_dly_gbl = pd.read_csv(deaths_url)

# ... and keep a local copy of each.
# NOTE: the local file names carry no `.csv` extension, although the content is csv.
confirmed_dly_gbl.to_csv("time_series_covid19_confirmed_global", index=False)
recovered_dly_gbl.to_csv("time_series_covid19_recovered_global", index=False)
deaths_dly_gbl.to_csv("time_series_covid19_deaths_global", index=False)

### C - External Data 
1. `covid_external.csv`: This contains data about Nigeria's Community Vulnerability Index. [Data dictionary](https://docs.google.com/document/d/1BqQvTH-dV1xD5QWoMHlcY0PQC39RZ7-0hjqccozaG9k/edit). [Resource link](https://covid-static-assets.s3.amazonaws.com/Africa+CCVI+methodology.pdf).
2. `Budget data.csv`: Info about initial and revised budgets per state
3. `RealGDP.csv`: Nigeria's GDP data from 2014 to 2020

In [None]:
# Load the three external csv files from the current working directory, in this order:
# vulnerability index data, state budgets, and real GDP.
covid_ext, budget, gdp = [
    pd.read_csv(file_name)
    for file_name in ('covid_external.csv', 'Budget data.csv', 'RealGDP.csv')
]

## View the data
Obtain basic information about all the collected data using the `head()` and `info()` methods.

In [None]:
# Map each display name to its DataFrame so the frames can be used directly,
# without `eval` on name strings.
frames = {
    'covid_data_states': covid_data_states,
    'confirmed_dly_gbl': confirmed_dly_gbl,
    'recovered_dly_gbl': recovered_dly_gbl,
    'deaths_dly_gbl': deaths_dly_gbl,
    'covid_ext': covid_ext,
    'budget': budget,
    'gdp': gdp,
}
data = list(frames)  # the names only, kept under the original variable name

for dataf, frame in frames.items():
    print(dataf.upper())
    print(frame.head(), '\n')
    # `info()` prints its report itself and returns None, so it is called
    # directly; wrapping it in `print` would add a stray "None" line.
    frame.info()
    print('\n\n\n')

## Data Cleaning and Preparation 
* Rename the columns of the scraped data.
* Remove commas (,) in the numerical data.
* Convert to the appropriate data type.
* Extract data for Nigeria from the global daily cases data.

#### A - Clean the scraped data

In [None]:
# Shorter names for the scraped count columns in `covid_data_states`
column_renames = {
    'No. of Cases (Lab Confirmed)': 'Lab Confirmed Cases',
    'No. of Cases (on admission)': 'Cases on Admission',
    'No. Discharged': 'Discharged',
    'No. of Deaths': 'Deaths',
}
covid_data_states.rename(columns=column_renames, inplace=True)

# Strip the thousands separators from every cell (regex replace across the whole frame)
covid_data_states.replace({',': ''}, regex=True, inplace=True)

# Convert every column after the first one to a numeric dtype
for count_column in covid_data_states.columns[1:]:
    covid_data_states[count_column] = pd.to_numeric(covid_data_states[count_column])

covid_data_states.info()
covid_data_states.head(10)

* There are no missing data

#### B - Get a Pandas DataFrame for Daily Confirmed Cases in Nigeria. Column names are `Date` and `No of Cases`

In [None]:
# Create a filter to select only cases in Nigeria
bool_c = confirmed_dly_gbl['Country/Region'] == 'Nigeria'
confirmed_dly_nig = confirmed_dly_gbl.loc[bool_c]   # Filter applied

# The reshaping below renames one transposed column by its original row label,
# so exactly one matching row is required.
if len(confirmed_dly_nig) != 1:
    raise ValueError(f"Expected exactly one row for Nigeria, found {len(confirmed_dly_nig)}")

# Row label of the Nigeria record; after transposing it becomes the column name.
# Taking the first label avoids converting the whole index array to an int.
cases_c = int(confirmed_dly_nig.index[0])

# Transpose the data, drop the first four rows of the transposed frame (the non-date
# columns of the source table), reset the index, then rename the columns.
confirmed_dly_nig = confirmed_dly_nig.transpose().iloc[4:, :].reset_index().rename(
                    columns={'index': 'Date', cases_c: 'No of Cases'})

# Convert the columns to the appropriate datetime and numeric datatypes
confirmed_dly_nig['Date'] = pd.to_datetime(confirmed_dly_nig['Date'])
confirmed_dly_nig['No of Cases'] = pd.to_numeric(confirmed_dly_nig['No of Cases'])

confirmed_dly_nig.info()
confirmed_dly_nig

#### C - Get a Pandas DataFrame for Daily Recovered Cases in Nigeria. Columns are `Date` and `No of Cases`

In [None]:
# Select Nigeria's row from the global recovered-cases table
bool_r = recovered_dly_gbl['Country/Region'] == 'Nigeria'
recovered_dly_nig = recovered_dly_gbl.loc[bool_r]

# The reshaping below renames one transposed column by its original row label,
# so exactly one matching row is required.
if len(recovered_dly_nig) != 1:
    raise ValueError(f"Expected exactly one row for Nigeria, found {len(recovered_dly_nig)}")

# Row label of the Nigeria record; after transposing it becomes the column name.
# Taking the first label avoids converting the whole index array to an int.
cases_r = int(recovered_dly_nig.index[0])

# Transpose, drop the first four rows of the transposed frame (the non-date columns
# of the source table), reset the index, then rename the columns.
recovered_dly_nig = recovered_dly_nig.transpose().iloc[4:, :].reset_index().rename(
                    columns={'index': 'Date', cases_r: 'No of Cases'})

# Convert the columns to datetime and numeric datatypes
recovered_dly_nig['Date'] = pd.to_datetime(recovered_dly_nig['Date'])
recovered_dly_nig['No of Cases'] = pd.to_numeric(recovered_dly_nig['No of Cases'])
recovered_dly_nig

#### D - Get a Pandas DataFrame for Daily Death Cases in Nigeria. Columns are `Date` and `No of Cases`

In [None]:
# Select Nigeria's row from the global deaths table
bool_d = deaths_dly_gbl['Country/Region'] == 'Nigeria'
deaths_dly_nig = deaths_dly_gbl.loc[bool_d]

# The reshaping below renames one transposed column by its original row label,
# so exactly one matching row is required.
if len(deaths_dly_nig) != 1:
    raise ValueError(f"Expected exactly one row for Nigeria, found {len(deaths_dly_nig)}")

# Row label of the Nigeria record; after transposing it becomes the column name.
# Taking the first label avoids converting the whole index array to an int.
cases_d = int(deaths_dly_nig.index[0])

# Transpose, drop the first four rows of the transposed frame (the non-date columns
# of the source table), reset the index, then rename the columns.
deaths_dly_nig = deaths_dly_nig.transpose().iloc[4:, :].reset_index().rename(
                    columns={'index': 'Date', cases_d: 'No of Cases'})

# Convert the columns to datetime and numeric datatypes
deaths_dly_nig['Date'] = pd.to_datetime(deaths_dly_nig['Date'])
deaths_dly_nig['No of Cases'] = pd.to_numeric(deaths_dly_nig['No of Cases'])

deaths_dly_nig

## Analysis
Here I will perform analysis on the data sets.

#### Summary of Nigeria's COVID-19 statistics

In [None]:
# Apply seaborn's dark-grid theme to this and later figures.
dark_grid_style = sns.set_theme(style="darkgrid")

# Bar chart of the scraped summary figures; the y-axis is logarithmic.
summary_plot_options = dict(
    kind='bar',
    figsize=(10,4),
    colormap='Pastel1',
    fontsize=12,
    logy=True,
    ylabel='Log Values',
    rot=0.5,
)
covid_summary_S.plot(**summary_plot_options)
plt.title('SAMPLES TESTED FOR COVID-19 AND OUTCOMES', fontsize=16)

* A small proportion of samples tested returned positive for COVID-19.
* Most of the confirmed cases were discharged with a tiny percentage resulting in death.

#### A - Generate a plot showing the top 10 states in terms of lab-confirmed COVID-19 cases

In [None]:
# Ten states with the most lab-confirmed cases, largest first
top_confirmed_states = covid_data_states.nlargest(10, 'Lab Confirmed Cases')

sns.catplot(data=top_confirmed_states, x="States Affected", y="Lab Confirmed Cases",
            kind='bar', palette='pastel', height=4, aspect=4)
plt.title('TOP 10 STATES WITH CONFIRMED CASES', fontsize=15)

* Lagos has, by a large margin (more than twice), the highest number of confirmed cases; then FCT and Kaduna.

#### B - Generate a plot that shows the top 10 states in terms of discharged COVID cases.

In [None]:
# Ten states with the most discharged cases, largest first
top_discharged_states = covid_data_states.nlargest(10, 'Discharged')

sns.catplot(data=top_discharged_states, x="States Affected", y="Discharged",
            kind='bar', palette='pastel', height=3, aspect=4)
plt.title('TOP 10 STATES WITH DISCHARGED CASES', fontsize=15)

* The same pattern as above is also seen here with Lagos having the highest number of discharged cases

#### C - Plot the top 10 states with the highest death cases

In [None]:
# Ten states with the most deaths, largest first
top_death_states = covid_data_states.nlargest(10, 'Deaths')

sns.catplot(data=top_death_states, x="States Affected", y="Deaths",
            kind='bar', palette='pastel', height=3, aspect=4)
plt.title('TOP 10 STATES BY DEATHS', fontsize=15)

* There's a change here. Lagos has the highest number of deaths from COVID-19, followed by Edo then FCT.

#### D - Generate a line plot for the total daily confirmed, recovered and death cases in Nigeria

In [None]:
# Options shared by the three cumulative line plots
cumulative_line_options = dict(kind="line", x="Date", y="No of Cases",
                               aspect=4, height=3, palette='pastel')

# Confirmed cases (default line colour)
sns.relplot(data=confirmed_dly_nig, **cumulative_line_options)
plt.title('CUMULATIVE CONFIRMED CASES', fontsize=15)

# Recovered cases
sns.relplot(data=recovered_dly_nig, color='green', **cumulative_line_options)
plt.title('CUMULATIVE RECOVERED CASES', fontsize=15)

# Deaths
sns.relplot(data=deaths_dly_nig, color='red', **cumulative_line_options)
plt.title('CUMULATIVE DEATHS', fontsize=15)

* All three line plots show a two-phased logarithmic growth pattern.
* The first increase in cases and deaths occurred around 5-6 months after the onset of the pandemic while the second increase occurred early this year.

#### E - Determine the daily infection rate.
* Generate a line plot for the above.
* Do the same for the daily recovered and death rates

In [None]:
# New cases per day = difference between consecutive cumulative counts.
# The first row has no predecessor, so its value is NaN.
cumulative_confirmed = confirmed_dly_nig['No of Cases']
confirmed_dly_nig['Daily Cases'] = cumulative_confirmed.diff()
confirmed_dly_nig

In [None]:
# Line plot of new confirmed cases per day
daily_confirmed_options = dict(kind="line", x="Date", y="Daily Cases",
                               aspect=4, height=3, palette='pastel')
sns.relplot(data=confirmed_dly_nig, **daily_confirmed_options)
plt.title('DAILY CONFIRMED CASES', fontsize=15)

* There was an increase in daily confirmed cases around the middle of 2020 and a higher (up to 4-fold) increase earlier this year.

In [None]:
# Recoveries per day = difference between consecutive cumulative counts
cumulative_recovered = recovered_dly_nig['No of Cases']
recovered_dly_nig['Daily Cases'] = cumulative_recovered.diff()

# Line plot of recoveries per day
daily_recovered_options = dict(kind="line", color='green', x="Date", y="Daily Cases",
                               aspect=4, height=3, palette='pastel')
sns.relplot(data=recovered_dly_nig, **daily_recovered_options)
plt.title('DAILY RECOVERIES', fontsize=15)

* Sharp spikes in the no of daily recoveries are seen at several intervals in the time series.
* The sharpest spike is seen around the 8th month of 2020.
* A relatively sustained increase in recovery cases is seen early this year.

In [None]:
# Deaths per day = difference between consecutive cumulative counts
cumulative_deaths = deaths_dly_nig['No of Cases']
deaths_dly_nig['Daily Cases'] = cumulative_deaths.diff()

# Line plot of deaths per day
daily_deaths_options = dict(kind="line", color='red', x="Date", y="Daily Cases",
                            aspect=4, height=3, palette='pastel')
sns.relplot(data=deaths_dly_nig, **daily_deaths_options)
plt.title('DAILY DEATH RATES', fontsize=15)

* The daily death rates also follow the general pattern of a two-phased increase in the number of confirmed cases and recoveries with a more jagged outline.
* The jagged outlines suggest that reports may not have arrived daily as cases might have accumulated over weekends and then reported on the first day of the week.

#### F - Calculate maximum infection rate in a day (number of new cases); the maximum no of recoveries in a day, and the maximum daily death rate
* Find the corresponding dates

In [None]:
# Date and size of the largest single-day rise in confirmed cases.
# `idxmax()` returns an index *label*, so the row is looked up with the label-based
# `.loc` and the columns are selected by name rather than by position.
peak_confirmed_label = confirmed_dly_nig['Daily Cases'].idxmax()
confirmed_dly_nig.loc[peak_confirmed_label, ['Date', 'Daily Cases']]

**The maximum daily infection rate is 2,464 on the 23rd of January, 2021**

In [None]:
# Date and size of the largest single-day rise in recoveries.
# `idxmax()` returns an index *label*, so the row is looked up with the label-based
# `.loc` and the columns are selected by name rather than by position.
peak_recovered_label = recovered_dly_nig['Daily Cases'].idxmax()
recovered_dly_nig.loc[peak_recovered_label, ['Date', 'Daily Cases']]

**The maximum no of daily recoveries is 11,188 on the 4th of August, 2020**

In [None]:
# Date and size of the largest single-day rise in deaths.
# `idxmax()` returns an index *label*, so the row is looked up with the label-based
# `.loc` and the columns are selected by name rather than by position.
peak_deaths_label = deaths_dly_nig['Daily Cases'].idxmax()
deaths_dly_nig.loc[peak_deaths_label, ['Date', 'Daily Cases']]

**The maximum daily death rate, 31, occurred on the 16th of June, 2020**

#### G - Determine any relationship between the `covid_ext` and the NCDC COVID-19 dataset. 
* Combine both data sets on a common column `States`.
* Generate a line plot with the top 10 confirmed cases and the overall COVID-19 community vulnerability index (`Overall CCVI Index`) on the same axis.

In [None]:
# Overwrite the state names in rows 3 and 30 (first column) of `covid_ext` with the
# correct spellings.
# NOTE(review): the rows are addressed by position; confirm they still hold these two states
# if the csv changes.
misspelt_rows = [3, 30]
covid_ext.iloc[misspelt_rows, 0] = ['Nasarawa', 'Cross River']

# Give both frames the same key column name, `States`
covid_data_states.rename(columns={'States Affected': 'States'}, inplace=True)
covid_ext.rename(columns={'states': 'States'}, inplace=True)

# Outer join on `States`: rows present in only one of the frames are kept
covid_states_ext = covid_data_states.merge(covid_ext, how='outer', on='States')

covid_states_ext.info()
covid_states_ext.head(10)

In [None]:
# Rank once so that both curves use the same ten states in the same order.
top_confirmed = covid_states_ext.nlargest(10, 'Lab Confirmed Cases')

fig = plt.figure(figsize = (9,6))

# Left axis: lab-confirmed cases (blue, solid)
ax1 = fig.add_subplot(111)
# Title the axes that is actually plotted on, instead of calling `plt.title` before it exists.
ax1.set_title('RELATIONSHIP BETWEEN CONFIRMED CASES AND OVERALL CCVI INDEX', fontsize=16)
ax1.plot('States', 'Lab Confirmed Cases', 'b-', data=top_confirmed)
ax1.set_ylabel('Lab Confirmed Cases', color='b')
for tl in ax1.get_yticklabels():
    tl.set_color('b')

# Right axis, sharing the x-axis: overall CCVI index (red, dashed)
ax2 = ax1.twinx()
ax2.plot('States', 'Overall CCVI Index', 'r--', data=top_confirmed)
ax2.set_ylabel('Overall CCVI Index', color='r')
for tl in ax2.get_yticklabels():
    tl.set_color('r')

# Hide the second axis' grid lines. The original `plt.grid(b=None)` relied on the `b`
# keyword, which Matplotlib renamed to `visible` and no longer accepts.
ax2.grid(False)

* Lagos has the highest number of cases and correspondingly the lowest overall CCVI index

In [None]:
# Rank once so that both curves use the same ten states in the same order.
top_discharged = covid_states_ext.nlargest(10, 'Discharged')

fig = plt.figure(figsize = (9,6))

# Left axis: discharged cases (green, solid)
ax1 = fig.add_subplot(111)
# Title the axes that is actually plotted on, instead of calling `plt.title` before it exists.
ax1.set_title('RELATIONSHIP BETWEEN DISCHARGED CASES AND OVERALL CCVI INDEX', fontsize=16)
ax1.plot('States', 'Discharged', 'g-', data=top_discharged)
ax1.set_ylabel('Discharged Cases', color='g')
for tl in ax1.get_yticklabels():
    tl.set_color('g')

# Right axis, sharing the x-axis: overall CCVI index (red, dashed)
ax2 = ax1.twinx()
ax2.plot('States', 'Overall CCVI Index', 'r--', data=top_discharged)
ax2.set_ylabel('Overall CCVI Index', color='r')
for tl in ax2.get_yticklabels():
    tl.set_color('r')

# Hide the second axis' grid lines. The original `plt.grid(b=None)` relied on the `b`
# keyword, which Matplotlib renamed to `visible` and no longer accepts.
ax2.grid(False)

* Lagos also has the highest number of discharged cases despite its low overall CCVI index

In [None]:
# Rank once so that both curves use the same ten states in the same order.
top_deaths = covid_states_ext.nlargest(10, 'Deaths')

fig = plt.figure(figsize = (9,6))

# Left axis: deaths (black)
ax1 = fig.add_subplot(111)
# Title the axes that is actually plotted on, instead of calling `plt.title` before it exists.
ax1.set_title('RELATIONSHIP BETWEEN COVID DEATHS AND OVERALL CCVI INDEX', fontsize=16)
ax1.plot('States', 'Deaths', 'black', data=top_deaths)
ax1.set_ylabel('COVID Deaths', color='black')
for tl in ax1.get_yticklabels():
    tl.set_color('black')

# Right axis, sharing the x-axis: overall CCVI index (red, dashed)
ax2 = ax1.twinx()
ax2.plot('States', 'Overall CCVI Index', 'r--', data=top_deaths)
ax2.set_ylabel('Overall CCVI Index', color='r')
for tl in ax2.get_yticklabels():
    tl.set_color('r')

# Hide the second axis' grid lines. The original `plt.grid(b=None)` relied on the `b`
# keyword, which Matplotlib renamed to `visible` and no longer accepts.
ax2.grid(False)

* There's a clearer relationship between the overall CCVI index and deaths. States with a low overall CCVI index tend to have a higher number of COVID-19 deaths with exceptions.
* A notable exception is Ondo State.

#### H - Determine the relationship between the `covid_ext` and the NCDC COVID-19 dataset. 
* Generate a regression plot between two variables to visualize the linear relationships - `Lab Confirmed Cases` and `Population Density`.

In [None]:
# Scatter plot with a fitted regression line: population density (x) against confirmed cases (y)
sns.regplot(data=covid_states_ext, x='Population Density', y='Lab Confirmed Cases')
plt.title('POPULATION DENSITY VS CONFIRMED CASES', fontsize=14)

* Population density is not a reliable predictor of confirmed cases

In [None]:
# Scatter plot with a fitted regression line: population density (x) against discharged cases (y)
sns.regplot(data=covid_states_ext, x='Population Density', y='Discharged', color='green')
plt.title('POPULATION DENSITY VS DISCHARGED CASES', fontsize=14)

* Population density is a poor predictor of discharged cases

In [None]:
# Scatter plot with a fitted regression line: population density (x) against deaths (y)
sns.regplot(data=covid_states_ext, x='Population Density', y='Deaths', color='red')
plt.title('POPULATION DENSITY VS DEATHS', fontsize=14)

* Population density is a poor predictor of the number of deaths

#### I - Determine the effect of the pandemic on the economy by comparing the real GDP value pre-COVID-19 with real GDP in 2020 (especially Q2 2020)
Steps

* Generate a `barplot` from `gdp` using the GDP values for each year & quarters. Create subplots of each quarter on one graph.
<br>

Using `pd.melt()`, create a DataFrame for the plot mentioned above. 
* Set quarter legend to lower left.
* Using `axhline`, draw a horizontal line through the graph at the value of Q2 2020.

In [None]:
# Reshape `gdp` from wide to long: one row per (Year, Quarter) pair with its value in
# `GDP value`. The original row index is kept (`ignore_index=False`).
long_gdp = pd.melt(gdp, id_vars=['Year'], var_name='Quarter', value_name='GDP value',
                   ignore_index=False)

# Order the rows by year, then by quarter label
melted_gdp = long_gdp.sort_values(['Year', 'Quarter'])
melted_gdp.head(10)

In [None]:
# Reference level for the horizontal line: the value of Q2 2020.
# NOTE(review): hard-coded; confirm it against `RealGDP.csv` if the file is updated.
q2_2020_gdp = 15890000.00

plt.figure(figsize = (11,4))

# Grouped bars: one group per year, one bar per quarter
gdp_g = sns.barplot(data=melted_gdp, x='Year', y="GDP value", hue='Quarter', palette='pastel')

# Legend anchored at its lower-left corner (loc=3), shifted left of the axes
gdp_g.legend(loc=3, title='Quarter', bbox_to_anchor=(-0.17,0))
gdp_g.axhline(q2_2020_gdp)

title_font = {'fontsize':17,'fontweight':'bold'}
gdp_g.set_title('QUARTERLY GDP VALUES FROM 2014 TO 2020', fontdict=title_font)

* There is a slight decrease in the second quarter 2020 GDP value compared to the pre-COVID 2019 value.
* There's however no significant difference between second quarter GDP values before 2019 and Q2 values in 2020.
* All the years show an increase in GDP from Q1 to Q4 except 2020 in which there's a decrease in GDP from Q1 to Q2. This corresponds to the period of the first COVID-19 wave and national lockdown.

#### J - View the initial and revised budget for the year 2020 using a `barplot`

In [None]:
# Bar chart of the remaining `budget` columns, with the `states` column on the x-axis
budget_plot_options = dict(kind='bar', figsize=(12,5), fontsize=12, xlabel='States')
budget.plot('states', **budget_plot_options)
plt.title('INITIAL AND REVISED BUDGETS DURING THE PANDEMIC', fontsize=15)

* The state with the largest relative decrease in budget is Cross River followed by Lagos state. However, Cross River is not among the top 10 states most affected by COVID-19.

#### K - View monthly COVID-19 statistics

In [None]:
# Extract `Year` and `Month` data from the `Date` column.
confirmed_dly_nig['Year'] = confirmed_dly_nig['Date'].dt.year
confirmed_dly_nig['Month'] = confirmed_dly_nig['Date'].dt.month

# Group by year and month, then sum the numeric columns.
# `numeric_only=True` leaves the datetime `Date` column out of the aggregation; recent
# pandas versions raise an error when asked to sum it.
# Only `Daily Cases` is meaningful here: the monthly sum of `No of Cases` adds up running totals.
con_covid_mthly = confirmed_dly_nig.groupby(['Year', 'Month']).sum(numeric_only=True)

# Plot monthly data
con_covid_mthly.plot(y='Daily Cases', kind='bar', figsize=(11,5), ylabel='Cases', legend=False)
plt.title('AGGREGATE CONFIRMED CASES PER MONTH', fontsize=16)

* The highest number of monthly confirmed cases was seen in January followed by February

In [None]:
# Extract `Year` and `Month` data from the `Date` column.
recovered_dly_nig['Year'] = recovered_dly_nig['Date'].dt.year
recovered_dly_nig['Month'] = recovered_dly_nig['Date'].dt.month

# Group by year and month, then sum the numeric columns.
# `numeric_only=True` leaves the datetime `Date` column out of the aggregation; recent
# pandas versions raise an error when asked to sum it.
# Only `Daily Cases` is meaningful here: the monthly sum of `No of Cases` adds up running totals.
rec_covid_mthly = recovered_dly_nig.groupby(['Year', 'Month']).sum(numeric_only=True)

# Plot monthly data
rec_covid_mthly.plot(y='Daily Cases', kind='bar', figsize=(11,5), ylabel='Cases', legend=False, color='green')
plt.title('AGGREGATE RECOVERED CASES PER MONTH', fontsize=16)

* The same pattern is seen as for confirmed cases but the difference in cases between the two months is much smaller.

In [None]:
# Extract `Year` and `Month` data from the `Date` column.
deaths_dly_nig['Year'] = deaths_dly_nig['Date'].dt.year
deaths_dly_nig['Month'] = deaths_dly_nig['Date'].dt.month

# Group by year and month, then sum the numeric columns.
# `numeric_only=True` leaves the datetime `Date` column out of the aggregation; recent
# pandas versions raise an error when asked to sum it.
# Only `Daily Cases` is meaningful here: the monthly sum of `No of Cases` adds up running totals.
dths_covid_mthly = deaths_dly_nig.groupby(['Year', 'Month']).sum(numeric_only=True)

# Plot monthly data, titled like the confirmed and recovered monthly plots
dths_covid_mthly.plot(y='Daily Cases', kind='bar', figsize=(11,5), ylabel='Cases', legend=False, color='red')
plt.title('AGGREGATE DEATHS PER MONTH', fontsize=16)

* A two-phased spike in deaths is seen. However, the highest number of monthly deaths is seen in February, 2021 following the spike in number of confirmed and discharged cases that occurred in January.
* The month within which the second highest number of deaths was recorded is June, 2020.
* A similar profile of the number of deaths per month during both waves may be an indication of a reduced mortality rate during the second wave.