<a href="https://colab.research.google.com/github/SamiOmran/Covid-19-Analysis-and-Visualization-using-Plotly-Express/blob/master/Covid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
# Plotly (graph objects, renderer settings, Express) and pandas for the data frames
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
import pandas as pd

# Matplotlib for static plots
import matplotlib.pyplot as plt

# Plotly offline mode, initialised for notebook output (`py` is used later for py.iplot)
import plotly.offline as py
py.init_notebook_mode(connected=True)

# Make 'colab' the default Plotly renderer
# NOTE(review): presumably this notebook is meant to run in Google Colab; confirm before running elsewhere
pio.renderers.default = 'colab'

# <font color='blue-red'> **Helper functions**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def explore_numeric(df, x, figsize=(6,5) ):
  '''Plot one numeric column and print a short data-quality report for it.

  Draws a seaborn histogram above a boxplot on a shared x-axis, shows the
  figure, then prints the NaN count and percentage, the number of unique
  values, the most common value (NaN counted as a value) with its frequency,
  and whether that value covers more than 98% of the rows
  (constant / quasi-constant feature).

  Args:
    df: DataFrame holding the column.
    x: Name of the column to explore.
    figsize: Figure size forwarded to plt.subplots.

  Returns:
    (fig, axes): the matplotlib figure and its two axes
    (histogram first, boxplot second).
  '''
  # Missing-value statistics are gathered first and reported after the plot
  null_count = df[x].isna().sum()
  null_perc = null_count/len(df)* 100

  # Two stacked panels on a shared x-axis: histogram (0.7) over boxplot (0.3)
  fig, axes = plt.subplots(
      nrows=2,
      figsize=figsize,
      sharex=True,
      gridspec_kw=dict(height_ratios=[0.7, 0.3]),
  )
  hist_ax, box_ax = axes
  sns.histplot(data=df, x=x, ax=hist_ax)
  sns.boxplot(data=df, x=x, ax=box_ax)
  hist_ax.set_title(f'Column: {x}', fontweight='bold')
  fig.tight_layout()

  # Render the figure now so it appears above the printed report
  plt.show()

  # Report missing values, then cardinality
  print(f'- NaN\'s Found: {null_count} ({round(null_perc,2)}%)')
  nunique = df[x].nunique()
  print(f'- Unique Values: {nunique}')

  # value_counts sorts by frequency, so the first entry is the most common value
  counts_by_value = df[x].value_counts(dropna=False)
  most_common_val = counts_by_value.index[0]
  freq = counts_by_value.iloc[0]
  perc_most_common = freq / len(df) * 100

  print(f'- Most common value: \'{most_common_val}\' occurs {freq} times ({round(perc_most_common,2)}%)')

  # A single value covering more than 98% of rows marks the feature as (quasi-)constant
  dominated_by_one_value = perc_most_common > 98
  if dominated_by_one_value:
    print(f'\n- [!] Warning: \'{x}\' is a constant or quasi-constant feature and should be dropped.')
  else:
    print('- Not constant or quasi-constant.')
  return fig, axes

In [None]:
def check_nan_values(df, x, quasi=0.98, drop=False) -> bool | None:
  null_count = df[x].isna().sum()
  null_perc = float(null_count/len(df)* 100)

  if not drop:
    # Print null value info
    print(f'- NaN\'s Found in "{x}": {null_count} ({round(null_perc,2)}%)')
  else:
    return null_perc > quasi

# <font color='blue-red'> **Import Data**


In [None]:
import pandas as pd

In [None]:
# Load covid.csv (path is relative to the notebook's working directory)
# and preview the first rows; the preview shows one row per country/region.
covid = pd.read_csv('covid.csv')
covid.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,iso_alpha
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas,USA
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas,BRA
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia,IND
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe,RUS
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa,ZAF


In [None]:
# Column dtypes and non-null counts, used to spot columns that are mostly missing
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country/Region    209 non-null    object 
 1   Continent         208 non-null    object 
 2   Population        208 non-null    float64
 3   TotalCases        209 non-null    int64  
 4   NewCases          4 non-null      float64
 5   TotalDeaths       188 non-null    float64
 6   NewDeaths         3 non-null      float64
 7   TotalRecovered    205 non-null    float64
 8   NewRecovered      3 non-null      float64
 9   ActiveCases       205 non-null    float64
 10  Serious,Critical  122 non-null    float64
 11  Tot Cases/1M pop  208 non-null    float64
 12  Deaths/1M pop     187 non-null    float64
 13  TotalTests        191 non-null    float64
 14  Tests/1M pop      191 non-null    float64
 15  WHO Region        184 non-null    object 
 16  iso_alpha         209 non-null    object 
dt

In [None]:
# Load covid_grouped.csv (path is relative to the notebook's working directory).
# The bare name displays the frame; pandas truncates the middle rows of a long frame.
# NOTE(review): the preview suggests one row per date and country; confirm against the data source.
covid_grouped = pd.read_csv('covid_grouped.csv')
covid_grouped

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,WHO Region,iso_alpha
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0,Eastern Mediterranean,AFG
1,2020-01-22,Albania,0,0,0,0,0,0,0,Europe,ALB
2,2020-01-22,Algeria,0,0,0,0,0,0,0,Africa,DZA
3,2020-01-22,Andorra,0,0,0,0,0,0,0,Europe,AND
4,2020-01-22,Angola,0,0,0,0,0,0,0,Africa,AGO
...,...,...,...,...,...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791,152,2,0,Eastern Mediterranean,
35152,2020-07-27,Western Sahara,10,1,8,1,0,0,0,Africa,ESH
35153,2020-07-27,Yemen,1691,483,833,375,10,4,36,Eastern Mediterranean,YEM
35154,2020-07-27,Zambia,4552,140,2815,1597,71,1,465,Africa,ZMB


In [None]:
# Column dtypes and non-null counts for the grouped table
covid_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35156 entries, 0 to 35155
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            35156 non-null  object
 1   Country/Region  35156 non-null  object
 2   Confirmed       35156 non-null  int64 
 3   Deaths          35156 non-null  int64 
 4   Recovered       35156 non-null  int64 
 5   Active          35156 non-null  int64 
 6   New cases       35156 non-null  int64 
 7   New deaths      35156 non-null  int64 
 8   New recovered   35156 non-null  int64 
 9   WHO Region      35156 non-null  object
 10  iso_alpha       35156 non-null  object
dtypes: int64(7), object(4)
memory usage: 3.0+ MB


# <font color='blue-red'> **Clean Data**

In [None]:
# Count rows that exactly repeat an earlier row (0 means no duplicates)
covid.duplicated().sum()

np.int64(0)

In [None]:
# Map every numeric column of covid to the boolean returned by
# check_nan_values(..., drop=True), i.e. whether its NaN share exceeds the
# function's default threshold.
numeric_cols = covid.select_dtypes(include='number').columns
cols_drop = {
    name: check_nan_values(covid, name, drop=True)
    for name in numeric_cols
}

cols_drop

{'Population': False,
 'TotalCases': False,
 'NewCases': True,
 'TotalDeaths': True,
 'NewDeaths': True,
 'TotalRecovered': True,
 'NewRecovered': True,
 'ActiveCases': True,
 'Serious,Critical': True,
 'Tot Cases/1M pop': False,
 'Deaths/1M pop': True,
 'TotalTests': True,
 'Tests/1M pop': True}

For the `covid` dataframe, we will drop 3 columns:


1. NewCases
2. NewDeaths
3. NewRecovered

because they are quasi-constant: almost all of their values are NaN (each has at most 4 non-null entries out of 209 rows).

In [None]:
# Drop the three columns that are almost entirely NaN (3-4 non-null values out of 209 rows).
# Reassign instead of using inplace=True, and pass errors='ignore', so that re-running
# this cell after the columns are already gone does not raise a KeyError.
covid = covid.drop(columns=['NewCases', 'NewDeaths', 'NewRecovered'], errors='ignore')

# <font color='blue-red'> **Visualization**

In [None]:
# Import create_table Figure Factory
from plotly.figure_factory import create_table

In [56]:
# Three-stop colorscale handed to create_table
colorscale = [[0, '#4d004c'], [.5, '#f2e5ff'], [1, '#ffffff']]
# Build a Plotly table figure from the first 15 rows of covid and display it inline
table = create_table(covid.head(15), colorscale=colorscale)
py.iplot(table)

In [57]:
# Bar chart of TotalCases for the first 15 rows of covid, one bar per Country/Region,
# coloured by TotalCases; hovering also shows the country and its continent.
# The figure is the cell's last expression, so the notebook displays it.
px.bar(covid.head(15), x = 'Country/Region', y = 'TotalCases',
       color = 'TotalCases', height = 500,
       hover_data = ['Country/Region', 'Continent'],
       )