In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import pathlib
import pandas as pd

In [36]:
# Step 2: Data Preparation & Cleaning

# Read the variant-survival CSV; the path is resolved relative to the kernel's
# current working directory.
# NOTE(review): no parse_dates is passed, so first_seq / last_seq / censure_date
# are presumably kept as plain strings — confirm that is intended.
df = pd.read_csv(filepath_or_buffer="surv_variants.csv")

# A bare trailing expression makes the frame the cell's rendered output
df

Unnamed: 0,Country,first_seq,num_seqs,last_seq,variant,censure_date,duration,censored,mortality_rate,total_cases,total_deaths,growth_rate
0,China,2019-10-22,3,2020-06-03,S.Q677,2020-06-21,225,True,0.052983,18259.460123,967.435583,16447.430706
1,USA,2020-03-03,26022,2021-11-19,S.Q677,2020-11-01,626,False,0.016111,256577.596234,4133.841031,84748.745876
2,Brazil,2020-03-09,1553,2021-11-12,S.Q677,2020-11-07,613,False,0.027846,101550.140466,2827.764288,42356.907426
3,Australia,2020-03-20,88,2021-11-14,S.Q677,2020-11-18,604,False,0.009905,1402.239579,13.889752,1227.122500
4,Sweden,2020-03-20,810,2021-11-19,S.Q677,2020-11-18,609,False,0.012711,1659.148728,21.089425,2521.042925
...,...,...,...,...,...,...,...,...,...,...,...,...
4108,Kazakhstan,2021-08-17,88,2021-08-19,21J.Delta,2022-04-17,2,True,0.014838,52244.958678,775.206612,7722.500000
4109,Cyprus,2021-08-28,1,2021-08-28,21J.Delta,2022-04-28,0,True,0.004407,487.565957,2.148936,
4110,Guyana,2021-09-06,18,2021-09-08,21J.Delta,2022-05-07,2,True,0.024429,3855.168000,94.176000,131.000000
4111,Comoros,2021-10-05,11,2021-10-26,21J.Delta,2022-06-05,21,True,0.034727,1012.239130,35.152174,3.152456


In [37]:
# Report the frame's dimensions as a (row count, column count) tuple
print("Number of rows and columns:", (len(df.index), len(df.columns)))

Number of rows and columns: (4113, 12)


In [38]:
# Preview the leading five rows as plain text (print gives the text repr,
# not the rich HTML table)
print(df.head(n=5))

     Country   first_seq  num_seqs    last_seq variant censure_date  duration  \
0      China  2019-10-22         3  2020-06-03  S.Q677   2020-06-21       225   
1        USA  2020-03-03     26022  2021-11-19  S.Q677   2020-11-01       626   
2     Brazil  2020-03-09      1553  2021-11-12  S.Q677   2020-11-07       613   
3  Australia  2020-03-20        88  2021-11-14  S.Q677   2020-11-18       604   
4     Sweden  2020-03-20       810  2021-11-19  S.Q677   2020-11-18       609   

   censored  mortality_rate    total_cases  total_deaths   growth_rate  
0      True        0.052983   18259.460123    967.435583  16447.430706  
1     False        0.016111  256577.596234   4133.841031  84748.745876  
2     False        0.027846  101550.140466   2827.764288  42356.907426  
3     False        0.009905    1402.239579     13.889752   1227.122500  
4     False        0.012711    1659.148728     21.089425   2521.042925  


In [39]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the quartiles are spelled out but match describe()'s defaults
print(df.describe(percentiles=[0.25, 0.5, 0.75]))


           num_seqs     duration  mortality_rate   total_cases   total_deaths  \
count  4.113000e+03  4113.000000     4113.000000  4.113000e+03    4113.000000   
mean   4.216428e+03   182.552638        0.019360  1.198316e+05    2118.266613   
std    3.971929e+04   161.950948        0.014504  1.027564e+06   15801.688542   
min    1.000000e+00     0.000000        0.000000  2.015926e-04       0.000000   
25%    4.000000e+00    39.000000        0.010835  1.349603e+02       1.948052   
50%    3.000000e+01   155.000000        0.016106  1.280419e+03      20.188889   
75%    2.700000e+02   280.000000        0.024168  1.435006e+04     248.201653   
max    1.289311e+06   669.000000        0.145330  3.597978e+07  513790.255546   

         growth_rate  
count    3585.000000  
mean    11682.192504  
std     41524.513816  
min         0.000000  
25%       154.923372  
50%       882.398620  
75%      3827.861639  
max    600412.400000  


In [40]:
# Count missing values per column before any rows are removed
print("Missing values in each column:\n", df.isnull().sum())

# Drop every row that has a missing value in any column.
# `df` is rebound to the filtered copy instead of using `inplace=True`, so the
# cleaning step is an explicit assignment rather than a hidden in-place
# mutation of the frame the earlier cells displayed.
# NOTE(review): per the counts printed by this cell only `growth_rate` has
# gaps, yet whole rows are removed for every later chart, including those that
# never use `growth_rate` — confirm this is intended.
df = df.dropna()

Missing values in each column:
 Country             0
first_seq           0
num_seqs            0
last_seq            0
variant             0
censure_date        0
duration            0
censored            0
mortality_rate      0
total_cases         0
total_deaths        0
growth_rate       528
dtype: int64


In [41]:
# Show the DataFrame's column labels (rendered as a pandas Index)
df.columns

Index(['Country', 'first_seq', 'num_seqs', 'last_seq', 'variant',
       'censure_date', 'duration', 'censored', 'mortality_rate', 'total_cases',
       'total_deaths', 'growth_rate'],
      dtype='object')

In [42]:
# Same column labels, materialised as a plain Python list for easy copying
list(df.columns)

['Country',
 'first_seq',
 'num_seqs',
 'last_seq',
 'variant',
 'censure_date',
 'duration',
 'censored',
 'mortality_rate',
 'total_cases',
 'total_deaths',
 'growth_rate']

In [43]:
# 1. Bar Chart: Variant Counts
# Tally the rows per variant and flatten the result into a two-column frame
# with explicit 'variant' / 'count' headers.
variant_counts = (
    df['variant']
    .value_counts()
    .rename_axis('variant')
    .reset_index(name='count')
)

fig = px.bar(
    data_frame=variant_counts,
    x='variant',
    y='count',
    labels={'variant': 'Variant', 'count': 'Count'},
    title='Variant Counts',
)
fig.show()

This bar chart shows how many rows of the cleaned dataset belong to each COVID-19 variant. Each bar represents one variant, and its height is the number of records for that variant that remain after rows with missing values were dropped. This makes it easy to see at a glance which variants are most and least common in the data.

In [44]:
# 2. Pie Chart: Variant Distribution
# No `values` column is given, so each slice is sized by how many rows carry
# that variant.
fig = px.pie(
    data_frame=df,
    names='variant',
    title='Variant Distribution',
)
fig.show()

In [45]:
# 3. Line Chart: Total Cases Over Time (by censure_date)
# Order the rows by censure date so each line is drawn left to right.
# `df_sorted` is reused by later cells.
df_sorted = df.sort_values(by='censure_date')

fig = px.line(
    data_frame=df_sorted,
    x='censure_date',
    y='total_cases',
    color='variant',  # one line per variant
    title='Total Cases Over Time by Variant',
)
fig.show()

In [46]:
# 4. Scatter Plot: Total Cases vs. Total Deaths
# One marker per row, coloured by the row's variant.
fig = px.scatter(
    data_frame=df,
    x='total_cases',
    y='total_deaths',
    color='variant',
    title='Total Cases vs. Total Deaths by Variant',
)
fig.show()

In [47]:
# 5. Histogram: Total Cases Distribution
# `nbins` is an upper bound on the number of bins Plotly will use.
fig = px.histogram(
    data_frame=df,
    x='total_cases',
    nbins=30,
    title='Total Cases Distribution',
)
fig.show()

In [48]:
# 6. Box Plot: Mortality Rate by Variant
# One box per variant summarising the spread of mortality_rate across its rows.
fig = px.box(
    data_frame=df,
    x='variant',
    y='mortality_rate',
    title='Mortality Rate by Variant',
)
fig.show()

In [49]:
# 7. Area Chart: Cumulative Total Cases Over Time
# Running total of total_cases in the row order of `df_sorted` (sorted by
# censure_date in an earlier cell); stored as a new column on `df_sorted`.
df_sorted = df_sorted.assign(
    cumulative_cases=df_sorted['total_cases'].cumsum()
)

fig = px.area(
    data_frame=df_sorted,
    x='censure_date',
    y='cumulative_cases',
    title='Cumulative Total Cases Over Time',
)
fig.show()

In [50]:
# 8. Treemap: Total Cases by Country and Variant
# Hierarchy is Country -> variant; tile area is driven by total_cases.
fig = px.treemap(
    data_frame=df,
    path=['Country', 'variant'],
    values='total_cases',
    title='Total Cases by Country and Variant',
)
fig.show()

In [51]:
# 9. Sunburst Chart: Total Deaths by Country and Variant
# Inner ring is Country, outer ring is variant; sector size is total_deaths.
fig = px.sunburst(
    data_frame=df,
    path=['Country', 'variant'],
    values='total_deaths',
    title='Total Deaths by Country and Variant',
)
fig.show()

In [52]:
# 10. Animated Line Chart: Total Cases Over Time by Variant
# Each animation frame shows the rows of one variant (`animation_frame`).
# The axes are not re-fitted per frame, so both ranges are pinned to the extent
# of the whole dataset; without this, variants whose dates or case counts fall
# outside the first frame's extent can be drawn partly or fully out of view.
# NOTE(review): `color` and `animation_frame` both map to 'variant', so the
# colour category changes between frames; Plotly's docs warn that animations
# may be misleading in that situation — consider dropping `color`.
fig = px.line(
    df_sorted,
    x='censure_date',
    y='total_cases',
    color='variant',
    title='Total Cases Over Time by Variant',
    animation_frame='variant',
    # ISO-formatted date values order chronologically, so min/max bound the axis
    range_x=[df_sorted['censure_date'].min(), df_sorted['censure_date'].max()],
    # 5% headroom so the tallest point is not clipped at the top edge
    range_y=[0, df_sorted['total_cases'].max() * 1.05],
)
fig.show()

In [53]:
# 11. Bar Chart: Total Deaths by Country
# Sum total_deaths over all rows of each country, then turn the grouped
# result back into a flat two-column frame for plotting.
country_deaths = (
    df.groupby(by='Country')['total_deaths']
    .agg('sum')
    .reset_index()
)

fig = px.bar(
    data_frame=country_deaths,
    x='Country',
    y='total_deaths',
    title='Total Deaths by Country',
)
fig.show()

In [54]:
# 12. Box Plot: Growth Rate by Variant
# One box per variant summarising the spread of growth_rate across its rows.
fig = px.box(
    data_frame=df,
    x='variant',
    y='growth_rate',
    title='Growth Rate by Variant',
)
fig.show()

In [55]:
# 13. Scatter Plot: Duration vs. Total Cases
# One marker per row, coloured by the row's variant.
fig = px.scatter(
    data_frame=df,
    x='duration',
    y='total_cases',
    color='variant',
    title='Duration vs. Total Cases by Variant',
)
fig.show()

In [56]:
# 14. Histogram: Mortality Rate Distribution
# `nbins` is an upper bound on the number of bins Plotly will use.
fig = px.histogram(
    data_frame=df,
    x='mortality_rate',
    nbins=30,
    title='Mortality Rate Distribution',
)
fig.show()

In [57]:
# Total Deaths by Variant (Bar Chart)
# Sum total_deaths over all rows of each variant; as_index=False keeps
# 'variant' as an ordinary column so the result is ready for plotting.
variant_deaths = df.groupby('variant', as_index=False)['total_deaths'].sum()

fig = px.bar(
    data_frame=variant_deaths,
    x='variant',
    y='total_deaths',
    title='Total Deaths by Variant',
    labels={'variant': 'Variant', 'total_deaths': 'Total Deaths'},
)
fig.show()