#### Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statistics as stat

# Load the dataset
covid_data = pd.read_csv("data/covid-data.csv")

# Check the first few rows to ensure the data is loaded correctly
covid_data.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,24/02/2020,5,5,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,25/02/2020,5,0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,26/02/2020,5,0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,27/02/2020,5,0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,28/02/2020,5,0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [2]:
covid_data = covid_data[['iso_code','continent','location','date','total_cases','new_cases']]

In [3]:
covid_data.head(5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases
0,AFG,Asia,Afghanistan,24/02/2020,5,5
1,AFG,Asia,Afghanistan,25/02/2020,5,0
2,AFG,Asia,Afghanistan,26/02/2020,5,0
3,AFG,Asia,Afghanistan,27/02/2020,5,0
4,AFG,Asia,Afghanistan,28/02/2020,5,0


In [4]:
covid_data.dtypes

iso_code       object
continent      object
location       object
date           object
total_cases     int64
new_cases       int64
dtype: object

In [5]:
covid_data.shape

(5818, 6)

In [8]:
# Using pandas
mean_pandas = covid_data.mean()

# Using numpy
mean_numpy = np.mean(covid_data, axis=0)

# Using statistics (for a single column, assuming 'column_name' is one of your columns)
mean_stat = stat.mean(covid_data['total_cases'])

print("Pandas Mean:\n", mean_pandas)
print("Numpy Mean:\n", mean_numpy)
print("Statistics Mean for a specific column:\n", mean_stat)

Pandas Mean:
 total_cases    3.392399e+06
new_cases      8.814366e+03
dtype: float64
Numpy Mean:
 total_cases    3.392399e+06
new_cases      8.814366e+03
dtype: float64
Statistics Mean for a specific column:
 3392398.6914747334


In [7]:
# Using pandas
mean_pandas = covid_data[['total_cases', 'new_cases']].mean()

# Using numpy
mean_numpy = np.mean(covid_data[['total_cases', 'new_cases']], axis=0)

print("Pandas Mean:\n", mean_pandas)
print("Numpy Mean:\n", mean_numpy)

Pandas Mean:
 total_cases    3.392399e+06
new_cases      8.814366e+03
dtype: float64
Numpy Mean:
 total_cases    3.392399e+06
new_cases      8.814366e+03
dtype: float64


In [9]:
# Using pandas
median_pandas = covid_data.median()

# Using numpy
median_numpy = np.median(covid_data, axis=0)

# Using statistics
median_stat = stat.median(covid_data['total_cases'])

print("Pandas Median:\n", median_pandas)
print("Numpy Median:\n", median_numpy)
print("Statistics Median for a specific column:\n", median_stat)

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [10]:
# Using pandas
median_pandas = covid_data[['total_cases', 'new_cases']].median()

# Using numpy
median_numpy = np.median(covid_data[['total_cases', 'new_cases']], axis=0)

print("Pandas Median:\n", median_pandas)
print("Numpy Median:\n", median_numpy)

Pandas Median:
 total_cases    176305.0
new_cases         261.0
dtype: float64
Numpy Median:
 [176305.    261.]


In [17]:
# Using pandas
mode_pandas = covid_data['total_cases'].mode()

# Using statistics
mode_stat = stat.mode(covid_data['total_cases'])

print("Pandas Mode:\n", mode_pandas)
print("Statistics Mode for a specific column:\n", mode_stat)

Pandas Mode:
 0    1
dtype: int64
Statistics Mode for a specific column:
 1


In [18]:
# Using pandas
variance_pandas = covid_data.var()

# Using numpy
variance_numpy = np.var(covid_data, axis=0)

# Using statistics
variance_stat = stat.variance(covid_data['total_cases'])

print("Pandas Variance:\n", variance_pandas)
print("Numpy Variance:\n", variance_numpy)
print("Statistics Variance for a specific column:\n", variance_stat)

Pandas Variance:
 total_cases    5.837733e+13
new_cases      4.513995e+08
dtype: float64
Numpy Variance:
 total_cases    5.836730e+13
new_cases      4.513219e+08
dtype: float64
Statistics Variance for a specific column:
 58377331856144.664


In [19]:
# Using pandas
std_pandas = covid_data.std()

# Using numpy
std_numpy = np.std(covid_data, axis=0)

# Using statistics
std_stat = stat.stdev(covid_data['total_cases'])

print("Pandas Standard Deviation:\n", std_pandas)
print("Numpy Standard Deviation:\n", std_numpy)
print("Statistics Standard Deviation for a specific column:\n", std_stat)

Pandas Standard Deviation:
 total_cases    7.640506e+06
new_cases      2.124616e+04
dtype: float64
Numpy Standard Deviation:
 total_cases    7.639849e+06
new_cases      2.124434e+04
dtype: float64
Statistics Standard Deviation for a specific column:
 7640505.994771856


In [21]:
# Using scipy
skewness_scipy = covid_data[['total_cases', 'new_cases']].apply(lambda x: stats.skew(x.dropna()))

print("Scipy Skewness:\n", skewness_scipy)



Scipy Skewness:
 total_cases    2.742075
new_cases      4.270346
dtype: float64


In [22]:
# Using scipy
kurtosis_scipy = covid_data[['total_cases', 'new_cases']].apply(lambda x: stats.kurtosis(x.dropna()))

print("Scipy Kurtosis:\n", kurtosis_scipy)


Scipy Kurtosis:
 total_cases     6.669474
new_cases      27.763556
dtype: float64


In [23]:
# Min and Max using pandas
min_pandas = covid_data[['total_cases', 'new_cases']].min()
max_pandas = covid_data[['total_cases', 'new_cases']].max()

print("Pandas Minimum:\n", min_pandas)
print("Pandas Maximum:\n", max_pandas)


Pandas Minimum:
 total_cases    1
new_cases      0
dtype: int64
Pandas Maximum:
 total_cases    34724189
new_cases        287149
dtype: int64


In [24]:
# Using numpy
percentiles_numpy = covid_data[['total_cases', 'new_cases']].apply(lambda x: np.percentile(x.dropna(), [25, 50, 75]))

print("Numpy Percentiles (25th, 50th, 75th):\n", percentiles_numpy)


Numpy Percentiles (25th, 50th, 75th):
    total_cases  new_cases
0      40975.0       24.0
1     176305.0      261.0
2    1317075.5     3666.0


In [25]:
# Using pandas describe function
summary_table = covid_data[['total_cases', 'new_cases']].describe()

print("Summary Table:\n", summary_table)


Summary Table:
         total_cases      new_cases
count  5.818000e+03    5818.000000
mean   3.392399e+06    8814.365761
std    7.640506e+06   21246.164422
min    1.000000e+00       0.000000
25%    4.097500e+04      24.000000
50%    1.763050e+05     261.000000
75%    1.317076e+06    3666.000000
max    3.472419e+07  287149.000000


In [26]:
# Using pandas to calculate the IQR
IQR_pandas = covid_data[['total_cases', 'new_cases']].apply(lambda x: np.percentile(x.dropna(), 75) - np.percentile(x.dropna(), 25))

print("Pandas Interquartile Range (IQR):\n", IQR_pandas)


Pandas Interquartile Range (IQR):
 total_cases    1276100.5
new_cases         3642.0
dtype: float64


In [27]:
# Using pandas
correlation_pandas = covid_data[['total_cases', 'new_cases']].corr()

# Using numpy
correlation_numpy = np.corrcoef(covid_data['total_cases'], covid_data['new_cases'])

print("Pandas Correlation:\n", correlation_pandas)
print("Numpy Correlation:\n", correlation_numpy)


Pandas Correlation:
              total_cases  new_cases
total_cases     1.000000   0.521164
new_cases       0.521164   1.000000
Numpy Correlation:
 [[1.         0.52116444]
 [0.52116444 1.        ]]


In [28]:
# Using pandas
range_pandas = covid_data[['total_cases', 'new_cases']].apply(lambda x: x.max() - x.min())

print("Pandas Range:\n", range_pandas)


Pandas Range:
 total_cases    34724188
new_cases        287149
dtype: int64


In [29]:
# Using pandas
covariance_pandas = covid_data[['total_cases', 'new_cases']].cov()

print("Pandas Covariance:\n", covariance_pandas)


Pandas Covariance:
               total_cases     new_cases
total_cases  5.837733e+13  8.460138e+10
new_cases    8.460138e+10  4.513995e+08


In [30]:
summary_extended = pd.DataFrame({
    "Mean": covid_data[['total_cases', 'new_cases']].mean(),
    "Median": covid_data[['total_cases', 'new_cases']].median(),
    "Variance": covid_data[['total_cases', 'new_cases']].var(),
    "Standard Deviation": covid_data[['total_cases', 'new_cases']].std(),
    "IQR": covid_data[['total_cases', 'new_cases']].apply(lambda x: np.percentile(x.dropna(), 75) - np.percentile(x.dropna(), 25)),
    "Min": covid_data[['total_cases', 'new_cases']].min(),
    "Max": covid_data[['total_cases', 'new_cases']].max(),
    "Range": covid_data[['total_cases', 'new_cases']].apply(lambda x: x.max() - x.min())
})

print("Extended Summary Statistics Table:\n", summary_extended)


Extended Summary Statistics Table:
                      Mean    Median      Variance  Standard Deviation  \
total_cases  3.392399e+06  176305.0  5.837733e+13        7.640506e+06   
new_cases    8.814366e+03     261.0  4.513995e+08        2.124616e+04   

                   IQR  Min       Max     Range  
total_cases  1276100.5    1  34724189  34724188  
new_cases       3642.0    0    287149    287149  
