In [1]:
import polars as pl
import plotly.express as px

In [2]:
# Load the raw worldwide COVID-19 table and preview its first rows.
# The preview also shows the inferred dtypes of each column.
df = pl.read_csv("covid_worldwide.csv")
df.head(n=5)

Serial Number,Country,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
i64,str,str,str,str,str,str,str
1,"""USA""","""104,196,861""","""1,132,935""","""101,322,779""","""1,741,147""","""1,159,832,679""","""334,805,269"""
2,"""India""","""44,682,784""","""530,740""","""44,150,289""","""1,755""","""915,265,788""","""1,406,631,776"""
3,"""France""","""39,524,311""","""164,233""","""39,264,546""","""95,532""","""271,490,188""","""65,584,518"""
4,"""Germany""","""37,779,833""","""165,711""","""37,398,100""","""216,022""","""122,332,384""","""83,883,596"""
5,"""Brazil""","""36,824,580""","""697,074""","""35,919,372""","""208,134""","""63,776,166""","""215,353,593"""


In [3]:
# Is there any missing data? If so, perform missing data treatment of your choice.
# Count the nulls in every column (one row, one count per column).
# Result: Total Deaths (6), Total Recovered (2), Total Test (18) and
# Population (3) contain missing values; the other columns are complete.
missing = df.null_count()
print(missing)

shape: (1, 8)
┌────────────┬─────────┬───────────┬────────────┬────────────┬────────────┬──────────┬─────────────┐
│ Serial     ┆ Country ┆ Total     ┆ Total      ┆ Total      ┆ Active     ┆ Total    ┆ Population  │
│ Number     ┆ ---     ┆ Cases     ┆ Deaths     ┆ Recovered  ┆ Cases      ┆ Test     ┆ ---         │
│ ---        ┆ u32     ┆ ---       ┆ ---        ┆ ---        ┆ ---        ┆ ---      ┆ u32         │
│ u32        ┆         ┆ u32       ┆ u32        ┆ u32        ┆ u32        ┆ u32      ┆             │
╞════════════╪═════════╪═══════════╪════════════╪════════════╪════════════╪══════════╪═════════════╡
│ 0          ┆ 0       ┆ 0         ┆ 6          ┆ 2          ┆ 0          ┆ 18       ┆ 3           │
└────────────┴─────────┴───────────┴────────────┴────────────┴────────────┴──────────┴─────────────┘


In [4]:
# Missing-data treatment: replace every null with the string "0".
# The affected columns are still string-typed at this point, so the fill
# value is a string too. Once the columns are cast to integers later on,
# a 0 in them can therefore mean "not reported" rather than a true zero.
df = df.fill_null(value="0")

# Confirm that no nulls remain in any column.
print(df.null_count())

shape: (1, 8)
┌────────────┬─────────┬───────────┬────────────┬────────────┬────────────┬──────────┬─────────────┐
│ Serial     ┆ Country ┆ Total     ┆ Total      ┆ Total      ┆ Active     ┆ Total    ┆ Population  │
│ Number     ┆ ---     ┆ Cases     ┆ Deaths     ┆ Recovered  ┆ Cases      ┆ Test     ┆ ---         │
│ ---        ┆ u32     ┆ ---       ┆ ---        ┆ ---        ┆ ---        ┆ ---      ┆ u32         │
│ u32        ┆         ┆ u32       ┆ u32        ┆ u32        ┆ u32        ┆ u32      ┆             │
╞════════════╪═════════╪═══════════╪════════════╪════════════╪════════════╪══════════╪═════════════╡
│ 0          ┆ 0       ┆ 0         ┆ 0          ┆ 0          ┆ 0          ┆ 0        ┆ 0           │
└────────────┴─────────┴───────────┴────────────┴────────────┴────────────┴──────────┴─────────────┘


In [5]:
# What are the data types? Looks like most of the numeric columns need to be changed. Convert them to floats and integers as necessary.
# The dtypes are shown under the column names in the preview above: every
# count column was read as a string because the values carry thousands
# separators (e.g. "104,196,861"). Convert the columns that are really numeric.

list_of_string_columns = ["Total Cases", "Total Deaths", "Total Recovered", "Active Cases", "Total Test", "Population"]

# Strip the separators and cast in a single with_columns call instead of
# rebuilding the frame once per column and per step.
# - literal=True matches a plain comma rather than compiling a regex.
# - strict=False turns any value that still cannot be parsed as an integer
#   into null instead of raising, so check null_count() afterwards if the
#   source file changes.
# NOTE: run this cell once per fresh load; after the cast the columns are no
# longer strings, so the .str step will fail on a second run.
df = df.with_columns(
    pl.col(list_of_string_columns)
    .str.replace_all(",", "", literal=True)
    .cast(pl.Int64, strict=False)
)

df.head()

Serial Number,Country,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Test,Population
i64,str,i64,i64,i64,i64,i64,i64
1,"""USA""",104196861,1132935,101322779,1741147,1159832679,334805269
2,"""India""",44682784,530740,44150289,1755,915265788,1406631776
3,"""France""",39524311,164233,39264546,95532,271490188,65584518
4,"""Germany""",37779833,165711,37398100,216022,122332384,83883596
5,"""Brazil""",36824580,697074,35919372,208134,63776166,215353593


In [6]:
# How many different countries had the virus?
# Count the distinct values in the Country column.
country_names = df.get_column("Country")
num_countries = country_names.n_unique()
print(num_countries, "countries")

231 countries


In [7]:
# Create a geographical plot of the distribution of deaths from around the world.
# The map is built in Tableau rather than in this notebook; see the Dashboard link at the end.

In [8]:
# What are the top 5 countries in active cases?
# Order by active cases (largest first), keep only the two columns of
# interest, and show the first five rows.
(
    df.sort("Active Cases", descending=True)
    .select(["Country", "Active Cases"])
    .head(5)
)

Country,Active Cases
str,i64
"""Japan""",10952618
"""USA""",1741147
"""Poland""",925549
"""Vietnam""",870843
"""Mexico""",429421


In [9]:
# What are the top 5 countries in total recoveries?
# Order by total recovered (largest first), keep only the two columns of
# interest, and show the first five rows.
(
    df.sort("Total Recovered", descending=True)
    .select(["Country", "Total Recovered"])
    .head(5)
)

Country,Total Recovered
str,i64
"""USA""",101322779
"""India""",44150289
"""France""",39264546
"""Germany""",37398100
"""Brazil""",35919372


In [10]:
# Create your own question and answer it.
# What are the top 5 countries in total deaths?
# Order by total deaths (largest first), keep only the two columns of
# interest, and show the first five rows.
(
    df.sort("Total Deaths", descending=True)
    .select(["Country", "Total Deaths"])
    .head(5)
)

Country,Total Deaths
str,i64
"""USA""",1132935
"""Brazil""",697074
"""India""",530740
"""Russia""",395108
"""Mexico""",332198


# Dashboard
https://public.tableau.com/shared/346P5MS8G?:display_count=n&:origin=viz_share_link