# COVID-19 Data Loading and Preprocessing
We will load the COVID-19 data, select the relevant columns, and ensure that the data types are correct.


In [12]:
# Import libraries
import numpy as np
import pandas as pd


In [13]:
# Define the data loading and preprocessing function
def load_and_preprocess_data(filepath):
    """
    Load the COVID-19 data from a csv file and preprocess it by selecting specific columns and
    ensuring correct data types.

    Parameters:
    - filepath: str, the path to the csv file

    Returns:
    - DataFrame with the columns 'iso_code', 'continent', 'location', 'date', 'total_cases'
      and 'new_cases', where 'date' is parsed to datetime and the two case columns are numeric
      (values that cannot be parsed as numbers become NaN).
    - None if the file does not exist or any other error occurs while loading or converting;
      in that case a message describing the problem is printed.
    """
    try:
        # Load the data into a DataFrame
        raw_data = pd.read_csv(filepath)

        # Select relevant columns. Take an explicit copy so the column assignments below
        # operate on an independent frame instead of a slice derived from raw_data, which
        # would otherwise trigger pandas' SettingWithCopyWarning.
        columns_of_interest = ['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases']
        data = raw_data[columns_of_interest].copy()

        # Convert date column to datetime
        data['date'] = pd.to_datetime(data['date'])

        # Ensure numeric columns are of appropriate data type; unparseable entries become NaN
        numeric_columns = ['total_cases', 'new_cases']
        data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

        return data

    except FileNotFoundError:
        print(f"The file {filepath} does not exist.")
        return None
    except Exception as e:
        # Deliberately broad: callers rely on a None return (not an exception) to detect failure,
        # e.g. a missing expected column or an unparseable date.
        print(f"An error occurred: {e}")
        return None


## Load the Data
Now we will load the data using the function we defined above.


In [14]:
# Load the data from a relative CSV path.
# load_and_preprocess_data returns None on failure (after printing a message),
# so the cells below check `covid_data is not None` before using it.
covid_data_filepath = "covid-data.csv"
covid_data = load_and_preprocess_data(covid_data_filepath)


  data['date'] = pd.to_datetime(data['date'])


## Inspect the Data
Let's look at the first few rows of the DataFrame, then check the data types of its columns and its shape.


In [16]:
# Preview the first five rows, or report that loading did not succeed
if covid_data is None:
    print("Data loading failed.")
else:
    # head() defaults to five rows
    display(covid_data.head())


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases
0,AFG,Asia,Afghanistan,2020-02-24,5,5
1,AFG,Asia,Afghanistan,2020-02-25,5,0
2,AFG,Asia,Afghanistan,2020-02-26,5,0
3,AFG,Asia,Afghanistan,2020-02-27,5,0
4,AFG,Asia,Afghanistan,2020-02-28,5,0


In [17]:
# Print the dtype of every column; skipped when loading failed and covid_data is None
if covid_data is not None:
    print(covid_data.dtypes)


iso_code               object
continent              object
location               object
date           datetime64[ns]
total_cases             int64
new_cases               int64
dtype: object


In [18]:
# Print the (rows, columns) shape of the DataFrame; skipped when loading failed and covid_data is None
if covid_data is not None:
    print(f"The shape of the DataFrame is: {covid_data.shape}")


The shape of the DataFrame is: (5818, 6)


## Analyze the Data
We will calculate various percentiles for the 'new_cases' column to understand its distribution.


In [19]:
# Report selected percentiles of the 'new_cases' column, ignoring missing values
if covid_data is not None:
    percentiles_to_calculate = [25, 50, 60, 75, 90]
    new_cases_observed = covid_data['new_cases'].dropna()
    percentiles = np.percentile(new_cases_observed, percentiles_to_calculate)
    # percentiles[position] is the value for percentiles_to_calculate[position]
    for position, p in enumerate(percentiles_to_calculate):
        value = percentiles[position]
        print(f"{p}th percentile for new cases: {value}")


25th percentile for new cases: 24.0
50th percentile for new cases: 261.0
60th percentile for new cases: 591.3999999999996
75th percentile for new cases: 3666.0
90th percentile for new cases: 32886.600000000006
