In [16]:
import pandas as pd
import numpy as np

def clean_and_preprocess(file_path):
    # Reading CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Defining the columns to check for negative values
    columns_to_check = ['population', 'Cases', 'Recovered', 'Deaths', 'Tests']
    
    # Replacing NaN and infinite values with 0 for numerical columns to safely convert to integers
    df[columns_to_check] = df[columns_to_check].replace([np.inf, -np.inf], np.nan).fillna(0)
    
    # Converting negative values to positive integer values in the specified columns
    for col in columns_to_check:
        df[col] = df[col].abs().astype(int)
    
    # Handling missing values for other columns
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64'] and col not in columns_to_check:
            # Filling missing values with the mean for numerical columns
            df[col].fillna(df[col].mean(), inplace=True)
        elif df[col].dtype == 'object':
            # Filling missing values with the most frequent value for categorical columns
            df[col].fillna(df[col].mode()[0], inplace=True)
    
    # Normalizing numerical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    df[numerical_cols] = (df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std()
    
    # Encoding categorical columns using one-hot encoding
    categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    
    return df

# Example usage:
csv_file_path = 'Documents/covid_19.csv'
df_cleaned = clean_and_preprocess(csv_file_path)
print(df_cleaned.head())
df_cleaned.isnull().sum()


   population  Cases  Recovered  Deaths  Tests  country_Africa  \
0        6115   2166          2       0      0           False   
1        3539   1930       1930       0   8632           False   
2        4965   1403       1376       8  17762           False   
3           0    712        699      13      0           False   
4         799     29         29       0      0           False   

   country_Albania  country_Algeria  country_All  country_Andorra  ...  \
0            False            False        False            False  ...   
1            False            False        False            False  ...   
2            False            False        False            False  ...   
3            False            False        False            False  ...   
4            False            False        False            False  ...   

   continent_Europe  continent_North-America  continent_Oceania  \
0             False                    False              False   
1             False     

population                        0
Cases                             0
Recovered                         0
Deaths                            0
Tests                             0
                                 ..
time_2024-06-30T16:15:12+00:00    0
time_2024-06-30T16:15:13+00:00    0
time_2024-06-30T16:15:14+00:00    0
time_2024-06-30T16:15:15+00:00    0
time_2024-06-30T16:15:16+00:00    0
Length: 254, dtype: int64