In [2]:
import pandas as pd

# Upload the file manually in Colab and load it into a DataFrame.
import io

from google.colab import files

# files.upload() prompts for one or more files and returns a dict that maps
# each uploaded file name to its raw contents (bytes).
uploaded = files.upload()

# Expected file name; change this to match your file name.
file_path = "country_wise_latest.csv"

if uploaded:
    # Read straight from the uploaded bytes instead of from disk. When a file
    # with the same name already exists, Colab saves the new upload under a
    # de-duplicated name (e.g. "country_wise_latest (1).csv"), so reading the
    # fixed path from disk could silently load a stale copy.
    uploaded_name = file_path if file_path in uploaded else next(iter(uploaded))
    data = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a file
    # with the expected name that is already present in the working directory.
    data = pd.read_csv(file_path)

# Step 1: Inspect the dataset
print("Dataset Overview:")
print(data.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()

# Step 2: Check for missing data (per-column count of null cells)
missing_data = data.isnull().sum()
print("\nMissing Data Summary:")
print(missing_data)

# Step 3: Fill or handle missing values
# NOTE: every missing cell becomes 0, in text columns as well as numeric ones.
data = data.fillna(0)
print("\nMissing values filled with 0.")

# Step 4: Check for duplicate rows (reported only; rows are not removed)
duplicate_rows = data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

# Step 5: Standardize column names (spaces -> underscores; other characters
# such as "/" are left untouched)
data.columns = [col.replace(" ", "_") for col in data.columns]
print("\nStandardized Column Names:")
print(data.columns)

# Step 6: Validate no negative values in numerical columns
# "number" selects every numeric dtype, not only int64/float64, so columns
# parsed with a different width are not silently skipped.
negative_values = (data.select_dtypes(include="number") < 0).sum()
print("\nNegative Values Summary:")
print(negative_values)

# Step 7: Validate derived columns
# Active is expected to equal Confirmed - Deaths - Recovered. The comparison
# uses a standalone Series so the frame that gets saved is never given (and
# never has to be cleaned of) a temporary helper column.
calculated_active = data['Confirmed'] - data['Deaths'] - data['Recovered']
mismatch_active = (data['Active'] != calculated_active).sum()
print(f"\nNumber of mismatched Active cases: {mismatch_active}")

# Save the cleaned dataset (index omitted so the CSV keeps the same columns)
cleaned_file_path = "country_wise_latest_cleaned.csv"
data.to_csv(cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to: {cleaned_file_path}")


Saving country_wise_latest.csv to country_wise_latest (1).csv
Dataset Overview:
  Country/Region  Confirmed  Deaths  Recovered  Active  New cases  New deaths  \
0    Afghanistan      36263    1269      25198    9796        106          10   
1        Albania       4880     144       2745    1991        117           6   
2        Algeria      27973    1163      18837    7973        616           8   
3        Andorra        907      52        803      52         10           0   
4         Angola        950      41        242     667         18           1   

   New recovered  Deaths / 100 Cases  Recovered / 100 Cases  \
0             18                3.50                  69.49   
1             63                2.95                  56.25   
2            749                4.16                  67.34   
3              0                5.73                  88.53   
4              0                4.32                  25.47   

   Deaths / 100 Recovered  Confirmed last week  1 week