In [1]:
import pandas as pd 
import numpy as np 

In [2]:
#The File Handling Part
def save_data_to_csv(data, dataset):
    """Write a DataFrame to a CSV file and report where it went.

    Parameters:
        data: the pandas DataFrame to persist.
        dataset: destination path of the CSV file.

    The index is not written, so the file contains only the frame's columns.
    A confirmation line is printed once the write has finished.
    """
    data.to_csv(path_or_buf=dataset, index=False)
    print("Data successfully saved to {}".format(dataset))

def load_data_from_csv(dataset_path="C:\\Users\\KUBER\\OneDrive\\Documents\\country_wise_latest.csv"):
    """Read a CSV file into a DataFrame and report where it came from.

    Parameters:
        dataset_path: path of the CSV file to read. The default is an
            absolute path on one specific Windows machine, so pass an
            explicit path when running the notebook anywhere else.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    The confirmation line is printed only after the read has succeeded.
    """
    frame = pd.read_csv(dataset_path)
    print("Data successfully loaded from {}".format(dataset_path))
    return frame


In [3]:
#The Exception Handling Part
class DataCleaningError(Exception):
    """Base class for every error raised by the data-cleaning checks."""


class MissingRequiredColumnsError(DataCleaningError):
    """Signals that a column the cleaning steps rely on is absent."""


class UnexpectedDataTypeError(DataCleaningError):
    """Signals that a column holds a dtype the cleaning steps do not expect."""

In [4]:
#Using the File Handling section to load the data from csv file to the dataset 

dataset = load_data_from_csv()

Data successfully loaded from C:\Users\KUBER\OneDrive\Documents\country_wise_latest.csv


In [5]:
'''Data Cleaning Section Starts from here !!'''
#Returns the first 10 rows of the csv file

dataset.head(10)

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa
5,Antigua and Barbuda,86,3,65,18,4,0,5,3.49,75.58,4.62,76,10,13.16,Americas
6,Argentina,167416,3059,72575,91782,4890,120,2057,1.83,43.35,4.21,130774,36642,28.02,Americas
7,Armenia,37390,711,26665,10014,73,6,187,1.9,71.32,2.67,34981,2409,6.89,Europe
8,Australia,15303,167,9311,5825,368,6,137,1.09,60.84,1.79,12428,2875,23.13,Western Pacific
9,Austria,20558,713,18246,1599,86,1,37,3.47,88.75,3.91,19743,815,4.13,Europe


In [6]:
#returns the number of rows and column of the dataset

dataset.shape

(187, 15)

In [7]:
#Returns Data information about the dataset

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country/Region          187 non-null    object 
 1   Confirmed               187 non-null    int64  
 2   Deaths                  187 non-null    int64  
 3   Recovered               187 non-null    int64  
 4   Active                  187 non-null    int64  
 5   New cases               187 non-null    int64  
 6   New deaths              187 non-null    int64  
 7   New recovered           187 non-null    int64  
 8   Deaths / 100 Cases      187 non-null    float64
 9   Recovered / 100 Cases   187 non-null    float64
 10  Deaths / 100 Recovered  187 non-null    float64
 11  Confirmed last week     187 non-null    int64  
 12  1 week change           187 non-null    int64  
 13  1 week % increase       187 non-null    float64
 14  WHO Region              187 non-null    ob

In [8]:
#It is used for getting number of unique values in the columns of the Dataset

dataset.nunique()

Country/Region            187
Confirmed                 184
Deaths                    150
Recovered                 178
Active                    173
New cases                 122
New deaths                 38
New recovered             103
Deaths / 100 Cases        145
Recovered / 100 Cases     177
Deaths / 100 Recovered    155
Confirmed last week       183
1 week change             162
1 week % increase         169
WHO Region                  6
dtype: int64

In [9]:
#Counts the null values in each column of the dataset

dataset.isnull().sum()

Country/Region            0
Confirmed                 0
Deaths                    0
Recovered                 0
Active                    0
New cases                 0
New deaths                0
New recovered             0
Deaths / 100 Cases        0
Recovered / 100 Cases     0
Deaths / 100 Recovered    0
Confirmed last week       0
1 week change             0
1 week % increase         0
WHO Region                0
dtype: int64

In [10]:
#Drops every row that contains a null value (no rows are removed here, because the check above found no nulls)

dataset = dataset.dropna()

In [11]:
#Counts the null values in each column of the dataset again, after dropna()

dataset.isnull().sum()  

Country/Region            0
Confirmed                 0
Deaths                    0
Recovered                 0
Active                    0
New cases                 0
New deaths                0
New recovered             0
Deaths / 100 Cases        0
Recovered / 100 Cases     0
Deaths / 100 Recovered    0
Confirmed last week       0
1 week change             0
1 week % increase         0
WHO Region                0
dtype: int64

In [12]:
#Re-casts text-typed columns: every column between the first and the last one
#whose dtype is 'object' is converted with astype('str').
#Numeric columns are skipped, so their dtypes are left exactly as loaded.

for col in dataset.columns[1:-1]:
    column = dataset[col]
    if column.dtype != 'object':
        continue
    dataset[col] = column.astype('str')

In [13]:
#Returns Data information about the dataset

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country/Region          187 non-null    object 
 1   Confirmed               187 non-null    int64  
 2   Deaths                  187 non-null    int64  
 3   Recovered               187 non-null    int64  
 4   Active                  187 non-null    int64  
 5   New cases               187 non-null    int64  
 6   New deaths              187 non-null    int64  
 7   New recovered           187 non-null    int64  
 8   Deaths / 100 Cases      187 non-null    float64
 9   Recovered / 100 Cases   187 non-null    float64
 10  Deaths / 100 Recovered  187 non-null    float64
 11  Confirmed last week     187 non-null    int64  
 12  1 week change           187 non-null    int64  
 13  1 week % increase       187 non-null    float64
 14  WHO Region              187 non-null    ob

In [14]:
#It is used to get a quick overview of the statistical summary of the data in a DataFrame

dataset.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase
count,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0
mean,88130.94,3497.518717,50631.48,34001.94,1222.957219,28.957219,933.812834,3.019519,64.820535,inf,78682.48,9448.459893,13.606203
std,383318.7,14100.002482,190188.2,213326.2,5710.37479,120.037173,4197.719635,3.454302,26.287694,,338273.7,47491.127684,24.509838
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,-47.0,-3.84
25%,1114.0,18.5,626.5,141.5,4.0,0.0,0.0,0.945,48.77,1.45,1051.5,49.0,2.775
50%,5059.0,108.0,2815.0,1600.0,49.0,1.0,22.0,2.15,71.32,3.62,5020.0,432.0,6.89
75%,40460.5,734.0,22606.0,9149.0,419.5,6.0,221.0,3.875,86.885,6.44,37080.5,3172.0,16.855
max,4290259.0,148011.0,1846641.0,2816444.0,56336.0,1076.0,33728.0,28.56,100.0,inf,3834677.0,455582.0,226.32


In [15]:
#Returns the column labels of the dataset

dataset.columns

Index(['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'New cases', 'New deaths', 'New recovered', 'Deaths / 100 Cases',
       'Recovered / 100 Cases', 'Deaths / 100 Recovered',
       'Confirmed last week', '1 week change', '1 week % increase',
       'WHO Region'],
      dtype='object')

In [16]:
# Validate the 'Confirmed' column before the analysis relies on it.
# Both custom exceptions derive from DataCleaningError, so the single handler
# below reports either failure as a message instead of aborting the cell.
try:
    if 'Confirmed' not in dataset.columns:
        raise MissingRequiredColumnsError("Missing 'Confirmed' column")

    # Inspect the column's dtype itself: comparing the bare string
    # 'Confirmed' with `object` is always False and would never raise.
    if dataset['Confirmed'].dtype == object:
        raise UnexpectedDataTypeError("The 'Confirmed' column is in Object Datatype")

except DataCleaningError as e:
    print(f"Error during data cleaning: {e}")


In [17]:
#Performing some basic analysis on the dataset: overall totals, plus deaths
#and recoveries expressed as a percentage of all confirmed cases

total_confirmed, total_deaths, total_recovered = (
    dataset[column].sum() for column in ('Confirmed', 'Deaths', 'Recovered')
)

death_rate = (total_deaths / total_confirmed) * 100
recovery_rate = (total_recovered / total_confirmed) * 100

print(
    f'Total Confirmed: {total_confirmed}',
    f'Total Deaths: {total_deaths}',
    f'Total Recovered: {total_recovered}',
    f'Death Rate: {death_rate:.2f}%',
    f'Recovery Rate: {recovery_rate:.2f}%',
    sep='\n',
)

Total Confirmed: 16480485
Total Deaths: 654036
Total Recovered: 9468087
Death Rate: 3.97%
Recovery Rate: 57.45%


In [19]:
#Saving the Clean Data without any null values to a new file called 'clean_country_wise_data.csv'

save_data_to_csv(dataset, 'clean_country_wise_data.csv')

Data successfully saved to clean_country_wise_data.csv


In [20]:
dataset.columns

Index(['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'New cases', 'New deaths', 'New recovered', 'Deaths / 100 Cases',
       'Recovered / 100 Cases', 'Deaths / 100 Recovered',
       'Confirmed last week', '1 week change', '1 week % increase',
       'WHO Region'],
      dtype='object')