In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os

# Path to the folder containing the CSV files in Google Drive
folder_path = '/content/drive/MyDrive/csse_covid_19_daily_reports'  # Adjust this path as needed

# Output file path (save in Google Drive)
output_file_path = '/content/drive/MyDrive/combined_dataset.csv'

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Create an empty list to store DataFrames
df_list = []

# Read each CSV file and append to the list
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    temp_df = pd.read_csv(file_path)
    df_list.append(temp_df)

# Concatenate all DataFrames in the list into one large DataFrame
combined_df = pd.concat(df_list, ignore_index=True)

# Save the combined DataFrame to a CSV file in Google Drive
combined_df.to_csv(output_file_path, index=False)

print(f"Combined dataset saved to {output_file_path}")


Combined dataset saved to /content/drive/MyDrive/combined_dataset.csv


In [None]:
print(combined_df.head())

   FIPS Admin2 Province_State Country_Region          Last_Update       Lat  \
0   NaN    NaN            NaN    Afghanistan  2023-02-10 20:20:57  33.93911   
1   NaN    NaN            NaN        Albania  2023-02-10 20:20:57  41.15330   
2   NaN    NaN            NaN        Algeria  2023-02-10 20:20:57  28.03390   
3   NaN    NaN            NaN        Andorra  2023-02-10 20:20:57  42.50630   
4   NaN    NaN            NaN         Angola  2023-02-10 20:20:57 -11.20270   

       Long_  Confirmed  Deaths  Recovered  ...  Combined_Key Incident_Rate  \
0  67.709953   208943.0  7896.0        NaN  ...   Afghanistan    536.737489   
1  20.168300   334229.0  3596.0        NaN  ...       Albania  11614.045451   
2   1.659600   271406.0  6881.0        NaN  ...       Algeria    618.927126   
3   1.521800    47860.0   165.0        NaN  ...       Andorra  61942.664855   
4  17.873900   105184.0  1931.0        NaN  ...        Angola    320.036336   

   Case_Fatality_Ratio  Province/State Country/Reg

In [None]:
# Display the total number of rows
total_rows = combined_df.shape[0]
print(f"Total number of rows in the combined dataset: {total_rows}")

Total number of rows in the combined dataset: 4327543


In [None]:
import pandas as pd

# Path to the uploaded CSV file in Google Drive
csv_file_path = '/content/drive/MyDrive/combined_dataset.csv'  # Update with the actual file name

# Output file path (save the modified file in Google Drive)
output_file_path = '/content/drive/MyDrive/modified_dataset.csv'  # Path for the modified file

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Remove the 'FIPS' and 'Admin2' columns if they exist
df = df.drop(columns=['FIPS', 'Admin2'], errors='ignore')

# Save the modified DataFrame to a new CSV file in Google Drive
df.to_csv(output_file_path, index=False)

print(f"Modified dataset saved to {output_file_path}")

# Display the first 5 rows of the modified dataset
print(df.head())  # Shows the first 5 rows of the modified DataFrame

# Display the total number of rows in the modified dataset
total_rows = df.shape[0]
print(f"Total number of rows in the modified dataset: {total_rows}")


  df = pd.read_csv(csv_file_path)


Modified dataset saved to /content/drive/MyDrive/modified_dataset.csv
  Province_State Country_Region          Last_Update       Lat      Long_  \
0            NaN    Afghanistan  2023-02-10 20:20:57  33.93911  67.709953   
1            NaN        Albania  2023-02-10 20:20:57  41.15330  20.168300   
2            NaN        Algeria  2023-02-10 20:20:57  28.03390   1.659600   
3            NaN        Andorra  2023-02-10 20:20:57  42.50630   1.521800   
4            NaN         Angola  2023-02-10 20:20:57 -11.20270  17.873900   

   Confirmed  Deaths  Recovered  Active Combined_Key  Incident_Rate  \
0   208943.0  7896.0        NaN     NaN  Afghanistan     536.737489   
1   334229.0  3596.0        NaN     NaN      Albania   11614.045451   
2   271406.0  6881.0        NaN     NaN      Algeria     618.927126   
3    47860.0   165.0        NaN     NaN      Andorra   61942.664855   
4   105184.0  1931.0        NaN     NaN       Angola     320.036336   

   Case_Fatality_Ratio Province/State Co

In [None]:
import pandas as pd

# Path to your uploaded CSV file in Google Drive
csv_file_path = '/content/drive/MyDrive/modified_dataset.csv'  # Update with the actual file name

# Output file path for cleaned data
output_file_path = '/content/drive/MyDrive/cleaned_dataset.csv'  # Path for the cleaned file

# Step 1: Read the dataset
df = pd.read_csv(csv_file_path)

# Step 2: Drop unnecessary columns (e.g., 'FIPS', 'Admin2')
df = df.drop(columns=['FIPS', 'Admin2'], errors='ignore')

# Step 3: Check for missing values
missing_data = df.isnull().sum()  # Count of missing values in each column
print("Missing Data:")
print(missing_data)

# Handle missing values - Example: Drop rows with missing values in critical columns
# You can adjust the strategy based on the column and the context of your data.
df = df.dropna(subset=['Confirmed', 'Deaths', 'Recovered'], how='any')  # Drop rows where these columns are NaN

# Alternatively, you can fill missing values (impute) for non-critical columns, if needed:
# df['Column_Name'].fillna(df['Column_Name'].mean(), inplace=True)  # Impute with the mean (for numeric columns)

# Step 4: Remove duplicate rows
df = df.drop_duplicates()

# Step 5: Convert data types
# Convert the 'Last_Update' column to datetime format
df['Last_Update'] = pd.to_datetime(df['Last_Update'], errors='coerce')

# Step 6: Check and handle outliers (if applicable)
# For example, you can visualize distributions or remove extreme values for numerical columns like 'Confirmed', 'Deaths', etc.
# This step depends on the data and should be customized based on the dataset.

# Step 7: Standardize any categorical data (e.g., string columns)
df['Country_Region'] = df['Country_Region'].str.strip().str.title()  # Standardize country names (capitalization)

# Step 8: Save the cleaned dataset
df.to_csv(output_file_path, index=False)

print(f"Cleaned dataset saved to {output_file_path}")

# Optional: Display the first 5 rows of the cleaned dataset
print(df.head())

# Optional: Display total number of rows in the cleaned dataset
total_rows = df.shape[0]
print(f"Total number of rows in the cleaned dataset: {total_rows}")


  df = pd.read_csv(csv_file_path)


Missing Data:
Province_State          205488
Country_Region            9797
Last_Update               9797
Lat                     104980
Long_                   104980
Confirmed                   28
Deaths                     433
Recovered              2852503
Active                 2861916
Combined_Key              9797
Incident_Rate           965347
Case_Fatality_Ratio     923759
Province/State         4321456
Country/Region         4317746
Last Update            4317746
Latitude               4322051
Longitude              4322051
Incidence_Rate         3681842
Case-Fatality_Ratio    3678226
dtype: int64
Cleaned dataset saved to /content/drive/MyDrive/cleaned_dataset.csv
     Province_State Country_Region         Last_Update       Lat      Long_  \
4016            NaN    Afghanistan 2021-02-13 05:22:12  33.93911  67.709953   
4017            NaN        Albania 2021-02-13 05:22:12  41.15330  20.168300   
4018            NaN        Algeria 2021-02-13 05:22:12  28.03390   1.659600   


In [None]:
df['Lat'].fillna(df['Lat'].mean(), inplace=True)  # Fill missing latitude values with the mean
df['Long_'].fillna(df['Long_'].mean(), inplace=True)  # Fill missing longitude values with the mean


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Lat'].fillna(df['Lat'].mean(), inplace=True)  # Fill missing latitude values with the mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Long_'].fillna(df['Long_'].mean(), inplace=True)  # Fill missing longitude values with the mean
