<a href="https://colab.research.google.com/github/Subhasree456/Week1-StudentPerformance-EDA/blob/main/COVID19-EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================
# COVID-19 EDA
# =========================================

# 1. Install Kaggle
!pip install -q kaggle

# 2. Upload kaggle.json
from google.colab import files
files.upload()

# 3. Set Kaggle permissions
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 4. Download COVID-19 dataset from Kaggle
!kaggle datasets download -d imdevskp/corona-virus-report

# 5. Unzip dataset
!unzip -q corona-virus-report.zip

# 6. Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 7. Load dataset
df = pd.read_csv("covid_19_clean_complete.csv")

# 8. Select required columns
# Exact duplicate records are removed first, while every source column is
# still present. De-duplicating after the projection would also discard rows
# that only differ in a dropped column (e.g. two regions of one country
# reporting identical counts on the same date), which would make the
# per-date and per-country sums computed later too small.
df = df.drop_duplicates()
df = df[['Date', 'Country/Region', 'Confirmed', 'Recovered', 'Deaths']].rename(
    columns={'Country/Region': 'Country'}
)

# 9. Data Cleaning
df['Date'] = pd.to_datetime(df['Date'])
# Zero is only a meaningful placeholder for the numeric count columns, so
# missing values are filled there and Date/Country are left untouched.
count_columns = ['Confirmed', 'Recovered', 'Deaths']
df[count_columns] = df[count_columns].fillna(0)

print(" Data Loaded & Cleaned\n")

# 10. EDA
# Quick look at the cleaned frame: sample rows, numeric summary, and how many
# distinct countries are present.
print(" First 5 rows:\n", df.head(), "\n")
print(" Summary Statistics:\n", df.describe(), "\n")
print(" Number of countries:", df['Country'].nunique(), "\n")

# Top affected countries
# A country can appear on several rows for the same date, so the rows are
# first summed into one national figure per (Country, Date). Taking max()
# directly over the raw rows would rank such a country by its single largest
# row instead of its national total.
country_daily_confirmed = df.groupby(['Country', 'Date'])['Confirmed'].sum()
top_confirmed = (
    country_daily_confirmed
    .groupby(level='Country')
    .max()
    .sort_values(ascending=False)
    .head(10)
)
print(" Top 10 Most Affected Countries:\n", top_confirmed, "\n")

# 11. Visualizations
# Use seaborn's white grid theme for every figure drawn from here on.
sns.set_style("whitegrid")

# Line chart – confirmed cases
# One point per date: the confirmed count summed over all rows for that date.
global_cases = df.groupby('Date')['Confirmed'].sum()

plt.figure(figsize=(10, 5))
plt.plot(global_cases.index, global_cases.values)
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
plt.title("Global Confirmed COVID-19 Cases Over Time")
plt.show()

# Bar chart – top deaths
# A country can appear on several rows for the same date, so deaths are
# summed to one national figure per (Country, Date) before taking each
# country's peak. max() over the raw rows would only reflect the single
# largest row of such a country.
country_daily_deaths = df.groupby(['Country', 'Date'])['Deaths'].sum()
top_deaths = (
    country_daily_deaths
    .groupby(level='Country')
    .max()
    .sort_values(ascending=False)
    .head(10)
)
plt.figure(figsize=(10,5))
top_deaths.plot(kind='bar')
plt.title("Top 10 Countries with Highest Deaths")
plt.xlabel("Country")
plt.ylabel("Deaths")
plt.show()

# Pie chart – recovered vs deaths
# The frame holds one row per date, and the counts are treated elsewhere in
# this notebook as running totals (per-country max = peak). Summing a running
# total over every date counts each day's figure again, so the pie uses the
# global totals of the most recent date only.
# NOTE(review): confirm against the dataset description that the counts are
# cumulative rather than daily increments.
# groupby sorts its keys, so the last row is the latest date.
latest_totals = df.groupby('Date')[['Recovered', 'Deaths']].sum().iloc[-1]
plt.figure(figsize=(6,6))
plt.pie(
    [latest_totals['Recovered'], latest_totals['Deaths']],
    labels=['Recovered', 'Deaths'],
    autopct='%1.1f%%',
    startangle=140
)
plt.title("Global Recovered vs Deaths")
plt.show()

# 12. Observations
# Fixed summary text printed at the end of the notebook: a heading followed
# by one line per observation, in order.
observation_lines = (
    " Observations:",
    "1. COVID-19 cases increased rapidly over time.",
    "2. Few countries contribute to majority of deaths.",
    "3. Recovery rate is significantly higher than death rate.",
    "4. EDA helps understand global pandemic impact.",
)
for line in observation_lines:
    print(line)