# COVID-19 Dataset – Exploratory Data Analysis (EDA)

Focus:
- Feature distributions
- Missing (null) values

**Dataset path:** `covid19/.csv`

## 1. Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw CSV into a DataFrame, report its (rows, columns) shape and
# preview the first five records.
real_path = 'covid19/.csv'
df = pd.read_csv(real_path)

print(f'Dataset shape: {df.shape}')
df.head(5)

## 2. Data Types

In [None]:
# Print pandas' concise summary of the frame: one line per column with its
# dtype and non-null count, plus the overall memory usage.
df.info()

## 3. Missing Values

In [None]:
# Summarise missing data per column: absolute count and percentage of rows.
# The null mask is built once and reused for both statistics.
null_mask = df.isna()

# Columns ordered from most to fewest missing values.
missing_abs = null_mask.sum().sort_values(ascending=False)

# Reuse the count ordering instead of sorting a second time. The percentage is
# the count divided by the row count, so it is still in descending order, and
# both Series are guaranteed to share one index order. With identical indexes
# the DataFrame below keeps that order rather than re-aligning the rows.
missing_pct = (null_mask.mean() * 100).reindex(missing_abs.index)

missing_df = pd.DataFrame({
    'missing_count': missing_abs,
    'missing_percent': missing_pct
})

# Show (at most) the 20 columns with the most missing values.
missing_df[missing_df.missing_count > 0].head(20)

## 4. Missing Values Plot

In [None]:
# Bar chart of the percentage of missing values, restricted to the columns
# that actually have gaps. `missing_pct` is produced by the missing-values
# cell (percentage of null rows per column).
cols_with_gaps = missing_pct[missing_pct > 0]

if cols_with_gaps.empty:
    # A fully populated dataset leaves nothing to draw, and a bar plot of an
    # empty Series raises, so report the result instead of failing.
    print('No missing values found - nothing to plot.')
else:
    plt.figure(figsize=(10,5))
    cols_with_gaps.plot(kind='bar')
    plt.ylabel('Missing (%)')
    plt.title('Missing Values per Column')
    plt.tight_layout()
    plt.show()

## 5. Numeric Distributions

In [None]:
# One 30-bin histogram per numeric column, each on its own figure.
# Null entries are dropped before binning.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

for col in num_cols:
    ax = plt.figure().gca()
    observed = df[col].dropna()
    observed.hist(bins=30, ax=ax)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()

## 6. Categorical Distributions

In [None]:
# One bar chart per non-numeric column showing its 20 most frequent values.
# NaN is counted as a value of its own (dropna=False).
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

for col in cat_cols:
    ax = plt.figure(figsize=(6,4)).gca()
    top_counts = df[col].value_counts(dropna=False).head(20)
    top_counts.plot(kind='bar', ax=ax)
    ax.set_title(col)
    ax.set_ylabel('Count')
    # Slant the category labels so long names do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 7. Notes
- Columns with high missingness may need imputation or removal
- Highly skewed distributions may require transformation before GAN training