In [5]:
# COVID-19 Surveillance Data Analysis

# Task 1: Load and Explore the Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plots for better style
plt.style.use("seaborn-v0_8")
sns.set_palette("Set2")

# Load the dataset with error handling
try:
    df = pd.read_csv("Surveillance.csv")
    print("Dataset loaded successfully!\n")
except FileNotFoundError:
    print("Error: Surveillance.csv file not found. Please check the file path.")

# Display first few rows
print("First 5 rows of the dataset:")
display(df.head())

# Check dataset info
print("\nDataset Info:")
print(df.info())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Clean the dataset (if missing values exist)
df = df.dropna()  # here, drop rows with missing values
print("\nAfter cleaning, dataset shape:", df.shape)


# Task 2: Basic Data Analysis

# Compute basic statistics of numerical columns
print("\nBasic Statistics of Numerical Columns:")
display(df.describe())

# Perform grouping on a categorical column
if "Region" in df.columns and "Cases" in df.columns:
    grouped = df.groupby("Region")["Cases"].mean().reset_index()
    print("\nAverage cases per Region:")
    display(grouped)

# Identify interesting patterns (example printed as insight)
if "Cases" in df.columns:
    print("\nInsight: Maximum cases recorded:", df["Cases"].max(), 
          " | Minimum cases recorded:", df["Cases"].min())


# Task 3: Data Visualization

# 1. Line chart (trend over time if 'Date' column exists)
if "Date" in df.columns and "Cases" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df_sorted = df.sort_values("Date")
    
    plt.figure(figsize=(10,5))
    plt.plot(df_sorted["Date"], df_sorted["Cases"], marker="o", linestyle="-")
    plt.title("COVID-19 Cases Over Time")
    plt.xlabel("Date")
    plt.ylabel("Number of Cases")
    plt.grid(True)
    plt.show()

# 2. Bar chart (comparison of averages per category)
if "Region" in df.columns and "Cases" in df.columns:
    plt.figure(figsize=(8,5))
    sns.barplot(x="Region", y="Cases", data=df, estimator=np.mean, ci=None)
    plt.title("Average COVID-19 Cases by Region")
    plt.xlabel("Region")
    plt.ylabel("Average Cases")
    plt.show()

# 3. Histogram of a numerical column
if "Cases" in df.columns:
    plt.figure(figsize=(8,5))
    plt.hist(df["Cases"], bins=10, color="skyblue", edgecolor="black")
    plt.title("Distribution of COVID-19 Cases")
    plt.xlabel("Number of Cases")
    plt.ylabel("Frequency")
    plt.show()

# 4. Scatter plot between two numerical columns
if "Cases" in df.columns and "Deaths" in df.columns:
    plt.figure(figsize=(8,5))
    sns.scatterplot(x="Cases", y="Deaths", data=df, hue="Region")
    plt.title("Scatter Plot of Cases vs Deaths")
    plt.xlabel("Cases")
    plt.ylabel("Deaths")
    plt.legend(title="Region")
    plt.show()

print("\nAnalysis complete! ✅")


Dataset loaded successfully!

First 5 rows of the dataset:


Unnamed: 0,A01,A02,A03,A04,A05,A06,A07,Categories
0,+,+,+,+,+,-,-,PUS
1,+,+,-,+,+,-,-,PUS
2,+,+,+,+,-,+,-,PUS
3,+,+,-,+,-,+,-,PUS
4,+,-,-,-,-,-,+,PUS



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   A01         14 non-null     object
 1   A02         14 non-null     object
 2   A03         14 non-null     object
 3   A04         14 non-null     object
 4   A05         14 non-null     object
 5   A06         14 non-null     object
 6   A07         14 non-null     object
 7   Categories  14 non-null     object
dtypes: object(8)
memory usage: 1.0+ KB
None

Missing values per column:
A01           0
A02           0
A03           0
A04           0
A05           0
A06           0
A07           0
Categories    0
dtype: int64

After cleaning, dataset shape: (14, 8)

Basic Statistics of Numerical Columns:


Unnamed: 0,A01,A02,A03,A04,A05,A06,A07,Categories
count,14,14,14,14,14,14,14,14
unique,2,2,2,2,2,2,2,3
top,+,+,-,+,-,-,-,PUS
freq,10,10,10,9,10,10,9,8



Analysis complete! ✅
