# __Impact Analysis of Monkeypox Case Study__

___

## **Data Import**

### Import Library

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Gathering Data (Import File)

In [None]:
# Load the dataset
# Keep prompting until the user supplies a valid, chronologically ordered
# (year, month) range; these four values select which processed file is read.
while True:
    try:
        start_year = int(input("Enter the start year (example: 2022): "))
        start_month = int(input("Enter the start month (1-12): "))

        end_year = int(input("Enter the end year (example: 2024): "))
        end_month = int(input("Enter the end month (1-12): "))
    except ValueError:
        # int() rejected one of the answers; start the prompts over.
        print("Invalid input. Please enter valid year and month numbers (example: 2022 and 5 for May).")
        continue

    # Input validation
    if not (1 <= start_month <= 12 and 1 <= end_month <= 12):
        print("Month must be between 1 and 12. Please try again.")
    elif (start_year, start_month) > (end_year, end_month):
        # Tuple comparison orders by year first, then by month.
        print("The start date cannot be later than the end date. Please try again.")
    else:
        break

# Folder that holds the processed CSV files
output_folder = 'data/processed'

# The file name encodes the selected year and month range
file_name = f"monkeypox_{start_year}_{start_month}_to_{end_year}_{end_month}_processed.csv"
file_path = os.path.join(output_folder, file_name)

# Fail loudly when the file is missing: every later cell needs `df`, so
# continuing after a printed message would only surface as a NameError later.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File {file_path} not found.")

df = pd.read_csv(file_path)
print(f"Data successfully loaded from {file_path}")

In [None]:
# Preview the first rows of the loaded dataset.
# `df` is the DataFrame read from disk; `file_path` is only the path string
# and has no .head() method.
print("\nProcessed data preview:")
df.head()

In [None]:
# Inspect column names, dtypes and non-null counts of the loaded DataFrame
# (`df`, not the `file_path` string).
df.info()

## **Exploratory Data Analysis (EDA)**

In [None]:
# Classification of countries versus regions/continents
# Aggregate location names that are not individual countries
regions_or_continents = ['World', 'Asia', 'Europe', 'Africa', 'North America', 'South America', 'Oceania']  # Extend if needed

# Add a 'location_type' column on the DataFrame (`df`, not the `file_path`
# string): 'Region/Continent' for the aggregate names above, 'Country' otherwise.
is_aggregate_location = df['location'].isin(regions_or_continents)
df['location_type'] = is_aggregate_location.map({True: 'Region/Continent', False: 'Country'})

# Check that the classification worked
print("\nLocation types (Countries vs Regions/Continents):")
print(df['location_type'].value_counts())

In [None]:
# Split the DataFrame (`df`, not the `file_path` string) into country rows and
# region/continent rows for separate analysis. Explicit copies keep the subsets
# independent of later column changes on `df`.
countries_data = df[df['location_type'] == 'Country'].copy()
regions_data = df[df['location_type'] == 'Region/Continent'].copy()

# Optional: show how many distinct countries and regions were identified
print(f"\nNumber of Countries: {countries_data['location'].nunique()}")
print(f"Number of Regions/Continents: {regions_data['location'].nunique()}")

In [None]:
# Preview the DataFrame after the country / non-country classification
# (`df` holds the data; `file_path` is just the path string).
print("\nClassified dataset:")
df.head()

**Epidemiologic Trends: Investigating the Spread of Monkeypox**

Examine trends in the spread of Monkeypox, focusing on new cases, total cases, and deaths.

In [None]:
# EDA: derive a 'year' column from 'date' and aggregate the counts per year.
# Operates on the DataFrame `df` (the `file_path` variable is only a string).
df['year'] = pd.to_datetime(df['date']).dt.year

# NOTE(review): these sums run over every row of `df`, including the
# 'World' and continent rows — presumably that counts the same cases more
# than once; confirm whether only country rows should be used.
# NOTE(review): if 'total_cases' / 'total_deaths' are cumulative values,
# summing them across dates overstates the totals — verify against the
# dataset definition.

# Yearly sum of new_cases and total_cases
data_cases_yearly_sum = df.groupby('year').agg({
    'new_cases': 'sum',
    'total_cases': 'sum'
}).reset_index()

# Yearly sum of new_deaths and total_deaths
data_deaths_yearly_sum = df.groupby('year').agg({
    'new_deaths': 'sum',
    'total_deaths': 'sum'
}).reset_index()

**Regional Comparisons: Country/Region Comparisons**

Comparing countries to understand how Monkeypox affects different regions.


In [None]:
# Columns that are summed for every location
location_total_columns = ['total_cases', 'total_deaths']

# NOTE(review): if these columns are cumulative, summing them over all dates
# of a location overstates the totals — confirm against the dataset definition.

# Summed cases and deaths per country
data_grouped_by_location_countries = (
    countries_data
    .groupby('location', as_index=False)[location_total_columns]
    .sum()
)

# Summed cases and deaths per region/continent
data_grouped_by_location_regions = (
    regions_data
    .groupby('location', as_index=False)[location_total_columns]
    .sum()
)

**Demographic Trends: Impact by Country**

Analyze the impact of Monkeypox by country.

In [None]:
# Keep the ten country rows with the largest summed 'total_cases'
top_countries = data_grouped_by_location_countries.nlargest(n=10, columns='total_cases')

**Temporal Analysis: Spread Over Time**

Analyzing how the spread of Monkeypox changes over time.


In [None]:
# Make sure the 'date' column has a datetime dtype.
# Plain column assignment replaces the column together with its dtype; a
# full-column `.loc[:, 'date'] = ...` writes into the existing column and can
# leave it as object dtype, which makes the `.dt` accessor below fail.
# Operates on the DataFrame `df` (the `file_path` variable is only a string).
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Create a 'month' column in Year-Month format
df['month'] = df['date'].dt.strftime('%Y-%m')

# Convert the 'new_cases' and 'new_deaths' columns to numeric;
# values that cannot be parsed become NaN.
df['new_cases'] = pd.to_numeric(df['new_cases'], errors='coerce')
df['new_deaths'] = pd.to_numeric(df['new_deaths'], errors='coerce')

# Group by 'month' and sum the new cases and new deaths of each month.
# NOTE(review): the sums run over every row of `df`, including the 'World'
# and continent rows — confirm whether only country rows should be used.
cases_per_month = df.groupby('month').agg({
    'new_cases': 'sum',
    'new_deaths': 'sum'
}).reset_index()


**Identification of High-risk Regions: Top Locations by Case Fatality Ratio**

Identify high-risk areas based on case fatality ratios.


In [None]:
# Calculate the case fatality ratio (CFR) per country, in percent.
# Locations with zero total cases get NaN instead of inf (or 0/0), so they
# cannot float to the top of the ranking below.
reported_cases = data_grouped_by_location_countries['total_cases'].where(
    data_grouped_by_location_countries['total_cases'] > 0
)
data_grouped_by_location_countries['CFR'] = (
    data_grouped_by_location_countries['total_deaths'] / reported_cases * 100
)

# Identify the ten locations with the highest CFR
high_cfr_locations = data_grouped_by_location_countries.nlargest(10, 'CFR')

## **Data Visualization**

**Epidemiologic Trends: Investigating the Spread of Monkeypox**

In [None]:
# Yearly trend of new cases (top panel) and total cases (bottom panel)
fig, axes = plt.subplots(2, 1, figsize=(10, 6))

# Rows in ascending year order, used for the value labels on each point
yearly_cases_sorted = data_cases_yearly_sum.sort_values('year')
# One tick per year present in the data
year_ticks = sorted(data_cases_yearly_sum['year'].unique())

# (axis, column, legend label, colour, title, y-axis label) for each panel
case_panels = [
    (axes[0], 'new_cases', 'New Cases - Countries', 'blue', 'Trends in New Cases', 'Number of New Cases'),
    (axes[1], 'total_cases', 'Total Cases - Countries', 'orange', 'Trends in Total Cases', 'Number of Total Cases'),
]

for panel_ax, column, legend_label, line_color, panel_title, y_label in case_panels:
    # Trend line for this metric
    sns.lineplot(x='year', y=column, data=data_cases_yearly_sum, label=legend_label, color=line_color, ax=panel_ax, linewidth=1)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Year')
    panel_ax.set_ylabel(y_label)

    # Write the yearly total just above each data point
    for year_value, yearly_total in zip(yearly_cases_sorted['year'], yearly_cases_sorted[column]):
        panel_ax.text(year_value, yearly_total, f'{yearly_total:,.0f}', color=line_color, ha='center', va='bottom', fontsize=9)

    # Rotated year ticks on the x-axis
    panel_ax.set_xticks(year_ticks)
    panel_ax.set_xticklabels(year_ticks, rotation=45)

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Yearly trend of new deaths (top panel) and total deaths (bottom panel)
fig, axes = plt.subplots(2, 1, figsize=(10, 6))

# Rows in ascending year order, used for the value labels on each point
yearly_deaths_sorted = data_deaths_yearly_sum.sort_values('year')
# One tick per year present in the data
year_ticks = sorted(data_deaths_yearly_sum['year'].unique())

# (axis, column, legend label, colour, title, y-axis label) for each panel
death_panels = [
    (axes[0], 'new_deaths', 'New Deaths', 'blue', 'Trends in New Deaths', 'Number of New Deaths'),
    (axes[1], 'total_deaths', 'Total Deaths', 'orange', 'Trends in Total Deaths', 'Number of Total Deaths'),
]

for panel_ax, column, legend_label, line_color, panel_title, y_label in death_panels:
    # Trend line for this metric
    sns.lineplot(x='year', y=column, data=data_deaths_yearly_sum, label=legend_label, color=line_color, ax=panel_ax, linewidth=1)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Year')
    panel_ax.set_ylabel(y_label)

    # Write the yearly total just above each data point
    for year_value, yearly_total in zip(yearly_deaths_sorted['year'], yearly_deaths_sorted[column]):
        panel_ax.text(year_value, yearly_total, f'{yearly_total:,.0f}', color=line_color, ha='center', va='bottom', fontsize=9)

    # Rotated year ticks on the x-axis
    panel_ax.set_xticks(year_ticks)
    panel_ax.set_xticklabels(year_ticks, rotation=45)

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()

**Regional Comparisons: Country/Region Comparisons**

In [None]:
# Sort the per-country totals by total_cases in descending order.
# sort_values keeps every column, so 'total_deaths' is already present and
# does not need to be copied over again from the unsorted frame.
data_sorted = data_grouped_by_location_countries.sort_values('total_cases', ascending=False)

# Display the location, total_cases and total_deaths columns
pd.set_option('display.max_rows', None)  # Optional, if you want to display all rows
data_sorted[['location', 'total_cases', 'total_deaths']]

In [None]:
# Sort the per-region totals by total_cases in descending order.
# sort_values keeps every column, so 'total_deaths' is already present and
# does not need to be copied over again from the unsorted frame.
data_sorted_by_region_cases = data_grouped_by_location_regions.sort_values('total_cases', ascending=False)

# Display the location, total_cases and total_deaths columns by region
pd.set_option('display.max_rows', None)  # Optional, if you want to display all rows
data_sorted_by_region_cases[['location', 'total_cases', 'total_deaths']]


**Demographic Trends: Impact by Country**

In [None]:
# Horizontal bar chart of the countries held in `top_countries`
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='total_cases', y='location', data=top_countries, ax=ax)
ax.set_title('Top 10 Countries with Highest Total Cases (Countries)')
ax.set_xlabel('Total Cases')
ax.set_ylabel('Country')
fig.tight_layout()
plt.show()

**Temporal Analysis: Spread Over Time**

In [None]:
# Visualization: monthly development of new cases (top) and new deaths (bottom)
fig, axes = plt.subplots(2, 1, figsize=(10, 12))

# (axis, column, legend label, colour, title, y-axis label) for each panel
monthly_panels = [
    (axes[0], 'new_cases', 'New Cases', 'blue', 'Development of New Cases per Month', 'Number of New Cases'),
    (axes[1], 'new_deaths', 'New Deaths', 'red', 'Development of New Deaths per Month', 'Number of New Deaths'),
]

for panel_ax, column, legend_label, line_color, panel_title, y_label in monthly_panels:
    # Trend line for this metric
    sns.lineplot(x='month', y=column, data=cases_per_month, label=legend_label, ax=panel_ax, color=line_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Month')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)

    # Annotate each data point. The value is formatted as a whole number with
    # thousands separators (same format as the yearly plots) instead of passing
    # the raw numeric, which would render like "1234.0".
    for month_label, monthly_total in zip(cases_per_month['month'], cases_per_month[column]):
        panel_ax.annotate(
            f'{monthly_total:,.0f}',
            (month_label, monthly_total),
            textcoords="offset points", xytext=(0, 5), ha='center', fontsize=9, color=line_color
        )

# Adjust the layout for better spacing
plt.tight_layout()

# Show the plots
plt.show()

**Identification of High-risk Regions: Top Locations by Case Fatality Ratio**

In [None]:
# Horizontal bar chart of the locations held in `high_cfr_locations`
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='CFR', y='location', data=high_cfr_locations, palette='rocket', ax=ax)
ax.set_title('Top Countries by Case Fatality Ratio (CFR)')
ax.set_xlabel('CFR (%)')
ax.set_ylabel('Location')
plt.show()