# __Impact Analysis of Monkeypox Case Study__

___

## **Check Out README for Business Understanding & Data Understanding**

## **Data Preparation**

### Import Library

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Gathering Data (Import File)

In [None]:
# Load the dataset
#
# Prompts for a start/end year-month range, builds the matching file name
# under data/raw/filtered and loads it into `df`.
# Defines: start_year, start_month, end_year, end_month, output_folder,
# file_name, file_path, df.
# Raises: FileNotFoundError when the expected CSV does not exist, so later
# cells never run against a missing (or stale, from a previous run) `df`.
while True:
    # Only the int() conversions can raise ValueError, so keep the try narrow
    try:
        start_year = int(input("Enter the start year (example: 2022): "))
        start_month = int(input("Enter the start month (1-12): "))

        end_year = int(input("Enter the end year (example: 2024): "))
        end_month = int(input("Enter the end month (1-12): "))
    except ValueError:
        print("Invalid input. Please enter valid year and month numbers (example: 2022 and 5 for May).")
        continue

    # Input Validation
    if not (1 <= start_month <= 12 and 1 <= end_month <= 12):
        print("Month must be between 1 and 12. Please try again.")
    elif (start_year, start_month) > (end_year, end_month):
        # Tuple comparison orders by year first, then by month
        print("The start date cannot be later than the end date. Please try again.")
    else:
        break

# Construct the file name based on the input
output_folder = 'data/raw/filtered'

# Format the file name according to the selected year and month range
file_name = f"monkeypox_{start_year}_{start_month}_to_{end_year}_{end_month}_filtered.csv"
file_path = os.path.join(output_folder, file_name)

# Fail loudly if the file is missing instead of printing and carrying on
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File {file_path} not found.")

df = pd.read_csv(file_path)
print(f"Data successfully loaded from {file_path}")

### Check Data

In [None]:
# Number of rows in the loaded dataset (first element of the shape tuple)
jumlah_data = df.shape[0]
print("Total data:", jumlah_data)

In [None]:
# Preview the top of the dataset; the trailing expression is rendered by the notebook
print("First 5 rows of the dataset:")
df.head(n=5)

### Assessing Data

In [None]:
# How many rows are exact duplicates of an earlier row
duplicate_count = df.duplicated().sum()
print("Number of duplications: ", duplicate_count)
print("\n")

# Per-column count of null values, printed one column per line
print("Null Data:")
null_counts = df.isna().sum()
for column_name, null_count in zip(null_counts.index, null_counts):
    print(f"{column_name}: {null_count}")

In [None]:
# Dataset dimensions as a (rows, columns) tuple, rendered by the notebook
print("\nShape of the dataset:")
row_count, column_count = df.shape
(row_count, column_count)

In [None]:
# Print a summary of the raw dataset: column names, non-null counts and dtypes.
# df.info() writes its report itself, so it is called without print().
print("\nInfo of the dataset:")
df.info()

In [None]:
# Tally missing values for every column and print the resulting Series
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

### Cleaning Data

#### Invalid Date

In [None]:
# Parse the 'date' column; values that cannot be parsed become NaT (errors='coerce')
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df['date'] = parsed_dates

# Collect the rows whose date is missing after conversion.
# NOTE: this cell only reports those rows; it does not remove them from df.
invalid_dates = df.loc[df['date'].isna()]
print("\nInvalid date entries (rows with missing dates after conversion):")
print(invalid_dates)

#### Missing Values

In [None]:
# Addressing missing values
# Any row lacking a value in one of the case/death count columns is discarded
required_columns = ['new_cases', 'new_deaths', 'total_cases', 'total_deaths']
data_cleaned = df.dropna(subset=required_columns)

# Report what is still missing per column; only the columns listed above are
# guaranteed to be complete at this point
remaining_missing = data_cleaned.isna().sum()
print("\nMissing values after cleaning:")
print(remaining_missing)

#### Duplicates

In [None]:
# Flag rows that repeat an earlier row, then count them
duplicate_mask = data_cleaned.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Keep only the first occurrence of each row when any repeats were found
if duplicates > 0:
    data_cleaned = data_cleaned[~duplicate_mask]

# Report the resulting dimensions
cleaned_shape = data_cleaned.shape
print(f"\nShape of the dataset after cleaning: {cleaned_shape}")

#### Anomalous or Out-of-Range Values

In [None]:
# Daily counts should never be negative: list any rows where they are
new_cases_col = data_cleaned['new_cases']
new_deaths_col = data_cleaned['new_deaths']

has_negative = (new_cases_col < 0) | (new_deaths_col < 0)
negative_cases = data_cleaned[has_negative]
print("\nRows with negative case values (if any):")
print(negative_cases)

# Retain only rows where both daily counts are zero or positive
is_non_negative = (new_cases_col >= 0) & (new_deaths_col >= 0)
data_cleaned = data_cleaned[is_non_negative]

#### Outliers

In [None]:
# Flag potential outliers in new_cases using the 1.5 * IQR rule
# (only new_cases is examined here; the rows are displayed, not removed)
new_cases_values = data_cleaned['new_cases']
Q1 = new_cases_values.quantile(0.25)
Q3 = new_cases_values.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = (new_cases_values < lower_fence) | (new_cases_values > upper_fence)

outliers = data_cleaned[outside_fences]
print("\nPotential outliers based on new_cases:")
outliers.head()

#### Cleaned

In [None]:
# Preview the first five rows of the cleaned dataset
print("\nCleaned data preview:")
data_cleaned.head(n=5)

In [None]:
# Print a summary of the cleaned dataset: column names, non-null counts and dtypes
data_cleaned.info()

## **Export to File**

In [None]:
# Ask for a start/end year-month range, repeating until the input is valid.
# NOTE(review): this prompts again, independently of any range entered earlier;
# start_year/start_month/end_year/end_month take whatever is typed here.
while True:
    try:
        start_year = int(input("Enter the start year (example: 2022): "))
        start_month = int(input("Enter the start month (1-12): "))

        end_year = int(input("Enter the end year (example: 2024): "))
        end_month = int(input("Enter the end month (1-12): "))
    except ValueError:
        # Non-numeric input: explain and ask again
        print("Invalid input. Please enter valid year and month numbers (example: 2022 and 5 for May).")
        continue

    # Input validation
    months_in_range = 1 <= start_month <= 12 and 1 <= end_month <= 12
    if not months_in_range:
        print("Month must be between 1 and 12. Please try again.")
        continue

    # Compare (year, month) pairs so the year is checked first, then the month
    if (start_year, start_month) > (end_year, end_month):
        print("The start date cannot be later than the end date. Please try again.")
        continue

    break

In [None]:
# Destination folder for processed output; create it if it is not there yet
output_folder = 'data/data_processed'
os.makedirs(output_folder, exist_ok=True)

# File name encodes the selected year and month range
processed_file_name = f'monkeypox_{start_year}_{start_month}_to_{end_year}_{end_month}_processed.csv'
output_file_path = os.path.join(output_folder, processed_file_name)

# Write the cleaned data without the index column
data_cleaned.to_csv(output_file_path, index=False)

print(f"The file has been saved to: {output_file_path}")