In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import missingno as mnso 

In [2]:
# Read every CSV of the data set into its own DataFrame. File names are
# relative, so they are resolved against the current working directory.
# The order of the names on the left matches the order of the files on the right.
(
    Patient,
    Policy,
    Region,
    SearchTrend,
    Floating,
    Time,
    TimeAge,
    TimeGender,
    TimePro,
    Wheather,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'PatientInfo.csv',
        'Policy.csv',
        'Region.csv',
        'SearchTrend.csv',
        'SeoulFloating.csv',
        'Time.csv',
        'TimeAge.csv',
        'TimeGender.csv',
        'TimeProvince.csv',
        'Weather.csv',
    )
)
  



In [3]:
def overview(datasets):
    """Print a plain-text summary of one DataFrame to stdout.

    Despite the plural name, ``datasets`` is treated as a single
    DataFrame: the body calls ``head``, ``columns``, ``isnull``,
    ``duplicated`` and ``describe`` directly on it.

    Sections are printed in this order, each under its own heading:
    first five rows, column/row counts, per-column null counts, number
    of duplicated rows, and the ``describe()`` summary.

    Returns:
        None. The function only prints.
    """

    def heading(title, width=10):
        # A title framed by `width` '=' characters on each side.
        rule = "=" * width
        return f"{rule} {title} {rule}"

    print(heading("Basic Overview of Dataset", width=20))

    # Preview of the data.
    print()
    print(heading("First Five Rows"))
    print(datasets.head(5))

    # Size of the frame.
    print()
    print(heading("Dataset Shape"))
    n_cols, n_rows = len(datasets.columns), len(datasets)
    print(f"Total columns: {n_cols}")
    print(f"Total rows: {n_rows}")

    # Missing values, counted per column.
    print()
    print(heading("Null Values in Each Column"))
    print(datasets.isnull().sum())

    # Rows that repeat an earlier row exactly.
    print()
    print(heading("Duplicate Rows"))
    print(f"Total duplicate rows: {datasets.duplicated().sum()}")

    # Descriptive statistics as produced by DataFrame.describe().
    print()
    print(heading("Statistical Summary"))
    print(datasets.describe())


In [4]:
# Print the summary (head, shape, nulls, duplicates, describe) for the patient table.
overview(Patient)


   patient_id     sex  age country province         city  \
0  1000000001    male  50s   Korea    Seoul   Gangseo-gu   
1  1000000002    male  30s   Korea    Seoul  Jungnang-gu   
2  1000000003    male  50s   Korea    Seoul    Jongno-gu   
3  1000000004    male  20s   Korea    Seoul      Mapo-gu   
4  1000000005  female  20s   Korea    Seoul  Seongbuk-gu   

         infection_case infected_by contact_number symptom_onset_date  \
0       overseas inflow         NaN             75         2020-01-22   
1       overseas inflow         NaN             31                NaN   
2  contact with patient  2002000001             17                NaN   
3       overseas inflow         NaN              9         2020-01-26   
4  contact with patient  1000000002              2                NaN   

  confirmed_date released_date deceased_date     state  
0     2020-01-23    2020-02-05           NaN  released  
1     2020-01-30    2020-03-02           NaN  released  
2     2020-01-30    2020-02-

In [5]:
# Display labels and the frames they describe, in matching order.
dataset_names = ["Patient", "Policy", "Region", "SearchTrend", "Floating", "Time", "TimeAge", "TimeGender", "TimePro", "Wheather"]
datasets = [Patient, Policy, Region, SearchTrend, Floating, Time, TimeAge, TimeGender, TimePro, Wheather]

# Columns whose share of nulls is strictly above this percentage are dropped
# instead of being imputed.
NULL_DROP_THRESHOLD_PCT = 30.0


def clean_missing_values(name, df, drop_threshold_pct=NULL_DROP_THRESHOLD_PCT):
    """Report, drop and impute missing values of one DataFrame, in place.

    Steps, in order:
      1. Print every column that contains nulls with its null count and
         percentage (rounded for display only).
      2. Drop the columns whose exact null percentage is greater than
         ``drop_threshold_pct``.
      3. Fill the nulls of the remaining columns: most frequent value for
         object/string columns, median for every other dtype.

    Args:
        name: Label used in the printed report.
        df: Frame to clean. It is modified in place, so every other
            reference to the same object sees the cleaned frame.
        drop_threshold_pct: Null percentage above which a column is dropped.

    Returns:
        A copy of the dropped columns as they were before removal (a frame
        with no columns when nothing was dropped).
    """
    # Compute the null statistics once and reuse them for all three steps.
    null_counts = df.isnull().sum()
    null_pct = (null_counts / len(df)) * 100

    with_nulls = null_counts[null_counts > 0]
    if not with_nulls.empty:
        print(f"\n{name} has Null values:")
        for col, count in with_nulls.items():
            shown_pct = null_pct[col].round(0)
            print(f"- {col}: {count} nulls ({shown_pct}%)")
    else:
        print(f"\n{name} has no null values.")

    print("\n" + "=" * 90 + "\n")

    # Compare the unrounded percentage: rounding first would keep a column
    # with e.g. 30.4% nulls even though it is above the threshold.
    high_null_cols = [col for col in df.columns if null_pct[col] > drop_threshold_pct]
    columns_dropped = df[high_null_cols].copy()

    # In-place on purpose: the caller's frame object is what gets cleaned.
    df.drop(columns=high_null_cols, inplace=True)
    for col in high_null_cols:
        print(f"Column '{col}' deleted successfully.")

    for col in df.columns:
        if null_counts[col] == 0:
            continue
        column = df[col]
        # Text columns have no median. The check also covers pandas' dedicated
        # string dtypes, which do not compare equal to 'object'.
        is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
        fill_value = column.mode().iloc[0] if is_text else column.median()
        df[col] = column.fillna(fill_value)
        print(f"Successfully replaced null values in '{col}'.")

    return columns_dropped


# Dropped columns of every dataset, keyed by its label, so the removed data
# stays available after the in-place cleaning.
dropped_columns_by_dataset = {}

for name, df in zip(dataset_names, datasets):
    # `columns_dropped` ends up holding the last dataset's dropped columns.
    columns_dropped = clean_missing_values(name, df)
    dropped_columns_by_dataset[name] = columns_dropped




Patient has Null values:
- sex: 1122 nulls (22.0%)
- age: 1380 nulls (27.0%)
- city: 94 nulls (2.0%)
- infection_case: 919 nulls (18.0%)
- infected_by: 3819 nulls (74.0%)
- contact_number: 4374 nulls (85.0%)
- symptom_onset_date: 4475 nulls (87.0%)
- confirmed_date: 3 nulls (0.0%)
- released_date: 3578 nulls (69.0%)
- deceased_date: 5099 nulls (99.0%)


Column 'infected_by' deleted successfully.
Column 'contact_number' deleted successfully.
Column 'symptom_onset_date' deleted successfully.
Column 'released_date' deleted successfully.
Column 'deceased_date' deleted successfully.
Successfully replaced null values in 'sex'.
Successfully replaced null values in 'age'.
Successfully replaced null values in 'city'.
Successfully replaced null values in 'infection_case'.
Successfully replaced null values in 'confirmed_date'.

Policy has Null values:
- detail: 2 nulls (3.0%)
- end_date: 37 nulls (61.0%)


Column 'end_date' deleted successfully.
Successfully replaced null values in 'detail'.

Re

In [6]:
# Summarise Policy after the null-handling loop has modified it in place:
# its high-null column has been dropped and the remaining nulls filled.
overview(Policy)


   policy_id country         type                      gov_policy  \
0          1   Korea        Alert  Infectious Disease Alert Level   
1          2   Korea        Alert  Infectious Disease Alert Level   
2          3   Korea        Alert  Infectious Disease Alert Level   
3          4   Korea        Alert  Infectious Disease Alert Level   
4          5   Korea  Immigration   Special Immigration Procedure   

             detail  start_date  
0    Level 1 (Blue)  2020-01-03  
1  Level 2 (Yellow)  2020-01-20  
2  Level 3 (Orange)  2020-01-28  
3     Level 4 (Red)  2020-02-23  
4        from China  2020-02-04  

Total columns: 6
Total rows: 61

policy_id     0
country       0
type          0
gov_policy    0
detail        0
start_date    0
dtype: int64

Total duplicate rows: 0

       policy_id
count  61.000000
mean   31.000000
std    17.752934
min     1.000000
25%    16.000000
50%    31.000000
75%    46.000000
max    61.000000
