<a href="https://colab.research.google.com/github/UOH-Group3-Project/Hackathon_group3/blob/main/Group3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. SETUP & DATA LOADING

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
# LightGBM is treated as optional: try to import it and record the outcome in
# LIGHTGBM_AVAILABLE instead of aborting the whole setup cell.
try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except (ImportError, OSError):
    # ImportError -> the package is not installed.
    # OSError     -> the package is present but a shared library it needs
    #                could not be loaded.
    # Everything else (KeyboardInterrupt, SystemExit, genuine bugs) is allowed
    # to propagate; a bare `except:` would have hidden those.
    LIGHTGBM_AVAILABLE = False
    print("Note: LightGBM not available, will use alternative models")

# Evaluation Metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             confusion_matrix, roc_curve)

# Explainability
import shap

# One seed for the notebook; it is also used to seed NumPy's global RNG here.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Figure appearance: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Status lines: setup finished, and which seed is in effect.
print(
    "✓ All libraries imported successfully!",
    f"Random State: {RANDOM_STATE}",
    sep="\n",
)

✓ All libraries imported successfully!
Random State: 42


## Load Datasets

In [20]:
# Load the three competition CSVs (training set, test set, example submission).
# Every file lives in the same Google Drive folder, so the location is defined
# once; change DATA_DIR to point the notebook at a different copy of the data.
from pathlib import Path

DATA_DIR = Path('/content/drive/MyDrive')

# The folder only exists once Google Drive has been mounted. Fail early with an
# actionable message (same exception type pd.read_csv would raise) rather than
# an opaque "No such file" error from the first read.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(
        f"Data directory '{DATA_DIR}' not found. Mount Google Drive "
        "(drive.mount('/content/drive')) before running this cell."
    )

# training data (includes the target column)
train_df = pd.read_csv(DATA_DIR / 'dataset_A_training.csv')
print(f"\n✓ Training data loaded: {train_df.shape}")

# testing data
test_df = pd.read_csv(DATA_DIR / 'dataset_A_testing.csv')
print(f"✓ Testing data loaded: {test_df.shape}")

# sample submission format
example_submission = pd.read_csv(DATA_DIR / 'dataset_A_example_submission.csv')
print(f"✓ Example submission loaded: {example_submission.shape}")


✓ Training data loaded: (4756, 31)
✓ Testing data loaded: (4749, 30)
✓ Example submission loaded: (4749, 2)


In [19]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): the data-loading cell placed earlier in this notebook reads its
# CSVs from /content/drive/MyDrive, so this cell must be executed before it.
# As positioned, a top-to-bottom "Run all" on a fresh runtime reaches the
# loader first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Initial Data Exploration

In [22]:
# ---- Training set: preview, schema, summary statistics, missingness ----
print("TRAINING DATA OVERVIEW")

print("\nFirst 5 rows:")
display(train_df.head())

print("\nDataset Information:")
train_df.info()

print("\nBasic Statistics:")
display(train_df.describe())

print("\nMissing Values:")
# Per-column null count and its share of all rows, as a percentage (2 d.p.).
missing_counts = train_df.isna().sum()
missing_percent = missing_counts.div(len(train_df)).mul(100).round(2)

# Keep only columns that actually have gaps, worst offenders first.
has_missing = missing_counts.gt(0)
missing_df = (
    pd.DataFrame({'Missing Count': missing_counts, 'Percentage': missing_percent})
    .loc[has_missing]
    .sort_values('Percentage', ascending=False)
)
display(missing_df)

# ---- Test set: shape and column names only ----
print("TESTING DATA OVERVIEW")
print(f"\nShape: {test_df.shape}")
print(f"Columns: {test_df.columns.tolist()}")

# Check whether the target column is present in the test frame.
target_in_test = 'seasonal_vaccine' in test_df.columns
print(
    "⚠ Warning: Target variable found in test set!"
    if target_in_test
    else "✓ Target variable correctly absent from test set"
)

TRAINING DATA OVERVIEW

First 5 rows:


Unnamed: 0,respondent_id,flu_concern,flu_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,employment_sector,seasonal_vaccine
0,1,2.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,Non-MSA,0.0,0.0,entertainment,0
1,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Male,,Not Married,Rent,Employed,"MSA, Principle City",1.0,0.0,real_estate,0
2,3,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,...,Male,"> $75,000",Married,Own,Not in Labor Force,"MSA, Not Principle City",1.0,0.0,,0
3,4,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,Male,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",1.0,2.0,utilities,1
4,5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,Female,Below Poverty,Not Married,Rent,Not in Labor Force,"MSA, Not Principle City",2.0,0.0,,0



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4756 entries, 0 to 4755
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                4756 non-null   int64  
 1   flu_concern                  4744 non-null   float64
 2   flu_knowledge                4733 non-null   float64
 3   behavioral_antiviral_meds    4742 non-null   float64
 4   behavioral_avoidance         4724 non-null   float64
 5   behavioral_face_mask         4753 non-null   float64
 6   behavioral_wash_hands        4748 non-null   float64
 7   behavioral_large_gatherings  4747 non-null   float64
 8   behavioral_outside_home      4747 non-null   float64
 9   behavioral_touch_face        4737 non-null   float64
 10  doctor_recc_seasonal         4435 non-null   float64
 11  chronic_med_condition        4568 non-null   float64
 12  child_under_6_months         4604 non-null   float64
 

Unnamed: 0,respondent_id,flu_concern,flu_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children,seasonal_vaccine
count,4756.0,4744.0,4733.0,4742.0,4724.0,4753.0,4748.0,4747.0,4747.0,4737.0,...,4568.0,4604.0,4607.0,2831.0,4662.0,4654.0,4655.0,4712.0,4712.0,4756.0
mean,2378.5,1.643128,1.293049,0.052299,0.734124,0.073848,0.836563,0.365494,0.336634,0.68841,...,0.302102,0.091442,0.136965,0.889085,4.095024,2.87688,2.12739,0.895586,0.541596,0.536375
std,1373.08327,0.91981,0.618902,0.222652,0.441846,0.261551,0.369803,0.481619,0.472608,0.463192,...,0.459219,0.288268,0.343848,0.314082,1.076051,1.409834,1.337812,0.759658,0.93697,0.498728
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,1189.75,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,4.0,2.0,1.0,0.0,0.0,0.0
50%,2378.5,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,4.0,2.0,2.0,1.0,0.0,1.0
75%,3567.25,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,5.0,4.0,4.0,1.0,1.0,1.0
max,4756.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.0,5.0,5.0,3.0,3.0,1.0



Missing Values:


Unnamed: 0,Missing Count,Percentage
employment_sector,2314,48.65
health_insurance,1925,40.48
income_poverty,790,16.61
rent_or_own,381,8.01
doctor_recc_seasonal,321,6.75
education,270,5.68
employment_status,269,5.66
marital_status,265,5.57
chronic_med_condition,188,3.95
child_under_6_months,152,3.2


TESTING DATA OVERVIEW

Shape: (4749, 30)
Columns: ['respondent_id', 'flu_concern', 'flu_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'census_msa', 'household_adults', 'household_children', 'employment_sector']
✓ Target variable correctly absent from test set
