In [16]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix

# 1. Load data
df = pd.read_csv("C:\\Users\\taduc\\Downloads\\SAP-CONUHACKS-IX\\Data\\Preprocessed_2\\training_df.csv")
# Map severity: convert 2 and 3 to 1
df['severity'] = df['severity'].apply(lambda x: 1 if x in [2, 3] else x)
# Convert object columns to categorical codes
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype('category').cat.codes

X = df.drop(columns=["severity"]).values
y = df["severity"].values

# Separate minority (class 1) and majority (class 0) indices
minority_indices = np.where(y == 1)[0]
majority_indices = np.where(y == 0)[0]

# Shuffle minority indices and split: 100 for training and the rest for evaluation
shuffled_minority = np.copy(minority_indices)
np.random.shuffle(shuffled_minority)
min_train = shuffled_minority[:100]
min_val = shuffled_minority[100:]

# For majority indices, perform an 80/20 split
majority_train, majority_val = train_test_split(majority_indices, test_size=0.3, random_state=42)

# Drop 30% of the majority class in training by randomly sampling 70% of them
np.random.seed(42)
num_majority_train = len(majority_train)
sample_size = int(0.7 * num_majority_train)
majority_train_under = np.random.choice(majority_train, size=sample_size, replace=False)

# Combine indices from both groups
train_indices = np.concatenate([majority_train_under, min_train])
val_indices = np.concatenate([majority_val, min_val])

# Create training and evaluation sets
X_train = X[train_indices]
y_train = y[train_indices]
X_val = X[val_indices]
y_val = y[val_indices]

# Random Forest Classifier with higher weight for class 1
rf_model = RandomForestClassifier(n_estimators=50, random_state=42, class_weight={0: 1, 1: 600})
rf_model.fit(X_train, y_train)

# Predictions on validation set
y_pred = rf_model.predict(X_val)

# Evaluate performance
f1 = f1_score(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)
print("F1 Score on Validation Set:", f1)
print("Confusion Matrix:")
print(cm)

F1 Score on Validation Set: 0.6
Confusion Matrix:
[[10478     2]
 [   18    15]]
