# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support

# --- Function: Threshold Tuning with Class 1 F1 Optimization ---
def tune_threshold(model, X_val, y_val, class0_min_prec=0.85, class0_min_rec=0.85):
    """Pick the decision threshold that maximizes class-1 F1 on validation data.

    Candidate thresholds run from 0.10 to 0.90 in steps of 0.01. A candidate
    is eligible only when class-0 precision and class-0 recall both reach the
    given minimums. Among eligible candidates the one with the highest
    class-1 F1 wins; on ties the lowest threshold is kept.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the class-1 score.
        X_val: Validation features, passed to ``model.predict_proba`` as is.
        y_val: Validation labels, scored against the label set ``[0, 1]``.
        class0_min_prec: Minimum class-0 precision a candidate must reach.
        class0_min_rec: Minimum class-0 recall a candidate must reach.

    Returns:
        A ``(best_thresh, probs)`` tuple: the chosen threshold as a float and
        the class-1 probabilities predicted for ``X_val``. When no candidate
        meets the class-0 constraints with a positive class-1 F1, the
        threshold falls back to 0.5 and a ``UserWarning`` is emitted.
    """
    import warnings

    probs = model.predict_proba(X_val)[:, 1]
    # np.arange with a fractional step can produce an off-by-one length and
    # values such as 0.30000000000000004; a fixed-size linspace rounded to two
    # decimals gives exactly the 81 intended candidates 0.10, 0.11, ..., 0.90.
    thresholds = np.round(np.linspace(0.1, 0.9, 81), 2)

    best_f1 = 0.0
    best_thresh = 0.5
    found = False
    for t in thresholds:
        preds = (probs >= t).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_val, preds, average=None, labels=[0, 1], zero_division=0
        )
        # Skip candidates that violate the class-0 quality floor.
        if precision[0] < class0_min_prec or recall[0] < class0_min_rec:
            continue
        # Strict '>' keeps the lowest threshold among equal-F1 candidates.
        if f1[1] > best_f1:
            best_f1 = f1[1]
            best_thresh = t
            found = True

    if not found:
        # Make the fallback visible instead of silently returning the default.
        warnings.warn(
            "tune_threshold: no threshold met the class-0 constraints with a "
            "positive class-1 F1; falling back to 0.5",
            stacklevel=2,
        )

    return float(best_thresh), probs

# --- Load and preprocess ---
train = pd.read_csv('Train_Data.csv')
test = pd.read_csv('Test_Data.csv')
# NOTE(review): absolute, machine-specific path, and `sample_submission` is not
# used anywhere in this cell -- consider a relative path like the two above.
sample_submission = pd.read_csv(r'C:\Users\parth\Downloads\Sample_Submission.csv')

# Rows without a target label cannot be used for supervised training.
train = train.dropna(subset=['age_group'])

# Encode the target: Adult -> 0, Senior -> 1.
raw_labels = train['age_group']
encoded_labels = raw_labels.map({'Adult': 0, 'Senior': 1})
# Series.map turns every label missing from the dict into NaN. Fail here with
# a clear message rather than later, when the model is fit on a NaN target.
unknown_labels = raw_labels[encoded_labels.isna()].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected age_group labels in training data: {list(unknown_labels)}"
    )
train['age_group'] = encoded_labels

# Keep the test identifiers, then drop the id column so it is not a feature.
test_ids = test['SEQN']
train = train.drop(columns=['SEQN'])
test = test.drop(columns=['SEQN'])

# Split the training frame into features and target; the test frame holds
# features only.
X = train.drop(columns='age_group')
y = train['age_group']
X_test = test

# --- Train-validation split ---
# 80/20 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Pipeline: Imputer + Random Forest ---
# Missing feature values are replaced by the training-set column mean before
# the class-weighted random forest is fit.
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='mean')),
    ("clf", RandomForestClassifier(n_estimators=300, max_depth=10, class_weight='balanced', random_state=42))
])
pipeline.fit(X_train, y_train)

# --- Tune threshold ---
# Hand the whole fitted pipeline to the tuner: its predict_proba applies every
# preprocessing step before the classifier, so nothing is skipped if more
# steps are added to the pipeline later.
best_thresh, val_probs = tune_threshold(pipeline, X_val, y_val)
val_preds = (val_probs >= best_thresh).astype(int)

# --- Evaluation ---
# NOTE: the threshold was tuned on this same validation split, so the report
# below is an optimistic estimate of performance on unseen data.
print(classification_report(y_val, val_preds))

# --- Predict on test ---
# pipeline[:-1] is every preprocessing step and pipeline[-1] the final
# classifier; slicing instead of naming individual steps keeps this correct
# if further preprocessing steps are inserted into the pipeline.
X_test_transformed = pipeline[:-1].transform(X_test)
test_probs = pipeline[-1].predict_proba(X_test_transformed)[:, 1]
# Apply the threshold selected on the validation split.
test_preds = (test_probs >= best_thresh).astype(int)

# --- Save submission ---
# NOTE(review): only the predicted labels are written; `test_ids` captured
# during preprocessing is not included -- confirm the expected submission format.
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("submission_f1_class1_optimized_clean.csv", index=False)


# Output (classification report printed for the validation split):
#
#               precision    recall  f1-score   support
#
#            0       0.91      0.85      0.88       340
#            1       0.31      0.43      0.36        51
#
#     accuracy                           0.80       391
#    macro avg       0.61      0.64      0.62       391
# weighted avg       0.83      0.80      0.81       391

