<a href="https://colab.research.google.com/github/Su-ok/MT2025124_ML_Project2/blob/main/BinNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab runtime) so the CSVs under /content/drive/MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# ==== IMPORTS ====
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings

# NOTE(review): this ignores *every* warning category for the rest of the session,
# which also hides scikit-learn's ConvergenceWarning from the MLP fits further down.
warnings.filterwarnings(action="ignore")

# Seed NumPy's legacy global RNG; the sklearn calls below additionally pin random_state.
SEED = 42
np.random.seed(SEED)

# ==== LOAD DATA ====
# Both CSVs sit in the same Drive folder; only the file name differs.
DATA_DIR = "/content/drive/MyDrive/ML kaggle data/smoker"
train_path = f"{DATA_DIR}/train_dataset.csv"
test_path = f"{DATA_DIR}/test_dataset.csv"

train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
print("Original train shape:", train.shape)

# Identify feature columns: every column of the training frame except the label.
target_col = "smoking"
feature_cols = [name for name in train.columns if name != target_col]

# === 1) Detect duplicates ===
# keep=False marks every member of a duplicated set, so the counts printed below are
# "rows that have at least one twin", not "number of surplus copies".
full_row_key = [*feature_cols, target_col]
exact_dup_mask = train.duplicated(subset=full_row_key, keep=False)   # same features AND label
feat_dup_mask = train.duplicated(subset=feature_cols, keep=False)    # same features, any label

for caption, dup_mask in (
    ("Exact duplicates:", exact_dup_mask),
    ("Feature duplicates:", feat_dup_mask),
):
    print(caption, dup_mask.sum())

# ==== 2) RESOLVE DUPLICATES =====
def resolve_duplicate_groups(frame, feature_cols, target_col):
    """Collapse rows sharing a feature vector into one row carrying an agreed label.

    Rules, per group of rows with identical values in ``feature_cols``:
      * a single distinct label        -> keep one representative row;
      * several labels, unique majority -> keep one row with the majority label;
      * several labels, tied majority   -> drop the whole group.

    Rows whose target is missing cast no vote; a group with no labelled row at all
    is dropped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing ``feature_cols`` and ``target_col``.
    feature_cols : list of str
        Columns that define "the same sample".
    target_col : str
        Label column.

    Returns
    -------
    clean : pandas.DataFrame
        One row per surviving group, columns ``feature_cols + [target_col]``,
        ordered by the sorted feature values, with a fresh RangeIndex.
    stats : dict
        ``resolved_majority`` (groups settled by majority vote),
        ``tie_groups`` (groups dropped because of a tie) and
        ``tie_rows`` (labelled rows contained in those dropped groups).
    """
    out_cols = [*feature_cols, target_col]
    no_change = {"resolved_majority": 0, "tie_groups": 0, "tie_rows": 0}
    if len(frame) == 0 or not frame[target_col].notna().any():
        # Nothing to vote on: return an empty frame that still has the expected columns.
        return frame.iloc[0:0][out_cols].reset_index(drop=True), no_change

    # One integer id per distinct feature vector, numbered in sorted key order.
    # dropna=False: by default groupby silently discards every row that has a missing
    # value in any key column, which would shrink the training set without notice.
    group_id = frame.groupby(feature_cols, dropna=False, sort=True).ngroup().to_numpy()

    # votes[g, lbl] = number of rows in group g carrying label lbl.
    votes = (
        pd.DataFrame({"group": group_id, "label": frame[target_col].to_numpy()})
        .groupby(["group", "label"])
        .size()
        .unstack(fill_value=0)
    )

    top_votes = votes.max(axis=1)
    n_top = votes.eq(top_votes, axis=0).sum(axis=1)      # how many labels share the maximum
    n_labels = votes.gt(0).sum(axis=1)                   # how many distinct labels occur
    decided = (n_top == 1).to_numpy()                    # unique winner (pure or majority)
    kept_groups = votes.index[decided]

    # First row seen for each group supplies the feature values.
    first_seen = ~pd.Series(group_id).duplicated().to_numpy()
    representatives = frame.loc[first_seen, feature_cols].copy()
    representatives.index = group_id[first_seen]

    # Selecting by the sorted group ids restores the sorted-by-features row order.
    clean = representatives.loc[kept_groups].copy()
    clean[target_col] = votes.idxmax(axis=1).loc[kept_groups].to_numpy()
    clean = clean.reset_index(drop=True)

    stats = {
        "resolved_majority": int(((n_labels > 1).to_numpy() & decided).sum()),
        "tie_groups": int((~decided).sum()),
        "tie_rows": int(votes.to_numpy()[~decided].sum()),
    }
    return clean, stats


clean_train, dup_stats = resolve_duplicate_groups(train, feature_cols, target_col)

resolved_majority = dup_stats["resolved_majority"]
removed_tie_groups = dup_stats["tie_groups"]
removed_ties = dup_stats["tie_rows"]   # row count, kept under its original name

print("\nResolved by unique majority:", resolved_majority)
print("Dropped conflicting groups:", removed_tie_groups)
print("Rows in dropped groups:", removed_ties)

print("\nCleaned train shape:", clean_train.shape)

# ==== 3) PREPARE DATA ====
X = clean_train[feature_cols].values
y = clean_train[target_col].values

# Stratified Train–Validation split, done on the RAW features.
# Splitting before scaling keeps the validation rows out of the scaler's mean/std;
# fitting the scaler on all rows first would leak validation statistics into training
# and make the validation score optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Scaler used only for the validation experiment: fit on the training fold alone.
val_scaler = StandardScaler()
X_train = val_scaler.fit_transform(X_train_raw)
X_val = val_scaler.transform(X_val_raw)

# Scaler for the final model: that model is trained on every cleaned row, so fitting
# on all of X is legitimate here. The test set is only ever transformed, never fit.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test[feature_cols].values)

print("X_train:", X_train.shape, "X_val:", X_val.shape)

# ==== DEFINE NEURAL NETWORK MODEL ====
# Hyper-parameters collected in one mapping so the configuration reads as a table.
mlp_config = {
    "hidden_layer_sizes": (32, 16),   # 2 hidden layers
    "activation": "relu",
    "solver": "adam",
    "alpha": 0.001,
    "max_iter": 300,
    "learning_rate_init": 0.001,
    "early_stopping": True,           # stops when validation stops improving
    "n_iter_no_change": 10,
    "validation_fraction": 0.2,
    "random_state": 42,
}
mlp = MLPClassifier(**mlp_config)

print("\nTraining Neural Network using adam...")
mlp.fit(X_train, y_train)

# ==== EVALUATION ====
# Predictions stay in named variables: the report cell further down reuses y_val_pred.
y_train_pred, y_val_pred = (mlp.predict(split) for split in (X_train, X_val))

train_acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)
val_acc = accuracy_score(y_true=y_val, y_pred=y_val_pred)

print(f"\nTraining Accuracy (MLP):   {train_acc:.6f}")
print(f"Validation Accuracy (MLP): {val_acc:.6f}")

# The classification report and confusion matrix are printed by a separate cell below.

# ==== FINAL TRAINING ON FULL DATA ====
# Build the final estimator from the validated model's own hyper-parameters instead of a
# hand-copied argument list, so the model that is submitted can never drift away from
# the configuration that was actually evaluated above.
mlp_final = MLPClassifier(**mlp.get_params())

# NOTE: with early_stopping=True the estimator still sets aside validation_fraction of
# the rows passed to fit() for its own stopping criterion, so not every row is used
# for weight updates even in this "full data" fit.
mlp_final.fit(X_scaled, y)

# Predict on test
test_preds = mlp_final.predict(test_scaled)

# Create submission CSV
# NOTE(review): the file contains only the prediction column, in test-set row order.
# Confirm against the competition's sample submission whether an id column is required.
submission = pd.DataFrame({
    "smoking": test_preds
})

submission.to_csv("submission_nn_smoker.csv", index=False)
print("\nsubmission_nn_smoker.csv created successfully.")

Original train shape: (38984, 23)
Exact duplicates: 11034
Feature duplicates: 11034

Resolved by unique majority: 0
Dropped conflicting groups: 0

Cleaned train shape: (33467, 23)
X_train: (26773, 22) X_val: (6694, 22)

Training Neural Network using adam...

Training Accuracy (MLP):   0.759833
Validation Accuracy (MLP): 0.747386

submission_nn_smoker.csv created successfully.


In [None]:
# Per-class precision / recall / F1 for the validation split.
print("\nClassification Report:", classification_report(y_val, y_val_pred), sep="\n")

# Confusion matrix computed from the validation labels (y_val) and predictions (y_val_pred).
print("\nConfusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81      4242
           1       0.67      0.62      0.64      2452

    accuracy                           0.75      6694
   macro avg       0.73      0.72      0.72      6694
weighted avg       0.74      0.75      0.74      6694


Confusion Matrix:
[[3495  747]
 [ 944 1508]]
