<a href="https://colab.research.google.com/github/Su-ok/MT2025124_ML_Project2/blob/main/BinLogistic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab runtime; the CSV paths used later live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ==== LOAD DATA ====
# Both CSV files are read from the mounted Google Drive folder.
train_path = "/content/drive/MyDrive/ML kaggle data/smoker/train_dataset.csv"
test_path  = "/content/drive/MyDrive/ML kaggle data/smoker/test_dataset.csv"

# Read the two files with the same call so they are parsed identically.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

print("Original train shape:", train.shape)

# Every column other than the label is used as a model input.
target_col = "smoking"
feature_cols = train.columns[train.columns != target_col].tolist()

# ==== 1) DETECT DUPLICATES ====
# keep=False marks every member of a duplicated set (not only the repeats),
# so the sums below count all rows involved in a duplication.
all_cols = feature_cols + [target_col]
exact_dup_mask = train.duplicated(subset=all_cols, keep=False)      # same features AND same label
feat_dup_mask  = train.duplicated(subset=feature_cols, keep=False)  # same features, label ignored

print("Exact duplicates:", exact_dup_mask.sum())
print("Feature duplicates:", feat_dup_mask.sum())

# ==== 2) RESOLVE DUPLICATES =====
# Collapse rows that share identical feature values into a single row:
#   * one distinct label        -> keep one representative row
#   * several labels, one wins  -> keep the strict-majority label
#   * several labels, tie       -> drop the whole group
# NOTE: groupby() defaults to dropna=True, so any row with a NaN in a feature
# column is left out of `grouped` and therefore out of `clean_train`.
grouped = train.groupby(feature_cols)[target_col].agg(
    n_samples      ='size',
    unique_labels  =lambda s: s.unique().tolist(),
    label_counts   =lambda s: s.value_counts().to_dict()
).reset_index()

rows = []
resolved_majority = 0    # conflicting groups settled by a strict majority
removed_ties = 0         # original rows discarded because their group was tied
removed_tie_groups = 0   # feature groups discarded because of a tie

# Iterate column-wise instead of iterrows(): no per-row Series is built and the
# feature values come straight from the grouped frame.
feature_records = grouped[feature_cols].to_dict("records")
group_iter = zip(
    feature_records,
    grouped["unique_labels"],
    grouped["label_counts"],
    grouped["n_samples"],
)

for feat_vals, labels, counts, n_samples in group_iter:
    if len(labels) == 1:
        # pure duplicates → keep one representative
        rows.append({**feat_vals, target_col: labels[0]})
        continue

    # conflicting labels → majority vote if unique
    max_count = max(counts.values())
    majority_labels = [lbl for lbl, cnt in counts.items() if cnt == max_count]

    if len(majority_labels) == 1:
        rows.append({**feat_vals, target_col: majority_labels[0]})
        resolved_majority += 1
    else:
        # tie → drop group entirely
        removed_tie_groups += 1
        removed_ties += n_samples

print("\nResolved by unique majority:", resolved_majority)
print("Dropped conflicting groups:", removed_tie_groups)
print("Rows dropped with tied labels:", removed_ties)

# Explicit columns keep the schema intact even if no group survives.
clean_train = pd.DataFrame(rows, columns=feature_cols + [target_col])
print("\nCleaned train shape:", clean_train.shape)

# ==== 3) PREPARE DATA ====
X = clean_train[feature_cols].values
y = clean_train[target_col].values

# Scaler fitted on ALL cleaned training rows. It is used only for the final
# model (fit on X_scaled, predict on test_scaled), never for validation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

test_scaled = scaler.transform(test[feature_cols].values)

# ==== 4) TRAIN–VALIDATION SPLIT ====
# Split the RAW features first, then fit a separate scaler on the training
# part only. Fitting the scaler before splitting would let the validation rows
# influence the scaling of the training data, which biases the validation score.
# The split depends only on y and random_state, so the partition is unchanged.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

train_scaler = StandardScaler()
X_train = train_scaler.fit_transform(X_train_raw)
X_val = train_scaler.transform(X_val_raw)

print("X_train:", X_train.shape, "X_val:", X_val.shape)

# ==== 5) LOGISTIC REGRESSION + RANDOM SEARCH ====
# Search space: four regularisation strengths crossed with two solvers.
param_dist = {
    "C": [0.01, 0.1, 1.0, 10.0],
    "solver": ["lbfgs", "liblinear"],
}

logreg = LogisticRegression(max_iter=5000)

# Stratified, shuffled 3-fold CV with a fixed seed for repeatable folds.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# n_iter matches the 4 x 2 = 8 combinations in param_dist, so every
# combination is tried.
search_settings = dict(
    n_iter=8,
    scoring="accuracy",
    n_jobs=-1,
    cv=cv,
    random_state=42,
    verbose=2,
)
logreg_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist,
    **search_settings,
)

print("\n=== Starting RandomizedSearchCV for Logistic Regression ===")
logreg_search.fit(X_train, y_train)

print("\nBest Params:", logreg_search.best_params_)
print("Best CV Accuracy:", logreg_search.best_score_)

# Estimator exposed by the search for the best parameter combination.
best_logreg = logreg_search.best_estimator_

# ==== 6) EVALUATION ====
# Predict on both partitions so train and validation accuracy can be compared.
y_train_pred = best_logreg.predict(X_train)
y_val_pred   = best_logreg.predict(X_val)

train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("\nTraining Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)

# The classification report and confusion matrix for y_val / y_val_pred are
# printed in the following cell.

# ==== 7) FINAL TRAINING ON FULL DATA & SUBMISSION ====
# Train a fresh estimator with the winning hyper-parameters rather than
# refitting best_logreg in place. best_logreg is the same object as
# logreg_search.best_estimator_; refitting it on all rows would mean that any
# later re-run of the evaluation lines scores a model that has already seen the
# validation rows.
final_logreg = LogisticRegression(**best_logreg.get_params())
final_logreg.fit(X_scaled, y)

test_preds = final_logreg.predict(test_scaled)

# One prediction per test row, written without the DataFrame index.
submission = pd.DataFrame({"smoking": test_preds})
submission.to_csv("submission_logreg_smoker.csv", index=False)

print("\nSaved submission_logreg_smoker.csv")

Original train shape: (38984, 23)
Exact duplicates: 11034
Feature duplicates: 11034

Resolved by unique majority: 0
Dropped conflicting groups: 0

Cleaned train shape: (33467, 23)
X_train: (26773, 22) X_val: (6694, 22)

=== Starting RandomizedSearchCV for Logistic Regression ===
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best Params: {'solver': 'liblinear', 'C': 10.0}
Best CV Accuracy: 0.7196055622308687

Training Accuracy: 0.7212863705972435
Validation Accuracy: 0.7260233044517478

Saved submission_logreg_smoker.csv


In [None]:
# Per-class precision / recall / F1 on the validation split.
report_text = classification_report(y_val, y_val_pred)
print("\nClassification Report:")
print(report_text)

# Rows are true labels, columns are predicted labels.
val_confusion = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix:")
print(val_confusion)


Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.83      0.79      4242
           1       0.65      0.55      0.60      2452

    accuracy                           0.73      6694
   macro avg       0.70      0.69      0.69      6694
weighted avg       0.72      0.73      0.72      6694


Confusion Matrix:
[[3512  730]
 [1104 1348]]
