# In [None]:
# %% [markdown]
# # Part 2: Multi-Label Defect Prediction (Python 3.13.3 Compatible)

# %% 
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss, f1_score
import joblib

# %% 
# 2. Load the dataset and take a first look at it
df = pd.read_csv(r'D:/6th Semester/Data Science/Assignmentno4/dataset.csv')
print("Shape:", df.shape)
print("Missing values per column:\n", df.isna().sum())

# Every column whose name starts with 'type_' is a label column; the
# 'report' column is the only input feature, with missing reports replaced
# by the empty string.
label_cols = [col for col in df.columns if col.startswith('type_')]
X_text = df['report'].fillna('')
y_all = df[label_cols].values
print("Label distribution (all):\n", df[label_cols].sum())

# %%  
# 3. Train/val/test split (70/15/15)
# The test set (15%) is held out first; the validation set is then carved
# out of the remainder: 0.1765 of the remaining 85% is ≈15% of all rows.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_text,
    y_all,
    test_size=0.15,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1765,
    random_state=42,
)
print("Splits shapes:")
for x_tag, x_part, y_tag, y_part in (
    ("  X_train:", X_train, "y_train:", y_train),
    ("  X_val:  ", X_val, "y_val:  ", y_val),
    ("  X_test: ", X_test, "y_test: ", y_test),
):
    print(x_tag, x_part.shape, y_tag, y_part.shape)

# %%  
# 4. Drop labels that are constant in the training set
# A label is kept only if its column sum over the training rows is strictly
# between 0 and the number of training rows (i.e. neither all-0 nor all-1
# for 0/1 indicators).
n_train_rows = y_train.shape[0]
train_pos_counts = y_train.sum(axis=0)
mask_labels = np.logical_and(train_pos_counts > 0, train_pos_counts < n_train_rows)

# Apply the same column mask to every split and to the label names so the
# columns stay aligned.
y_train, y_val, y_test = (
    part[:, mask_labels] for part in (y_train, y_val, y_test)
)
label_cols = [name for name, keep in zip(label_cols, mask_labels) if keep]
print("Kept labels:", label_cols)
print("New y_train shape:", y_train.shape)

# %%  
# 5. Precision@k helper
def precision_at_k(y_true, y_scores, k=3):
    """Return the mean precision@k over all samples.

    For every sample the ``k`` highest-scoring labels are selected and the
    fraction ``(number of true labels among them) / k`` is computed; the
    result is the average of that fraction over the samples.

    Args:
        y_true: Array-like of shape (n_samples, n_labels) with the
            ground-truth label indicators.
        y_scores: Array-like of the same shape with one score per label
            (a higher score ranks the label earlier).
        k: Number of top-ranked labels inspected per sample; must be >= 1.
            When ``k`` exceeds the number of labels, every label is
            inspected but the denominator is still ``k``.

    Returns:
        The mean precision@k as a float, or ``nan`` if there are no samples.

    Raises:
        ValueError: If ``k`` is smaller than 1, or if the inputs are not
            two-dimensional arrays of identical shape.
    """
    # k == 0 would make the `[-k:]` slice select *every* label and then
    # divide by zero, so reject it explicitly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    if y_true.ndim != 2 or y_true.shape != y_scores.shape:
        raise ValueError(
            "y_true and y_scores must be 2-D arrays of the same shape, got "
            f"{y_true.shape} and {y_scores.shape}"
        )

    # Averaging over zero samples is undefined; return nan without
    # triggering numpy's empty-mean RuntimeWarning.
    if y_true.shape[0] == 0:
        return float('nan')

    # argsort is ascending, so the last k columns are the k best-scored labels.
    top_k = np.argsort(y_scores, axis=1)[:, -k:]
    hits_per_sample = np.take_along_axis(y_true, top_k, axis=1).sum(axis=1)
    return float(np.mean(hits_per_sample / k))

# %%  
# 6. Logistic Regression (One-vs-Rest + GridSearchCV)
# TF-IDF over unigrams and bigrams (at most 5000 features) feeds one
# logistic-regression classifier per label; C is chosen by 3-fold
# cross-validation on micro-averaged F1.
tfidf_log = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
pipe_log = Pipeline(steps=[
    ('tfidf', tfidf_log),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000))),
])
param_log = {'clf__estimator__C': [0.01, 0.1, 1, 10]}
grid_log = GridSearchCV(
    estimator=pipe_log,
    param_grid=param_log,
    scoring='f1_micro',
    cv=3,
)
grid_log.fit(X_train, y_train)
print("Best C (LogisticRegression):", grid_log.best_params_)

# %%  
# 7. SVM (One-vs-Rest + GridSearchCV)
# Same TF-IDF setup as the other models, with one linear-kernel SVC per
# label; C is chosen by 3-fold cross-validation on micro-averaged F1.
# NOTE(review): probability=True is requested here, but the evaluation in
# this file only calls decision_function on this model — confirm whether
# predict_proba is needed by whoever loads the pickled pipeline.
tfidf_svm = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
pipe_svm = Pipeline(steps=[
    ('tfidf', tfidf_svm),
    ('clf', OneVsRestClassifier(estimator=SVC(probability=True, kernel='linear'))),
])
param_svm = {'clf__estimator__C': [0.1, 1, 10]}
grid_svm = GridSearchCV(
    estimator=pipe_svm,
    param_grid=param_svm,
    scoring='f1_micro',
    cv=3,
)
grid_svm.fit(X_train, y_train)
print("Best C (SVM):", grid_svm.best_params_)

# %%  
# 8. Perceptron (batch) & online
# Batch variant: a One-vs-Rest Perceptron fitted in one go inside a
# TF-IDF pipeline.
tfidf_per = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
pipe_per = Pipeline(steps=[
    ('tfidf', tfidf_per),
    ('clf', OneVsRestClassifier(estimator=Perceptron(max_iter=1000))),
])
pipe_per.fit(X_train, y_train)

# Online variant: one independent Perceptron per label, updated through
# partial_fit. The TF-IDF matrices are converted to dense arrays so that
# single rows can be fed one at a time.
# NOTE(review): the dense conversion is memory-heavy; switching to sparse
# input may change the fitted models, so verify before changing it.
vectorizer_online = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_mat = vectorizer_online.fit_transform(X_train).toarray()
X_test_mat = vectorizer_online.transform(X_test).toarray()

n_labels = y_train.shape[1]
perceptrons = [Perceptron(max_iter=1, warm_start=True) for _ in range(n_labels)]

# First pass: every perceptron sees the whole training matrix once; the
# class list is declared on this first partial_fit call.
for label_pos, pp in enumerate(perceptrons):
    pp.partial_fit(X_train_mat, y_train[:, label_pos], classes=[0, 1])

# Then 5 further epochs of strictly sample-by-sample updates.
for _ in range(5):
    for sample, targets in zip(X_train_mat, y_train):
        row = sample[np.newaxis, :]  # partial_fit expects a 2-D (1, n_features) input
        for pp, target in zip(perceptrons, targets):
            pp.partial_fit(row, [target])

# %%  
# 9. DNN via sklearn's MLPClassifier (2 hidden layers)
# One MLP (hidden layers of 128 and 64 ReLU units, Adam solver, early
# stopping enabled, at most 100 iterations) is trained per label through
# the One-vs-Rest wrapper, on the same TF-IDF features as the other models.
tfidf_mlp = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
pipe_mlp = Pipeline(steps=[
    ('tfidf', tfidf_mlp),
    ('clf', OneVsRestClassifier(estimator=MLPClassifier(
        solver='adam',
        activation='relu',
        hidden_layer_sizes=(128, 64),
        early_stopping=True,
        max_iter=100,
        random_state=42,
    ))),
])
pipe_mlp.fit(X_train, y_train)

# %%  
# 10. Evaluation on test set
# For each model two arrays are collected: hard 0/1 predictions (used for
# Hamming loss and F1) and continuous per-label scores (used to rank labels
# for Precision@3).

# Logistic Regression
y_pred_log = grid_log.predict(X_test)
y_score_log = grid_log.decision_function(X_test)

# SVM
y_pred_svm = grid_svm.predict(X_test)
y_score_svm = grid_svm.decision_function(X_test)

# Perceptron (batch)
y_pred_per = pipe_per.predict(X_test)
y_score_per = pipe_per.decision_function(X_test)

# Perceptron (online): stack the per-label outputs column by column
y_pred_per_on = np.column_stack(
    [pp.predict(X_test_mat) for pp in perceptrons]
)
y_score_per_on = np.column_stack(
    [pp.decision_function(X_test_mat) for pp in perceptrons]
)

# DNN (MLPClassifier): class probabilities serve as ranking scores
y_pred_mlp = pipe_mlp.predict(X_test)
y_score_mlp = pipe_mlp.predict_proba(X_test)

# Model name -> (hard predictions, ranking scores), in reporting order
results = {
    'LogisticRegression': (y_pred_log, y_score_log),
    'SVM': (y_pred_svm, y_score_svm),
    'Perceptron': (y_pred_per, y_score_per),
    'Perceptron(online)': (y_pred_per_on, y_score_per_on),
    'DNN (MLPClassifier)': (y_pred_mlp, y_score_mlp),
}

# Print metrics; zero_division=0 scores labels without any predicted or
# true positives as 0 instead of emitting a warning
print("\n=== Test Results ===")
for name, (y_pred, y_score) in results.items():
    print(f"\n--- {name} ---")
    print("Hamming Loss :", hamming_loss(y_test, y_pred))
    for f1_tag, averaging in (("Micro F1     :", 'micro'), ("Macro F1     :", 'macro')):
        print(f1_tag, f1_score(y_test, y_pred, average=averaging, zero_division=0))
    print("Precision@3  :", precision_at_k(y_test, y_score, k=3))


# After training: persist the fitted pipelines (vectorizer + classifier)
# with joblib. For the grid searches only the refitted best estimator is
# stored.
for fitted_model, pkl_name in (
    (grid_log.best_estimator_, "pipe_log.pkl"),  # Logistic
    (grid_svm.best_estimator_, "pipe_svm.pkl"),  # SVM
    (pipe_mlp, "pipe_mlp.pkl"),                  # DNN/MLPClassifier
):
    joblib.dump(fitted_model, pkl_name)



# Summary table: one row of test metrics per model, best Micro-F1 first.
# Assumes the `results` dict is already defined as
# model name -> (hard predictions, ranking scores):
#   results = {
#       'LogisticRegression':  (y_pred_log,    y_score_log),
#       'SVM':                 (y_pred_svm,    y_score_svm),
#       'Perceptron':          (y_pred_per,    y_score_per),
#       'Perceptron(online)':  (y_pred_per_on, y_score_per_on),
#       'DNN (MLPClassifier)': (y_pred_mlp,    y_score_mlp),
#   }

rows = [
    {
        'Model': name,
        'Hamming Loss': hamming_loss(y_test, y_pred),
        'Micro-F1': f1_score(y_test, y_pred, average='micro', zero_division=0),
        'Macro-F1': f1_score(y_test, y_pred, average='macro', zero_division=0),
        'Precision@3': precision_at_k(y_test, y_score, k=3),
    }
    for name, (y_pred, y_score) in results.items()
]

df_summary = pd.DataFrame(rows).set_index('Model')

# `display` is only defined when running under IPython/Jupyter. Fall back to
# print so a plain `python` run does not die with NameError on its very last
# statement, after all models have been trained.
try:
    show_table = display
except NameError:
    show_table = print
show_table(df_summary.sort_values('Micro-F1', ascending=False))




