In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Colab-only step: attach Google Drive at /content/drive, the root of every CSV path
# read or written by the later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Read the classification dataset from the mounted Drive folder, then print its
# column / dtype / non-null summary.
source_csv = "/content/drive/MyDrive/AI Project/data_for_classification.csv"
data = pd.read_csv(source_csv)
data.info()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/AI Project/data_for_classification.csv'

In [None]:
# Build the cleaned frame from `data` without modifying `data` itself:
#   * 'job_industry' is removed entirely,
#   * missing 'age' values get the column mean,
#   * missing 'sex' / 'marital_status' values get the column's first mode.
df1 = (
    data
    .drop(columns=['job_industry'])
    .assign(
        age=lambda frame: frame['age'].fillna(frame['age'].mean()),
        sex=lambda frame: frame['sex'].fillna(frame['sex'].mode()[0]),
        marital_status=lambda frame: frame['marital_status'].fillna(
            frame['marital_status'].mode()[0]
        ),
    )
)

df1.info()

In [None]:
# Persist the cleaned frame to Drive without the index column.
# NOTE(review): the "EXPORT TRAIN DATA" cell further down writes to this same path,
# so this file is replaced once that cell runs.
cleaned_csv_path = "/content/drive/MyDrive/AI Project/model_train_data.csv"
df1.to_csv(cleaned_csv_path, index=False)

In [None]:
# Number of rows whose label equals 1.
df1.loc[df1['label'] == 1, 'label'].value_counts()

In [None]:
# Number of rows whose label equals 0.
df1.loc[df1['label'] == 0, 'label'].value_counts()

In [None]:
# Count the rows of df1 that repeat an earlier row in every column.
df1.duplicated().sum()

In [None]:
# One-hot encode the two categorical features; each becomes a set of
# "<feature>_<category>" indicator columns in `df`.
categorical_columns = ["sex", "marital_status"]
df = pd.get_dummies(df1, columns=categorical_columns)
df.info()

In [None]:
# Features: every column except the target and the 'user_id' column.
X = df.drop(columns=['label', 'user_id'])
# Target: the 'label' column.
y = df['label']

***TRAINING***

---



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb


In [None]:
# Hold out 20% of the rows as the test split. The split is stratified on y and
# seeded, so re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Split sizes and per-class label counts, displayed together as one tuple.
(
    X_train.shape,
    X_test.shape,
    y_train.value_counts(),
    y_test.value_counts(),
)


In [None]:
# Baseline candidates: no estimator below is given a class_weight.
# `kfold` is created here but not referenced again in the cells visible in this notebook.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Insertion order matters: it is the order in which the evaluation cells train and plot.
models = {}

# Logistic regression is the only candidate wrapped with a scaler.
models["Logistic Regression"] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000)),
])

models["Decision Tree"] = DecisionTreeClassifier(random_state=42)

models["Random Forest"] = RandomForestClassifier(n_estimators=300, random_state=42)

models["Gradient Boosting"] = GradientBoostingClassifier(
    n_estimators=300, learning_rate=0.05, random_state=42
)

models["LightGBM"] = lgb.LGBMClassifier(
    n_estimators=400, learning_rate=0.05, objective='binary', random_state=42
)

In [None]:
# Experiment: fit every model in `models` on the unmodified training split and
# score it on the held-out test split.
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve,
)

# Metrics derived from hard predictions, listed in report-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

report = {}
fig, ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Hard labels feed the threshold metrics; the second predict_proba column feeds ROC/AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    scores = {metric: score_fn(y_test, y_pred) for metric, score_fn in label_metrics.items()}
    auc = roc_auc_score(y_test, y_proba)
    scores["ROC-AUC"] = auc
    report[name] = scores

    # One ROC curve per model on the shared axes.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend()
ax.grid()
plt.show()

# One row per model, one column per metric.
report_df = pd.DataFrame(report).T
print("\n=== FULL METRIC REPORT ===")
print(report_df)


In [None]:
# Under sampling
# Resample only the training split with a seeded RandomUnderSampler; X_test / y_test
# are not passed to the sampler and stay as they are.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train_us, y_train_us = rus.fit_resample(X_train, y_train)

In [None]:
# Per-class row counts of the under-sampled training labels.
y_train_us.value_counts()

In [None]:
# Experiment: refit every model in `models` on the under-sampled training data and
# score it on the original (not resampled) test split.
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve,
)

# Metrics derived from hard predictions, listed in report-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

report = {}
fig, ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_us, y_train_us)

    # Hard labels feed the threshold metrics; the second predict_proba column feeds ROC/AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    scores = {metric: score_fn(y_test, y_pred) for metric, score_fn in label_metrics.items()}
    auc = roc_auc_score(y_test, y_proba)
    scores["ROC-AUC"] = auc
    report[name] = scores

    # One ROC curve per model on the shared axes.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend()
ax.grid()
plt.show()

# One row per model, one column per metric.
report_df = pd.DataFrame(report).T
print("\n=== FULL METRIC REPORT ===")
print(report_df)


In [None]:
# Over sampling
# Resample only the training split with a seeded SMOTE; X_test / y_test are not
# passed to the sampler and stay as they are.
# NOTE(review): X_train contains the one-hot columns produced by pd.get_dummies; SMOTE
# presumably treats them as ordinary numeric features when synthesising rows — confirm
# this is acceptable or consider a categorical-aware sampler.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_os, y_train_os = smote.fit_resample(X_train, y_train)

In [None]:
# Per-class row counts of the over-sampled training labels.
y_train_os.value_counts()

In [None]:
# Experiment: refit every model in `models` on the SMOTE over-sampled training data
# and score it on the original (not resampled) test split.
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve,
)

# Metrics derived from hard predictions, listed in report-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

report = {}
fig, ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_os, y_train_os)

    # Hard labels feed the threshold metrics; the second predict_proba column feeds ROC/AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    scores = {metric: score_fn(y_test, y_pred) for metric, score_fn in label_metrics.items()}
    auc = roc_auc_score(y_test, y_proba)
    scores["ROC-AUC"] = auc
    report[name] = scores

    # One ROC curve per model on the shared axes.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend()
ax.grid()
plt.show()

# One row per model, one column per metric.
report_df = pd.DataFrame(report).T
print("\n=== FULL METRIC REPORT ===")
print(report_df)


In [None]:
# Class-weighted candidates: same hyper-parameters as the baseline set, with
# class_weight='balanced' added wherever it is passed below.
# NOTE(review): GradientBoostingClassifier is constructed without class_weight here, so
# it is the one unweighted model in this experiment — confirm that is intended.
# `kfold` is created here but not referenced again in the cells visible in this notebook.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Insertion order matters: it is the order in which the evaluation cells train and plot.
models = {}

models["Logistic Regression"] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000, class_weight='balanced')),
])

models["Decision Tree"] = DecisionTreeClassifier(random_state=42, class_weight='balanced')

models["Random Forest"] = RandomForestClassifier(
    n_estimators=300, random_state=42, class_weight='balanced'
)

models["Gradient Boosting"] = GradientBoostingClassifier(
    n_estimators=300, learning_rate=0.05, random_state=42
)

models["LightGBM"] = lgb.LGBMClassifier(
    n_estimators=400,
    learning_rate=0.05,
    objective='binary',
    random_state=42,
    class_weight='balanced',
)

In [None]:
# Experiment: fit the models currently held in `models` on the unmodified training
# split and score them on the held-out test split.
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve,
)

# Metrics derived from hard predictions, listed in report-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

report = {}
fig, ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Hard labels feed the threshold metrics; the second predict_proba column feeds ROC/AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    scores = {metric: score_fn(y_test, y_pred) for metric, score_fn in label_metrics.items()}
    auc = roc_auc_score(y_test, y_proba)
    scores["ROC-AUC"] = auc
    report[name] = scores

    # One ROC curve per model on the shared axes.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend()
ax.grid()
plt.show()

# One row per model, one column per metric.
report_df = pd.DataFrame(report).T
print("\n=== FULL METRIC REPORT ===")
print(report_df)


***EXPORT PREDICTIONS***

---



In [None]:
# Redo the stratified 80/20 split with the same arguments as the split in the
# training section, then display split sizes and per-class label counts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
(
    X_train.shape,
    X_test.shape,
    y_train.value_counts(),
    y_test.value_counts(),
)


In [None]:
# The two models whose predictions are exported later: class-weighted logistic
# regression (with scaling) and Gradient Boosting without class weights.
# `kfold` is created here but not referenced again in the cells visible in this notebook.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

models = {}

models["Logistic Regression"] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000, class_weight='balanced')),
])

models["Gradient Boosting"] = GradientBoostingClassifier(
    n_estimators=300, learning_rate=0.05, random_state=42
)

In [None]:
# Fit the export models on the unmodified training split and score them on the test
# split. The fitted estimators stay in `models`; the export cell below reads them back.
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve,
)

# Metrics derived from hard predictions, listed in report-column order.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

report = {}
fig, ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Hard labels feed the threshold metrics; the second predict_proba column feeds ROC/AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    scores = {metric: score_fn(y_test, y_pred) for metric, score_fn in label_metrics.items()}
    auc = roc_auc_score(y_test, y_proba)
    scores["ROC-AUC"] = auc
    report[name] = scores

    # One ROC curve per model on the shared axes.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves for All Models")
ax.legend()
ax.grid()
plt.show()

# One row per model, one column per metric.
report_df = pd.DataFrame(report).T
print("\n=== FULL METRIC REPORT ===")
print(report_df)
def reverse_one_hot(df, prefix):
    """Collapse the one-hot columns ``<prefix>_<category>`` back into one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the indicator columns.
    prefix : str
        Name of the original categorical feature, without the trailing underscore.

    Returns
    -------
    pandas.Series
        For each row, the ``<category>`` part of the indicator column that holds the
        row maximum. When several columns tie, ``idxmax`` decides which one is
        reported (expected to be the first — e.g. for an all-zero row; verify if
        such rows can occur).

    Raises
    ------
    ValueError
        If ``df`` has no column whose name starts with ``<prefix>_``.
    """
    marker = prefix + "_"
    # Match on "<prefix>_" rather than the bare prefix, so that a column literally
    # named `prefix`, or an unrelated one such as "sexy_score" for prefix "sex",
    # is not mistaken for an indicator column.
    cols = [c for c in df.columns if isinstance(c, str) and c.startswith(marker)]
    if not cols:
        raise ValueError(f"no one-hot columns found for prefix {prefix!r}")
    # Slice the marker off the front instead of using str.replace: replace would also
    # delete later occurrences of "<prefix>_" inside the category name.
    return df[cols].idxmax(axis=1).str[len(marker):]


In [None]:
# Rebuild the feature matrix for the export step: only the target is dropped here,
# so 'user_id' stays in X (unlike the feature matrix used for training).
X = df.drop(columns=['label'])
y = df['label']

# Same test_size / random_state / stratify arguments as the earlier splits.
# NOTE(review): the export cells presumably rely on this producing the same row
# partition the models were trained and evaluated on — verify.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
def reverse_one_hot(df, prefix):
    """Collapse the one-hot columns ``<prefix>_<category>`` back into one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the indicator columns.
    prefix : str
        Name of the original categorical feature, without the trailing underscore.

    Returns
    -------
    pandas.Series
        For each row, the ``<category>`` part of the indicator column that holds the
        row maximum. When several columns tie, ``idxmax`` decides which one is
        reported (expected to be the first — e.g. for an all-zero row; verify if
        such rows can occur).

    Raises
    ------
    ValueError
        If ``df`` has no column whose name starts with ``<prefix>_``.
    """
    marker = prefix + "_"
    # Match on "<prefix>_" rather than the bare prefix, so that a column literally
    # named `prefix`, or an unrelated one such as "sexy_score" for prefix "sex",
    # is not mistaken for an indicator column.
    cols = [c for c in df.columns if isinstance(c, str) and c.startswith(marker)]
    if not cols:
        raise ValueError(f"no one-hot columns found for prefix {prefix!r}")
    # Slice the marker off the front instead of using str.replace: replace would also
    # delete later occurrences of "<prefix>_" inside the category name.
    return df[cols].idxmax(axis=1).str[len(marker):]

# Human-readable copy of the training features: rebuild the single 'sex' and
# 'marital_status' columns from their indicator columns (appended at the end, in
# that order), then remove the six indicator columns. X_train itself is left unchanged.
X_train_init = (
    X_train
    .assign(
        sex=lambda frame: reverse_one_hot(frame, "sex"),
        marital_status=lambda frame: reverse_one_hot(frame, "marital_status"),
    )
    .drop(columns=[
        'sex_F', 'sex_M', 'sex_other',
        'marital_status_Divorced', 'marital_status_Married',
        'marital_status_Single',
    ])
)
X_train_init.info()

In [None]:
# --- EXPORT TRAIN DATA ---
# Training features in readable form plus their labels, attached positionally
# (y_train.values carries no index).
export_df = X_train_init.assign(label=y_train.values)

# Write to Drive without the index column.
export_df.to_csv(
    "/content/drive/MyDrive/AI Project/model_train_data.csv",
    index=False,
)
print("Saved successfully")

In [None]:
def reverse_one_hot(df, prefix):
    """Collapse the one-hot columns ``<prefix>_<category>`` back into one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the indicator columns.
    prefix : str
        Name of the original categorical feature, without the trailing underscore.

    Returns
    -------
    pandas.Series
        For each row, the ``<category>`` part of the indicator column that holds the
        row maximum. When several columns tie, ``idxmax`` decides which one is
        reported (expected to be the first — e.g. for an all-zero row; verify if
        such rows can occur).

    Raises
    ------
    ValueError
        If ``df`` has no column whose name starts with ``<prefix>_``.
    """
    marker = prefix + "_"
    # Match on "<prefix>_" rather than the bare prefix, so that a column literally
    # named `prefix`, or an unrelated one such as "sexy_score" for prefix "sex",
    # is not mistaken for an indicator column.
    cols = [c for c in df.columns if isinstance(c, str) and c.startswith(marker)]
    if not cols:
        raise ValueError(f"no one-hot columns found for prefix {prefix!r}")
    # Slice the marker off the front instead of using str.replace: replace would also
    # delete later occurrences of "<prefix>_" inside the category name.
    return df[cols].idxmax(axis=1).str[len(marker):]

# Human-readable copy of the test features: rebuild the single 'sex' and
# 'marital_status' columns from their indicator columns (appended at the end, in
# that order), then remove the six indicator columns. X_test itself is left unchanged.
X_test_init = (
    X_test
    .assign(
        sex=lambda frame: reverse_one_hot(frame, "sex"),
        marital_status=lambda frame: reverse_one_hot(frame, "marital_status"),
    )
    .drop(columns=[
        'sex_F', 'sex_M', 'sex_other',
        'marital_status_Divorced', 'marital_status_Married',
        'marital_status_Single',
    ])
)
X_test_init.info()

In [None]:
# Preview the first rows of the readable test frame.
X_test_init.head()

In [None]:
# Model-input version of the test split: 'user_id' is removed because the feature
# matrix the models were fitted on (label and user_id dropped) does not contain it.
X_test_fit = X_test.drop(columns=['user_id'])
X_test_fit.info()

In [None]:
# --- EXPORT PREDICTIONS ---

# Estimators are read back from `models`; this cell assumes they were already fitted
# by the evaluation cell above.
gb_model, lr_model = models["Gradient Boosting"], models["Logistic Regression"]

# Hard label and second predict_proba column for each model, Gradient Boosting first.
gb_pred = gb_model.predict(X_test_fit)
gb_proba = gb_model.predict_proba(X_test_fit)[:, 1]
lr_pred = lr_model.predict(X_test_fit)
lr_proba = lr_model.predict_proba(X_test_fit)[:, 1]

# Readable test features + true label + both models' outputs. All added columns are
# plain arrays, so they line up with X_test_init by position.
export_df = X_test_init.assign(
    label=y_test.values,
    gb_pred=gb_pred,
    gb_proba=gb_proba,
    lr_pred=lr_pred,
    lr_proba=lr_proba,
)

# Write to Drive without the index column.
export_df.to_csv(
    "/content/drive/MyDrive/AI Project/gb_lr_predictions.csv",
    index=False,
)
print("Saved successfully")

In [None]:
# Read the exported training CSV back and print its schema summary as a sanity check.
train_export_path = "/content/drive/MyDrive/AI Project/model_train_data.csv"
check1 = pd.read_csv(train_export_path)
check1.info()

In [None]:
# Read the exported predictions CSV back and print its schema summary as a sanity check.
predictions_export_path = "/content/drive/MyDrive/AI Project/gb_lr_predictions.csv"
check2 = pd.read_csv(predictions_export_path)
check2.info()