# Modelling

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


In [6]:
df = pd.read_csv("https://raw.githubusercontent.com/Refdinal/insight-hustler/refs/heads/master/data/processed/data_preprocessed.csv")

In [7]:
df

Unnamed: 0,city_development_index,relevent_experience,education_level,experience,last_new_job,training_hours,gender_Female,gender_Male,gender_Other,enrolled_university_Full time course,...,company_size_Small,company_size_Unknown,company_type_Early Stage Startup,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,company_type_Unknown,target
0,0.920,1,3,21,1,3.610918,0,1,0,0,...,0,1,0,0,0,0,0,0,1,1.0
1,0.776,0,3,15,5,3.871201,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0.0
2,0.624,0,3,5,0,4.430817,0,0,1,1,...,0,1,0,0,0,0,0,0,1,0.0
3,0.767,1,4,21,4,2.197225,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0.0
4,0.764,1,3,11,1,3.218876,0,0,1,0,...,0,1,0,0,0,0,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18000,0.878,0,3,14,1,3.761200,0,1,0,0,...,0,1,0,0,0,0,0,0,1,1.0
18001,0.920,1,3,14,4,3.970292,0,1,0,0,...,0,1,0,0,0,0,0,0,1,1.0
18002,0.920,1,3,21,4,3.806662,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0.0
18003,0.802,1,2,0,2,4.584967,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0.0


## Split Data Train-test

In [8]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## SMOTE for imbalance data to training data

In [10]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

## Scaling data dengan standar scaler

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)  # Fit dan transform data latih
X_test_scaled = scaler.transform(X_test)  # Hanya transform data uji

# 1. Modelling

In [30]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)

def evaluate_metrics(model, X_train, y_train, X_test, y_test):
    """
    Evaluates a classification model and returns a DataFrame with key metrics for both train and test datasets.
    
    Parameters:
    - model: Trained classification model (e.g., LogisticRegression, RandomForestClassifier, etc.)
    - X_train: Training features
    - y_train: Training labels
    - X_test: Test features
    - y_test: Test labels
    
    Returns:
    - metrics_df: A DataFrame containing evaluation metrics for both train and test sets.
    """
    # Predictions and probabilities
    y_pred_train = model.predict(X_train)
    y_proba_train = model.predict_proba(X_train)[:, 1]
    
    y_pred_test = model.predict(X_test)
    y_proba_test = model.predict_proba(X_test)[:, 1]
    
    # Metrics for Train Set
    train_metrics = {
        "Accuracy": accuracy_score(y_train, y_pred_train),
        "Precision": precision_score(y_train, y_pred_train),
        "Recall": recall_score(y_train, y_pred_train),
        "F1-Score": f1_score(y_train, y_pred_train),
        "ROC-AUC": roc_auc_score(y_train, y_proba_train),
    }
    
    # Metrics for Test Set
    test_metrics = {
        "Accuracy": accuracy_score(y_test, y_pred_test),
        "Precision": precision_score(y_test, y_pred_test),
        "Recall": recall_score(y_test, y_pred_test),
        "F1-Score": f1_score(y_test, y_pred_test),
        "ROC-AUC": roc_auc_score(y_test, y_proba_test),
    }
    
    # Combine metrics into a DataFrame
    metrics_df = pd.DataFrame([train_metrics, test_metrics], index=["Train", "Test"])
    
    return metrics_df.T

## Model 1 : Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train_resampled)

In [33]:
logreg_result = evaluate_metrics(logreg, X_train_scaled, y_train_resampled, X_test_scaled, y_test)
logreg_result

Unnamed: 0,Train,Test
Accuracy,0.786004,0.755623
Precision,0.796743,0.484651
Recall,0.76791,0.615112
F1-Score,0.782061,0.542144
ROC-AUC,0.864084,0.769948


## Model 2 : K-Nearest Neighbors

In [34]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train_resampled)

In [35]:
knn_result = evaluate_metrics(knn, X_train_scaled, y_train_resampled, X_test_scaled, y_test)
knn_result

Unnamed: 0,Train,Test
Accuracy,0.860183,0.730353
Precision,0.846092,0.443636
Recall,0.880539,0.576151
F1-Score,0.862972,0.501284
ROC-AUC,0.944377,0.72296


## Model 3 : Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_scaled, y_train_resampled)

In [40]:
dtree_result = evaluate_metrics(dtree, X_train_scaled, y_train_resampled, X_test_scaled, y_test)
dtree_result

Unnamed: 0,Train,Test
Accuracy,0.998984,0.696751
Precision,0.999908,0.383886
Recall,0.998061,0.478158
F1-Score,0.998984,0.425868
ROC-AUC,0.999998,0.622138


## Model 4 : Random Forest 

In [38]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train_resampled)

In [41]:
rf_model_result = evaluate_metrics(rf_model, X_train_scaled, y_train_resampled, X_test_scaled, y_test)
rf_model_result

Unnamed: 0,Train,Test
Accuracy,0.998984,0.752569
Precision,0.998708,0.477823
Recall,0.999261,0.559622
F1-Score,0.998985,0.515498
ROC-AUC,0.99999,0.756799


## Model 5 : XGboost

In [44]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train_resampled)

In [45]:
xgb_model_result = evaluate_metrics(xgb_model, X_train_scaled, y_train_resampled, X_test_scaled, y_test)
xgb_model_result

Unnamed: 0,Train,Test
Accuracy,0.882986,0.776173
Precision,0.874042,0.519922
Recall,0.894941,0.631641
F1-Score,0.884368,0.570362
ROC-AUC,0.954347,0.776462


# Model Comparison

In [47]:
comparison_df = pd.DataFrame()

In [51]:
comparison_df['Logistic Regression'] = logreg_result['Test']
comparison_df['K-Nearest Neighbors'] = knn_result['Test']
comparison_df['Decision Tree'] = dtree_result['Test']
comparison_df['Random Forest Classifier'] = rf_model_result['Test']
comparison_df['XGBoost'] = xgb_model_result['Test']

In [52]:
comparison_df

Unnamed: 0,Logistic Regression,K-Nearest Neighbors,Decision Tree,Random Forest Classifier,XGBoost
Accuracy,0.755623,0.730353,0.696751,0.752569,0.776173
Precision,0.484651,0.443636,0.383886,0.477823,0.519922
Recall,0.615112,0.576151,0.478158,0.559622,0.631641
F1-Score,0.542144,0.501284,0.425868,0.515498,0.570362
ROC-AUC,0.769948,0.72296,0.622138,0.756799,0.776462
