In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, roc_auc_score,accuracy_score
df = pd.read_csv('models/data/data.csv', sep=';')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA


class DataPreparer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols_num = None
        self.cols_cat = None

    def fit(self, X, y=None):
        X = X.drop(columns=['duration'])    
        self.cols_num = X.select_dtypes(include=['number']).columns.tolist()
        self.cols_cat = X.select_dtypes(include=['object', 'category']).columns.tolist()
        if 'y' in self.cols_cat:
            self.cols_cat.remove('y')
        if "y" in self.cols_num:
            self.cols_num.remove("y")
        return self

    def transform(self, X):
        # Create dummy variables for categorical columns
        X = pd.concat([X, pd.get_dummies(X[self.cols_cat], drop_first=False)], axis=1)
        # Create the output label
        X['OUTPUT_LABEL'] = (X['y'] == 'yes').astype(int)
        # Define the input columns
        cols_input = self.cols_num + list(pd.get_dummies(X[self.cols_cat], drop_first=False).columns)
        # Return the prepared data
        scaler = StandardScaler()
        economic_factors = X[['cons.price.idx', 'cons.conf.idx', 'emp.var.rate','euribor3m', 'nr.employed']]
        economic_factors_scaled = scaler.fit_transform(economic_factors)

        pca = PCA(n_components=1)
        principal_components = pca.fit_transform(economic_factors_scaled)


        X['Econ.Stab.Sent.PCA']=principal_components[:,0]

        X = X.drop(['cons.price.idx', 'emp.var.rate', 'euribor3m', 'nr.employed'], axis=1)
        for col in ['cons.price.idx', 'emp.var.rate', 'euribor3m', 'nr.employed']:
            if col in cols_input:
                cols_input.remove(col)
        cols_input += ['Econ.Stab.Sent.PCA']
        return X[cols_input + ['OUTPUT_LABEL']]

# Example usage
df = pd.read_csv('models/data/data.csv', sep=';')
data_preparer = DataPreparer()
df_data = data_preparer.fit_transform(df)

In [3]:
df_data = df_data.sample(n = len(df_data), random_state = 2024)
df_data = df_data.reset_index(drop = True)

In [4]:
from sklearn.model_selection import train_test_split

df_train_all, df_valid_test = train_test_split(df_data, test_size=0.30, random_state=42)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours

# Function to balance the dataset using RandomUnderSampler
def balance_data(df,balancer_type='SMOTE'):
    X = df.drop(columns=["OUTPUT_LABEL"])
    y = df["OUTPUT_LABEL"]
    
    if balancer_type == 'SMOTE':
        balancer = SMOTE(sampling_strategy='auto', random_state=42)
    elif balancer_type == 'RandomUnderSampler':
        balancer = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
    elif balancer_type == 'EditedNearestNeighbours':
        balancer = EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3, kind_sel='all')
    elif balancer_type == 'None':
        return df
    else:
        raise ValueError("balancer_type must be 'SMOTE', 'RandomUnderSampler', or 'EditedNearestNeighbours'")
    
    # X_resampled, y_resampled = rus.fit_resample(X, y)
    # X_resampled, y_resampled = enn.fit_resample(X, y)
    X_resampled, y_resampled = balancer.fit_resample(X, y)
    
    # Recombine the features and labels into a balanced DataFrame
    df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), 
                              pd.DataFrame(y_resampled, columns=["OUTPUT_LABEL"])], axis=1)
    
    # Shuffle the DataFrame
    df_resampled = df_resampled.sample(n=len(df_resampled), random_state=42).reset_index(drop=True)
    
    return df_resampled


# resampling using smote

In [None]:

# Balance the training, validation, and test data
df_train = balance_data(df_train_all)
df_valid = balance_data(df_valid)


In [None]:
# Create input and output matrices
X_train = df_train.drop(columns=["OUTPUT_LABEL"]).values
X_train_all = df_train_all.drop(columns=["OUTPUT_LABEL"]).values
X_valid = df_valid.drop(columns=["OUTPUT_LABEL"]).values

y_train = df_train['OUTPUT_LABEL'].values
y_train_all = df_train_all['OUTPUT_LABEL'].values
y_valid = df_valid['OUTPUT_LABEL'].values


In [7]:

# Scale data
scaler = StandardScaler()
scaler.fit(X_train_all)
X_train_all_tf = scaler.transform(X_train_all)
X_train_tf = scaler.transform(X_train)
X_valid_tf = scaler.transform(X_valid)


In [None]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=42)
gbc.fit(X_train_tf, y_train)

random_grid_gbc = {
    'n_estimators': range(50, 200, 50),
    'max_depth': range(1, 5, 1),
    'learning_rate': [0.001, 0.01, 0.1]
}

auc_scoring = make_scorer(roc_auc_score)
gbc_random_smote = RandomizedSearchCV(estimator=gbc, param_distributions=random_grid_gbc, n_iter=20, cv=2, scoring=auc_scoring, verbose=0, random_state=42)
gbc_random_smote.fit(X_train_tf, y_train)


In [None]:

y_train_preds_random = gbc_random_smote.best_estimator_.predict_proba(X_train_tf)[:, 1]
y_valid_preds_random = gbc_random_smote.best_estimator_.predict_proba(X_valid_tf)[:, 1]


In [26]:
import numpy as np

def print_report(y_actual, y_pred, thresh):
    try:
        auc = roc_auc_score(y_actual, y_pred)
    except Exception as e:
        print(e)
        auc = -1
    try:
        accuracy = accuracy_score(y_actual, (y_pred > thresh))
    except Exception as e:
        print(e)
        accuracy = -1
    try:
        recall = recall_score(y_actual, (y_pred > thresh))
    except Exception as e:
        print(e)
        recall = -1
    try:
        precision = precision_score(y_actual, (y_pred > thresh))
    except Exception as e:
        print(e)
        precision = -1
    try:
        f1 = np.divide(2 * (precision * recall), (precision + recall), out=np.zeros(1), where=(precision + recall) != 0)[0] if precision != -1 and recall != -1 else -1
    except Exception as e:
        print(e)
        f1 = -1
    return auc, accuracy, recall, precision, f1


In [11]:


thresh = 0.5
train_report = print_report(y_train, y_train_preds_random, thresh)
valid_report = print_report(y_valid, y_valid_preds_random, thresh)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42)
df_test = balance_data(df_test, balancer_type='SMOTE')


In [None]:
X_test = df_test.drop(columns=["OUTPUT_LABEL"]).values
y_test = df_test['OUTPUT_LABEL'].values
X_test_tf = scaler.transform(X_test)
y_test_preds_random = gbc_random_smote.best_estimator_.predict_proba(X_test_tf)[:, 1]
test_report = print_report(y_test, y_test_preds_random, thresh)


print(f"Training Report:\n\t  AUC={train_report[0]:.3f}\n\t Accuracy={train_report[1]:.3f}\n\t Recall={train_report[2]:.3f}\n\t Precision={train_report[3]:.3f}\n\t F1={train_report[4]:.3f}")
print(f"Validation Report:\n\t  AUC={valid_report[0]:.3f}\n\t Accuracy={valid_report[1]:.3f}\n\t Recall={valid_report[2]:.3f}\n\t Precision={valid_report[3]:.3f}\n\t F1={valid_report[4]:.3f}")
print("test dataframe has been resampled using the same method as the training data")
print(f"Test Report:\n\t  AUC={test_report[0]:.3f}\n\t Accuracy={test_report[1]:.3f}\n\t Recall={test_report[2]:.3f}\n\t Precision={test_report[3]:.3f}\n\t F1={test_report[4]:.3f}")


Training Report:
	  AUC=0.975
	 Accuracy=0.936
	 Recall=0.907
	 Precision=0.962
	 F1=0.934
Validation Report:
	  AUC=0.970
	 Accuracy=0.935
	 Recall=0.905
	 Precision=0.963
	 F1=0.933
test dataframe has been resampled using the same method as the training data
Test Report:
	  AUC=0.972
	 Accuracy=0.932
	 Recall=0.906
	 Precision=0.956
	 F1=0.930


In [27]:
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42)
df_test = balance_data(df_test, balancer_type='None')
print("Test dataframe has not been resampled")
X_test = df_test.drop(columns=["OUTPUT_LABEL"]).values
y_test = df_test['OUTPUT_LABEL'].values
X_test_tf = scaler.transform(X_test)
y_test_preds_random = gbc_random_smote.best_estimator_.predict_proba(X_test_tf)[:, 1]
test_report = print_report(y_test, y_test_preds_random, thresh)
print(f"Test Report:\n\t  AUC={test_report[0]:.3f}\n\t Accuracy={test_report[1]:.3f}\n\t Recall={test_report[2]:.3f}\n\t Precision={test_report[3]:.3f}\n\t F1={test_report[4]:.3f}")


Test dataframe has not been resampled
Test Report:
	  AUC=0.801
	 Accuracy=0.820
	 Recall=0.660
	 Precision=0.358
	 F1=0.464


# Resampling using random undersampling

In [14]:

# Balance the training, validation, and test data
df_train = balance_data(df_train_all, balancer_type='RandomUnderSampler')
df_valid = balance_data(df_valid, balancer_type='RandomUnderSampler')


In [15]:
# Create input and output matrices
X_train = df_train.drop(columns=["OUTPUT_LABEL"]).values
X_train_all = df_train_all.drop(columns=["OUTPUT_LABEL"]).values
X_valid = df_valid.drop(columns=["OUTPUT_LABEL"]).values

y_train = df_train['OUTPUT_LABEL'].values
y_train_all = df_train_all['OUTPUT_LABEL'].values
y_valid = df_valid['OUTPUT_LABEL'].values


In [16]:

# Scale data
scaler = StandardScaler()
scaler.fit(X_train_all)
X_train_all_tf = scaler.transform(X_train_all)
X_train_tf = scaler.transform(X_train)
X_valid_tf = scaler.transform(X_valid)


In [19]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=42)
gbc.fit(X_train_tf, y_train)

random_grid_gbc = {
    'n_estimators': range(50, 200, 50),
    'max_depth': range(1, 5, 1),
    'learning_rate': [0.001, 0.01, 0.1]
}

auc_scoring = make_scorer(roc_auc_score)
gbc_random_Rus = RandomizedSearchCV(estimator=gbc, param_distributions=random_grid_gbc, n_iter=20, cv=2, scoring=auc_scoring, verbose=0, random_state=42)
gbc_random_Rus.fit(X_train_tf, y_train)


In [28]:

y_train_preds_random_Rus = gbc_random_Rus.best_estimator_.predict_proba(X_train_tf)[:, 1]
y_valid_preds_random_Rus = gbc_random_Rus.best_estimator_.predict_proba(X_valid_tf)[:, 1]


In [29]:


thresh = 0.5
train_report = print_report(y_train, y_train_preds_random_Rus, thresh)
valid_report = print_report(y_valid, y_valid_preds_random_Rus, thresh)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42)
df_test = balance_data(df_test, balancer_type='RandomUnderSampler')

In [31]:
X_test = df_test.drop(columns=["OUTPUT_LABEL"]).values
y_test = df_test['OUTPUT_LABEL'].values
X_test_tf = scaler.transform(X_test)
y_test_preds_random_Rus= gbc_random_Rus.best_estimator_.predict_proba(X_test_tf)[:, 1]
test_report = print_report(y_test, y_test_preds_random_Rus, thresh)


print(f"Training Report:\n\t  AUC={train_report[0]:.3f}\n\t Accuracy={train_report[1]:.3f}\n\t Recall={train_report[2]:.3f}\n\t Precision={train_report[3]:.3f}\n\t F1={train_report[4]:.3f}")
print(f"Validation Report:\n\t  AUC={valid_report[0]:.3f}\n\t Accuracy={valid_report[1]:.3f}\n\t Recall={valid_report[2]:.3f}\n\t Precision={valid_report[3]:.3f}\n\t F1={valid_report[4]:.3f}")
print("test dataframe has been resampled using the same method as the training data")
print(f"Test Report:\n\t  AUC={test_report[0]:.3f}\n\t Accuracy={test_report[1]:.3f}\n\t Recall={test_report[2]:.3f}\n\t Precision={test_report[3]:.3f}\n\t F1={test_report[4]:.3f}")


Training Report:
	  AUC=0.816
	 Accuracy=0.751
	 Recall=0.655
	 Precision=0.811
	 F1=0.725
Validation Report:
	  AUC=0.781
	 Accuracy=0.735
	 Recall=0.646
	 Precision=0.786
	 F1=0.709
test dataframe has been resampled using the same method as the training data
Test Report:
	  AUC=0.800
	 Accuracy=0.751
	 Recall=0.660
	 Precision=0.806
	 F1=0.726


In [32]:
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42)
df_test = balance_data(df_test, balancer_type='None')
print("Test dataframe has not been resampled")
X_test = df_test.drop(columns=["OUTPUT_LABEL"]).values
y_test = df_test['OUTPUT_LABEL'].values
X_test_tf = scaler.transform(X_test)
y_test_preds_random = gbc_random_smote.best_estimator_.predict_proba(X_test_tf)[:, 1]
test_report = print_report(y_test, y_test_preds_random, thresh)
print(f"Test Report:\n\t  AUC={test_report[0]:.3f}\n\t Accuracy={test_report[1]:.3f}\n\t Recall={test_report[2]:.3f}\n\t Precision={test_report[3]:.3f}\n\t F1={test_report[4]:.3f}")


Test dataframe has not been resampled
Test Report:
	  AUC=0.801
	 Accuracy=0.820
	 Recall=0.660
	 Precision=0.358
	 F1=0.464
