<a href="https://colab.research.google.com/github/Szinacska/CI-CD-Project/blob/main/Bank_Churn_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, classification_report, roc_auc_score

In [8]:
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Churn_Banking_Modeling_ENG.csv")

In [9]:
dataset


Unnamed: 0,customer_id,flag_request_closure,amt_cust_value,flag_online_acc_opening,flag_mult_account_ownership,num_age,str_prov_domicile,str_prov_residence,num_year_first_account,str_profession,...,amt_current_liquidity,amt_current_managed,amt_current_administered,amt_6m_current_liquidity,amt_6m_current_managed,amt_6m_current_administered,flag_outgoing_sec_tranfer,flag_card_rejection,flag_loan_rejection,flag_deactivation_rid
0,2,no,65.03,0,0,38.0,BO,BO,2004.0,Impiegato,...,1634.57,2978.40,2980.92,1550.44,2853.20,2853.22,0,0,0,0
1,3,no,138.88,0,0,45.0,SA,SA,2000.0,Impiegato,...,11918.26,,34916.15,26122.17,,22053.82,0,0,0,0
2,4,no,546.54,0,0,61.0,VT,VT,2005.0,Altro/Nessuno,...,2671.95,,232776.62,16545.25,,216304.33,1,0,0,0
3,5,no,68.69,0,0,33.0,MI,MI,2010.0,Impiegato,...,19211.31,15013.53,15017.53,12500.00,,,0,0,0,0
4,7,no,2417.05,0,0,36.0,MI,MI,2001.0,Quadro,...,1526.38,,,1427.70,,21514.60,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377364,555146,no,5317.47,1,0,49.0,,,2008.0,Avvocato,...,27933.94,197987.97,239399.96,31673.56,243236.45,285346.45,0,0,0,0
377365,555147,no,17.35,0,0,42.0,CT,CT,2002.0,Docente/Insegnante,...,-909.87,,,-2782.54,,1098.68,0,0,0,0
377366,555148,no,334.73,0,0,54.0,RM,BZ,2001.0,Medico/Farmacista,...,72535.85,,15017.53,66142.32,,15208.61,0,0,0,0
377367,555149,no,5171.38,0,0,60.0,GE,GE,2007.0,Libero professionista/Titolare di impresa,...,115.87,338644.32,339296.07,154.64,276199.41,276199.47,0,0,0,0


In [10]:
dataset = dataset.rename(columns={"flag_request_closure": "Target"})

In [11]:
target_col = 'Target'
y = dataset[target_col]
X = dataset.drop(columns=[target_col, 'customer_id'])

In [12]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

In [13]:
print("Missing values per column:")
print(X.isnull().sum())


Missing values per column:
amt_cust_value                         0
flag_online_acc_opening                0
flag_mult_account_ownership            0
num_age                                7
str_prov_domicile                  21188
str_prov_residence                 19472
num_year_first_account                 0
str_profession                     36197
amt_income                        219893
str_cust_type                          0
str_gender                             7
Profilo_MIFID                      49257
amt_pricing_fee                    27396
amt_transfer_vs_competitors       288655
amt_tranfers_vs_no_competitors    176679
num_existing_services             167432
flag_salary_deposit                    0
amt_credit_card_spending          164706
amt_debit_card_spending           176004
num_website_access_count           50199
num_transactions_count             61605
num_trading_activities_count      290438
str_change_num_utilities          159014
flag_mortgage                 

In [15]:
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Or mean
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Changed 'sparse' to 'sparse_output'
])

In [16]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])

In [17]:
X_cleaned = preprocessor.fit_transform(X)

In [19]:
encoded_cat_cols = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(cat_cols)
all_feature_names = num_cols + list(encoded_cat_cols)
X_cleaned_df = pd.DataFrame(X_cleaned, columns=all_feature_names)

In [20]:
final_df = pd.concat([X_cleaned_df, y.reset_index(drop=True)], axis=1)

In [21]:
final_df.to_csv("cleaned_dataset.csv", index=False)

In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, classification_report, roc_auc_score

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_cleaned_df, y, test_size=0.2, random_state=42)

In [27]:

# Train Decision Tree
dt_model = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

In [28]:
# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

In [29]:
# Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]

In [32]:
# Evaluating my predictions 3000
def top_k_precision_recall(y_true, y_scores, k=3000):
    # Convert y_true to numerical values if it contains strings
    if y_true.dtype == object:  # Check if y_true is of object type (likely string)
        y_true = y_true.map({'si': 1, 'no': 0})  # Assuming 'si' is 1 and 'no' is 0

    top_k_idx = y_scores.argsort()[-k:][::-1]
    y_top_k = y_true.iloc[top_k_idx]
    precision = y_top_k.mean()  # since it's binary 0/1
    recall = y_top_k.sum() / y_true.sum()
    return precision, recall

In [33]:
# Print results of evaluations
print("Decision Tree:")
print("  Precision@3000:", *top_k_precision_recall(y_test.reset_index(drop=True), pd.Series(y_prob_dt)))
print("  ROC AUC:", roc_auc_score(y_test, y_prob_dt))

print("Random Forest:")
print("  Precision@3000:", *top_k_precision_recall(y_test.reset_index(drop=True), pd.Series(y_prob_rf)))
print("  ROC AUC:", roc_auc_score(y_test, y_prob_rf))

print("Logistic Regression:")
print("  Precision@3000:", *top_k_precision_recall(y_test.reset_index(drop=True), pd.Series(y_prob_lr)))
print("  ROC AUC:", roc_auc_score(y_test, y_prob_lr))

Decision Tree:
  Precision@3000: 0.037333333333333336 0.28211586901763225
  ROC AUC: 0.7480681043196994
Random Forest:
  Precision@3000: 0.05566666666666667 0.42065491183879095
  ROC AUC: 0.8040654751466076
Logistic Regression:
  Precision@3000: 0.051666666666666666 0.3904282115869018
  ROC AUC: 0.860114967105644
