In [3]:
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import sqlite3
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [4]:
conn = sqlite3.connect(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\data.db')

cursor = conn.cursor()
query = ('''
WITH mean_deposite as (
    select 
         client_id,
         AVG(balance) as mean_balance,
         MAX(balance) as max_balance,
         MIN(balance) as min_balance,
         MAX(balance)-MIN(balance) as dif_balance,
         currency
    from balances
    group by 1
)
Select * 
from inv_campaign_eval
left join client
on inv_campaign_eval.client_id = client.client_id
left join client_products
on inv_campaign_eval.client_id = client_products.client_id
left join mean_deposite
on inv_campaign_eval.client_id = mean_deposite.client_id''')
df = pd.read_sql_query(query, conn)

df.head()

Unnamed: 0,client_id,poutcome,client_id.1,age,job,marital,education,gender,client_id.2,has_deposits,loan,has_insurance,has_mortgage,client_id.3,mean_balance,max_balance,min_balance,dif_balance,currency
0,249789938,success,249789938,38.0,services,married,secondary,M,249789938,yes,no,n,no,249789938,2669.532734,2775.615374,2515.500059,260.115315,CZK
1,1504633819,failure,1504633819,38.0,management,married,tertiary,M,1504633819,no,yes,n,yes,1504633819,5440.862334,5529.866975,5344.385346,185.481629,CZK
2,1952195738,success,1952195738,34.0,services,married,secondary,M,1952195738,yes,yes,n,no,1952195738,2922.958852,3016.314818,2852.824056,163.490762,CZK
3,989726384,success,989726384,62.0,retired,married,secondary,F,989726384,yes,no,n,no,989726384,3124.614843,3212.174671,3011.44591,200.728761,CZK
4,1971878545,failure,1971878545,36.0,management,single,tertiary,M,1971878545,yes,no,n,no,1971878545,474.083033,571.028715,362.809777,208.218939,CZK


In [5]:
import joblib


age_pipeline = joblib.load(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\age_pipeline.pkl')
missing_age_indices = df['age'].isnull()
predicted_ages = age_pipeline.predict(df[missing_age_indices])

# Step 2: Replace missing values with predicted values
df.loc[missing_age_indices, 'age'] = predicted_ages

In [6]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

labels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)



In [7]:
df = df.T.drop_duplicates().T
df.mean_balance = pd.to_numeric(df.mean_balance)
df.max_balance = pd.to_numeric(df.max_balance)
df.min_balance = pd.to_numeric(df.min_balance)
df.dif_balance = pd.to_numeric(df.dif_balance)
df.age=pd.to_numeric(df.age)
df.age_group=pd.to_numeric(df.age_group)

In [8]:
rate = {'CZK': 1, 'USD': 23, 'EUR': 25}

df.mean_balance = df.apply(lambda row: row['mean_balance'] * rate[row['currency']], axis=1)
df.max_balance = df.apply(lambda row: row['max_balance'] * rate[row['currency']], axis=1)
df.min_balance = df.apply(lambda row: row['min_balance'] * rate[row['currency']], axis=1)



In [9]:
query1 = ('''
Select * 
from inv_campaign_eval
left join balances
on inv_campaign_eval.client_id = balances.client_id''')
dif_bal = pd.read_sql_query(query1, conn)
dif_bal = dif_bal.T.drop_duplicates().T

rate = {'CZK': 1, 'USD': 23, 'EUR': 25}
dif_bal.balance = dif_bal.apply(lambda row: row['balance'] * rate[row['currency']], axis=1)

dif_bal['balance_change'] = dif_bal.groupby('client_id')['balance'].diff()
dif_bal['balance_change'].fillna(0, inplace=True)

sum_dif = dif_bal.groupby('client_id')['balance_change'].apply(lambda arr: sum(x ** 2 for x in arr))
dif_bal = dif_bal.merge(sum_dif, on='client_id', how='left')
dif_bal

dif_bal.drop(columns=['date', 'balance', 'balance_change_x', 'poutcome', 'currency'], inplace=True)
dif_bal.drop_duplicates(inplace=True)

df = df.merge(dif_bal, on='client_id', how='left')
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dif_bal['balance_change'].fillna(0, inplace=True)


Unnamed: 0,client_id,poutcome,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency,age_group,balance_change_y
0,249789938,success,38.0,services,married,secondary,M,yes,no,n,no,2669.532734,2775.615374,2515.500059,260.115315,CZK,30,128149.811845
1,1504633819,failure,38.0,management,married,tertiary,M,no,yes,n,yes,5440.862334,5529.866975,5344.385346,185.481629,CZK,30,119577.024704
2,1952195738,success,34.0,services,married,secondary,M,yes,yes,n,no,2922.958852,3016.314818,2852.824056,163.490762,CZK,30,57924.635507
3,989726384,success,62.0,retired,married,secondary,F,yes,no,n,no,3124.614843,3212.174671,3011.445910,200.728761,CZK,60,166963.270898
4,1971878545,failure,36.0,management,single,tertiary,M,yes,no,n,no,474.083033,571.028715,362.809777,208.218939,CZK,30,147742.223144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2294,1270855655,failure,58.0,admin.,divorced,secondary,F,yes,no,y,yes,3349.398115,3445.072323,3245.043023,200.029300,CZK,50,111099.751243
2295,471724023,success,72.0,retired,married,primary,M,yes,no,n,no,4645.473515,4736.613098,4493.914363,242.698736,CZK,70,165275.194846
2296,620088171,failure,34.0,admin.,married,tertiary,F,yes,no,y,yes,895.455390,979.161966,771.707441,207.454525,CZK,30,129786.760767
2297,1518319265,success,39.0,management,married,unknown,F,yes,no,n,no,77.494621,226.953215,-36.431915,263.385130,CZK,30,160942.081353


In [10]:
chosen = df[(df['poutcome'] == 'success') & (df['has_deposits'] == 'yes')]
chosen.count()


client_id           978
poutcome            978
age                 978
job                 940
marital             978
education           978
gender              978
has_deposits        978
loan                978
has_insurance       978
has_mortgage        978
mean_balance        978
max_balance         978
min_balance         978
dif_balance         978
currency            978
age_group           978
balance_change_y    978
dtype: int64

In [11]:
df1 = df.drop(columns='client_id')
y = df1.poutcome
X = df1.drop(columns = 'poutcome')
enc = LabelEncoder()
y=enc.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
class InteractionScoreTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Assuming the first two columns are the ones we're interested in
        self.max_interaction = X[:, :2].max()
        return self

    def transform(self, X):
        # Create an interaction score: a simple product of the first two columns
        interaction_score = (X[:, 0] * X[:, 1]) / self.max_interaction
        return np.hstack((X, interaction_score[:, None]))

num_features = [ 'age', 'age_group', 'balance_change_y']
cat_features = ['job', 'currency']
labeled_features = [ 'has_deposits', 'loan', 'gender', 'education', 'marital', 'has_insurance', 'has_mortgage']

# Create transformers for the numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

labeled_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='no')),
    ('label', OrdinalEncoder())])

# Create a column transformer to apply the transformations to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
        ('label', labeled_transformer, labeled_features)
            ])

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('interaction', InteractionScoreTransformer()),
    ('classifier', GradientBoostingClassifier(criterion = 'friedman_mse', learning_rate=0.1, loss='log_loss', n_estimators=60))])

In [14]:
pipeline.fit(X_train, y_train)

In [26]:
# Fit the preprocessor
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100), "Recall: {:.2f}%".format(recall * 100))

Accuracy: 70.65% Recall: 74.89%


In [74]:
parameters = {
    'classifier__loss': ('log_loss', 'exponential'),
    'classifier__learning_rate': [0.01, 0.1, 0.5, 1],
    'classifier__n_estimators': [50, 100, 150, 200],
    'classifier__criterion': ('friedman_mse', 'squared_error')
}

gs = GridSearchCV(pipeline, param_grid=parameters, scoring='recall')
gs.fit(X_train, y_train)
results = pd.DataFrame(gs.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__learning_rate,param_classifier__loss,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.112816,0.019232,0.006132,0.002089,friedman_mse,0.01,log_loss,50,"{'classifier__criterion': 'friedman_mse', 'cla...",0.695906,0.725146,0.694118,0.664706,0.741176,0.704211,0.026594,35
1,0.192955,0.005001,0.006841,0.000166,friedman_mse,0.01,log_loss,100,"{'classifier__criterion': 'friedman_mse', 'cla...",0.701754,0.725146,0.694118,0.705882,0.747059,0.714792,0.019106,25
2,0.284936,0.009201,0.002818,0.003451,friedman_mse,0.01,log_loss,150,"{'classifier__criterion': 'friedman_mse', 'cla...",0.754386,0.777778,0.694118,0.741176,0.782353,0.749962,0.031743,3
3,0.377043,0.007503,0.004978,0.006418,friedman_mse,0.01,log_loss,200,"{'classifier__criterion': 'friedman_mse', 'cla...",0.754386,0.801170,0.694118,0.735294,0.764706,0.749935,0.035193,6
4,0.108334,0.006190,0.002466,0.003370,friedman_mse,0.01,exponential,50,"{'classifier__criterion': 'friedman_mse', 'cla...",0.701754,0.725146,0.694118,0.670588,0.741176,0.706557,0.024559,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.361264,0.004511,0.007392,0.002918,squared_error,1,log_loss,200,"{'classifier__criterion': 'squared_error', 'cl...",0.631579,0.666667,0.641176,0.629412,0.605882,0.634943,0.019655,59
60,0.098073,0.002645,0.007669,0.002075,squared_error,1,exponential,50,"{'classifier__criterion': 'squared_error', 'cl...",0.637427,0.701754,0.682353,0.635294,0.647059,0.660777,0.026561,47
61,0.188789,0.002546,0.005872,0.002984,squared_error,1,exponential,100,"{'classifier__criterion': 'squared_error', 'cl...",0.608187,0.654971,0.688235,0.629412,0.629412,0.642043,0.027445,53
62,0.278418,0.005439,0.005136,0.003510,squared_error,1,exponential,150,"{'classifier__criterion': 'squared_error', 'cl...",0.614035,0.649123,0.705882,0.635294,0.623529,0.645573,0.032361,50


In [103]:
import joblib
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']

In [76]:
results.loc[results["rank_test_score"] == 1]
#DecisionTreeClassifier(criterion='entropy', max_depth = 6, min_samples_split = 3)
#AdaBoostClassifier(algorithm = 'SAMME.R', learning_rate = 0.5, n_estimators = 50)
#GradientBoostingClassifier(criterion = 'friedman_mse', learning_rate=0.01, loss='log_loss', n_estimators=200)  for only main features
#GradientBoostingClassifier(criterion = 'squared_error', learning_rate=0.1, loss='log_loss', n_estimators=50)  for all features 
#GradientBoostingClassifier(criterion = 'squared_error', learning_rate=0.1, loss='exponential', n_estimators=200)  for modified list of features features 


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__learning_rate,param_classifier__loss,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
47,0.367123,0.008894,0.006093,0.003306,squared_error,0.1,exponential,200,"{'classifier__criterion': 'squared_error', 'cl...",0.725146,0.789474,0.764706,0.729412,0.747059,0.751159,0.02374,1
