In [1]:
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import sqlite3
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
conn = sqlite3.connect(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\data.db')
cursor = conn.cursor()
query = ('''
WITH mean_deposite as (
    select 
         client_id,
         AVG(balance) as mean_balance,
         MAX(balance) as max_balance,
         MIN(balance) as min_balance,
         MAX(balance)-MIN(balance) as dif_balance,
         currency
    from balances
    group by 1
)
Select * 
from inv_campaign_eval
left join client
on inv_campaign_eval.client_id = client.client_id
left join client_products
on inv_campaign_eval.client_id = client_products.client_id
left join mean_deposite
on inv_campaign_eval.client_id = mean_deposite.client_id''')
df = pd.read_sql_query(query, conn)

df.head()

Unnamed: 0,client_id,poutcome,client_id.1,age,job,marital,education,gender,client_id.2,has_deposits,loan,has_insurance,has_mortgage,client_id.3,mean_balance,max_balance,min_balance,dif_balance,currency
0,249789938,success,249789938,38.0,services,married,secondary,M,249789938,yes,no,n,no,249789938,2669.532734,2775.615374,2515.500059,260.115315,CZK
1,1504633819,failure,1504633819,38.0,management,married,tertiary,M,1504633819,no,yes,n,yes,1504633819,5440.862334,5529.866975,5344.385346,185.481629,CZK
2,1952195738,success,1952195738,34.0,services,married,secondary,M,1952195738,yes,yes,n,no,1952195738,2922.958852,3016.314818,2852.824056,163.490762,CZK
3,989726384,success,989726384,62.0,retired,married,secondary,F,989726384,yes,no,n,no,989726384,3124.614843,3212.174671,3011.44591,200.728761,CZK
4,1971878545,failure,1971878545,36.0,management,single,tertiary,M,1971878545,yes,no,n,no,1971878545,474.083033,571.028715,362.809777,208.218939,CZK


In [3]:
import joblib


age_pipeline = joblib.load(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\age_pipeline.pkl')
missing_age_indices = df['age'].isnull()
predicted_ages = age_pipeline.predict(df[missing_age_indices])

# Step 2: Replace missing values with predicted values
df.loc[missing_age_indices, 'age'] = predicted_ages

In [4]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

labels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

In [5]:
df = df.T.drop_duplicates().T
df.mean_balance = pd.to_numeric(df.mean_balance)
df.max_balance = pd.to_numeric(df.max_balance)
df.min_balance = pd.to_numeric(df.min_balance)
df.dif_balance = pd.to_numeric(df.dif_balance)
df.age=pd.to_numeric(df.age)
df.age_group=pd.to_numeric(df.age_group)

In [6]:
rate = {'CZK': 1, 'USD': 23, 'EUR': 25}

df.mean_balance = df.apply(lambda row: row['mean_balance'] * rate[row['currency']], axis=1)
df.max_balance = df.apply(lambda row: row['max_balance'] * rate[row['currency']], axis=1)
df.min_balance = df.apply(lambda row: row['min_balance'] * rate[row['currency']], axis=1)

In [7]:
query1 = ('''
Select * 
from inv_campaign_eval
left join balances
on inv_campaign_eval.client_id = balances.client_id''')
dif_bal = pd.read_sql_query(query1, conn)
dif_bal = dif_bal.T.drop_duplicates().T

rate = {'CZK': 1, 'USD': 23, 'EUR': 25}
dif_bal.balance = dif_bal.apply(lambda row: row['balance'] * rate[row['currency']], axis=1)

dif_bal['balance_change'] = dif_bal.groupby('client_id')['balance'].diff()
dif_bal['balance_change'].fillna(0, inplace=True)

sum_dif = dif_bal.groupby('client_id')['balance_change'].apply(lambda arr: sum(x ** 2 for x in arr))
dif_bal = dif_bal.merge(sum_dif, on='client_id', how='left')
dif_bal

dif_bal.drop(columns=['date', 'balance', 'balance_change_x', 'poutcome', 'currency'], inplace=True)
dif_bal.drop_duplicates(inplace=True)

df = df.merge(dif_bal, on='client_id', how='left')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dif_bal['balance_change'].fillna(0, inplace=True)


In [8]:
y = df.poutcome
X = df.drop(columns = 'poutcome')
enc = LabelEncoder()
y=enc.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train

Unnamed: 0,client_id,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency,age_group,balance_change_y
1131,925426077,37.0,management,married,tertiary,F,yes,no,n,no,2294.232788,2384.976236,2164.535085,220.441150,CZK,30,238574.199207
351,878293598,31.0,management,married,tertiary,M,yes,no,n,no,4336.226496,4436.659545,4221.369982,215.289563,CZK,30,128227.201521
944,440706813,45.0,management,married,tertiary,M,no,no,n,yes,-13.082433,80.306071,-127.965976,208.272046,CZK,40,161782.115373
1958,447282181,41.0,admin.,divorced,secondary,M,no,no,n,yes,2552.330411,2641.322256,2448.598687,192.723569,CZK,40,112283.135627
210,554363693,55.0,blue-collar,divorced,secondary,F,yes,no,n,yes,1611.279521,1729.479437,1517.368825,212.110612,CZK,50,200682.281727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,1664835114,57.0,technician,married,tertiary,F,yes,no,n,no,1186.553353,1349.993692,1100.400941,249.592751,CZK,50,147995.008535
1095,1556975193,36.0,blue-collar,single,secondary,F,no,yes,n,yes,215.277657,363.663505,102.452670,261.210835,CZK,30,108813.735528
1130,1032880478,34.0,blue-collar,married,secondary,M,no,no,n,yes,3002.873173,3160.610344,2864.941448,295.668896,CZK,30,153099.087059
1294,1531243162,36.0,admin.,divorced,secondary,F,yes,no,n,yes,426.526694,526.059702,354.715924,171.343778,CZK,30,85666.053177


In [19]:
num_features = ['age', 'balance_change_y', 'max_balance']
labeled_features = ['has_deposits', 'loan', 'has_mortgage','education' ]

# 'dif_balance', 'min_balance','mean_balance', 'max_balance',
#     

# Create transformers for the numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

labeled_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='no')),
    ('label', OrdinalEncoder())]),
    

# Create a column transformer to apply the transformations to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('label', labeled_transformer, labeled_features)
            ])

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators= 150, min_samples_split= 10, min_samples_leaf= 2, max_features= 'sqrt', max_depth= 10, criterion= 'entropy', bootstrap= False))])

In [20]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100), "Recall: {:.2f}%".format(recall * 100))

Accuracy: 71.96% Recall: 76.26%


In [18]:
joblib.dump(pipeline, 'pipeline_simplified.pkl')

['pipeline_simplified.pkl']