In [1]:
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import sqlite3
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [2]:
conn = sqlite3.connect(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\data.db')
cursor = conn.cursor()
query = ('''
WITH mean_deposite as (
    SELECT 
        client_id,
        AVG(balance) as mean_balance,
        MAX(balance) as max_balance,
        MIN(balance) as min_balance,
        MAX(balance) - MIN(balance) as dif_balance,
        currency
    FROM balances
    GROUP BY client_id
)
SELECT client.* , client_products.*, mean_deposite.*
FROM client
LEFT JOIN inv_campaign_eval ON client.client_id = inv_campaign_eval.client_id
LEFT JOIN client_products ON client.client_id = client_products.client_id
LEFT JOIN mean_deposite ON client.client_id = mean_deposite.client_id
WHERE inv_campaign_eval.client_id is null''')
df = pd.read_sql_query(query, conn)

df = df.T.drop_duplicates().T

import joblib
age_pipeline = joblib.load(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\age_pipeline.pkl')

missing_age_indices = df['age'].isnull()
predicted_ages = age_pipeline.predict(df[missing_age_indices])
df.loc[missing_age_indices, 'age'] = predicted_ages

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

query1 = ('''
Select * 
from balances
left join inv_campaign_eval
on balances.client_id = inv_campaign_eval.client_id
WHERE inv_campaign_eval.client_id is null; ''')
dif_bal = pd.read_sql_query(query1, conn)
dif_bal = dif_bal.T.drop_duplicates().T
dif_bal.dropna(axis = 1, inplace = True)
rate = {'CZK': 1, 'USD': 23, 'EUR': 25}
dif_bal.balance = dif_bal.apply(lambda row: row['balance'] * rate[row['currency']], axis=1)

dif_bal['balance_change'] = dif_bal.groupby('client_id')['balance'].diff()
dif_bal['balance_change'].fillna(0, inplace=True)

sum_dif = dif_bal.groupby('client_id')['balance_change'].apply(lambda arr: sum(x ** 2 for x in arr))
dif_bal = dif_bal.merge(sum_dif, on='client_id', how='left')
dif_bal

dif_bal.drop(columns=['date', 'balance', 'balance_change_x', 'currency',], inplace=True)
dif_bal.drop_duplicates(inplace=True)
df = df.merge(dif_bal, on='client_id', how='left')

df.mean_balance = pd.to_numeric(df.mean_balance)
df.max_balance = pd.to_numeric(df.max_balance)
df.min_balance = pd.to_numeric(df.min_balance)
df.dif_balance = pd.to_numeric(df.dif_balance)
df.age=pd.to_numeric(df.age)
df.age_group=pd.to_numeric(df.age_group)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dif_bal['balance_change'].fillna(0, inplace=True)


In [18]:
df

Unnamed: 0,client_id,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency,age_group,balance_change_y
0,1222646323,46.0,services,divorced,unknown,M,no,no,n,no,960.562073,1054.517907,886.126906,168.391001,CZK,40,81006.572385
1,451375919,33.0,admin.,single,secondary,F,no,no,n,yes,1221.016419,1303.494818,1100.917203,202.577615,CZK,30,103154.973222
2,338972671,44.0,self-employed,married,secondary,F,no,no,y,yes,297.993265,446.676191,202.053088,244.623103,CZK,40,234426.309740
3,1472834688,36.0,blue-collar,married,primary,M,yes,no,n,yes,1919.318145,2011.939205,1853.387429,158.551776,CZK,30,103686.130884
4,1068680340,63.0,admin.,married,secondary,F,yes,no,n,yes,0.972130,77.806737,-104.261740,182.068477,CZK,60,128401.966138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8858,440873221,56.0,management,divorced,tertiary,F,no,no,n,no,386.995526,469.647273,270.665696,198.981578,CZK,50,130624.277077
8859,543198889,40.0,blue-collar,single,unknown,M,no,no,n,yes,18.441905,125.568092,-98.265981,223.834074,CZK,40,182976.060921
8860,458558558,41.0,housemaid,divorced,primary,F,no,no,n,yes,25.552707,116.816943,-117.991255,234.808198,CZK,40,179548.802788
8861,1426334329,59.0,technician,married,primary,F,no,no,y,yes,3157.720202,3263.371390,3024.384824,238.986566,CZK,50,152024.406007


In [15]:
import joblib
class InteractionScoreTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Assuming the first two columns are the ones we're interested in
        self.max_interaction = X[:, :2].max()
        return self

    def transform(self, X):
        # Create an interaction score: a simple product of the first two columns
        interaction_score = (X[:, 0] * X[:, 1]) / self.max_interaction
        return np.hstack((X, interaction_score[:, None]))

pipeline = joblib.load(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\pipeline_simplified.pkl')

In [16]:
result = []
def prediction(X):
    predictions = pipeline.predict(X)  # Call predict method of the pipeline
    
    for i, pred in enumerate(predictions):
        result.append((X.iloc[i]['client_id'], pred))
    return result

In [19]:
prediction(df)
res = pd.DataFrame(result)
res_suc = res[res[1] == 1]
map = {0: 'client_id', 1: 'outcome'}
outcome = res_suc.rename(columns=map)
outcome


Unnamed: 0,client_id,outcome
4,1068680340,1
6,422084751,1
7,123316323,1
9,1861869059,1
14,122279077,1
...,...,...
8836,1563192915,1
8838,1903776674,1
8839,887694228,1
8840,1182886062,1


In [3]:
model = joblib.load(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\model_simplified.pkl')


In [9]:
X = df[['client_id', 'age', 'has_deposits', 'loan', 'has_mortgage', 'balance_change_y', 'max_balance', 'education']]

label_encoders = {}
columns_to_encode = ['has_deposits', 'loan', 'has_mortgage','education' ]

for col in columns_to_encode:
    label_encoders[col] = LabelEncoder()
    X.loc[:, col] = label_encoders[col].fit_transform(X[col])

scaler = StandardScaler()
X[['age', 'has_deposits', 'loan', 'has_mortgage', 'balance_change_y', 'max_balance', 'education']] = scaler.fit_transform(X[['age', 'has_deposits', 'loan', 'has_mortgage', 'balance_change_y', 'max_balance', 'education']])

X.set_index(X.client_id, inplace=True)
X.drop(columns='client_id', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['age', 'has_deposits', 'loan', 'has_mortgage', 'balance_change_y', 'max_balance', 'education']] = scaler.fit_transform(X[['age', 'has_deposits', 'loan', 'has_mortgage', 'balance_change_y', 'max_balance', 'education']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns='client_id', inplace=True)


In [10]:
X

Unnamed: 0_level_0,age,has_deposits,loan,has_mortgage,balance_change_y,max_balance,education
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1222646323,0.432809,-0.845171,-0.403507,-0.96289,-1.050456,-0.162368,2.303339
451375919,-0.694158,-0.845171,-0.403507,1.03854,-0.730492,-0.084279,-0.346689
338972671,0.259429,-0.845171,-0.403507,1.03854,1.165904,-0.353010,-0.346689
1472834688,-0.434089,1.183193,-0.403507,1.03854,-0.722819,0.137915,-1.671704
1068680340,1.906535,1.183193,-0.403507,1.03854,-0.365764,-0.468701,-0.346689
...,...,...,...,...,...,...,...
440873221,1.299707,-0.845171,-0.403507,-0.96289,-0.333660,-0.345805,0.978325
543198889,-0.087330,-0.845171,-0.403507,1.03854,0.422634,-0.453722,2.303339
458558558,-0.000640,-0.845171,-0.403507,1.03854,0.373122,-0.456466,-1.671704
1426334329,1.559776,-0.845171,-0.403507,1.03854,-0.024506,0.530412,-1.671704


In [11]:
result1 = []

def prediction(X):
    predictions = model.predict(X)  # Call predict method of the pipeline
    
    for client_id, pred in zip(X.index, predictions):
        result1.append((client_id, pred))
    return result1

prediction(X)



[(1222646323, 0),
 (451375919, 0),
 (338972671, 0),
 (1472834688, 0),
 (1068680340, 1),
 (711558356, 0),
 (422084751, 1),
 (123316323, 1),
 (1321691715, 0),
 (1861869059, 1),
 (1595517041, 0),
 (1408023785, 0),
 (1327432047, 0),
 (1147481134, 0),
 (122279077, 1),
 (1177854254, 1),
 (985873256, 1),
 (853552378, 0),
 (1687099232, 1),
 (891087466, 0),
 (1035374127, 1),
 (1037143944, 0),
 (308162265, 1),
 (1893016025, 0),
 (1854082979, 1),
 (1600875950, 0),
 (1991899634, 1),
 (638886315, 0),
 (1208147537, 0),
 (1322726978, 1),
 (1256445690, 0),
 (1652258354, 1),
 (1112221396, 0),
 (732497257, 0),
 (140285460, 0),
 (735985444, 0),
 (1344298467, 0),
 (1122977086, 0),
 (1972322805, 1),
 (512479010, 0),
 (1110057717, 0),
 (1433542674, 0),
 (1041617418, 0),
 (134507716, 0),
 (141363965, 0),
 (644584000, 0),
 (730164497, 0),
 (1000803517, 0),
 (785941673, 0),
 (1992212658, 0),
 (1425881145, 0),
 (626144979, 0),
 (1196534790, 0),
 (701373306, 0),
 (810743044, 0),
 (1031945947, 1),
 (1017336008, 1

In [12]:
res = pd.DataFrame(result1)
res_suc = res[res[1] == 1]
map = {0: 'client_id', 1: 'outcome'}
outcome = res_suc.rename(columns=map)
outcome

Unnamed: 0,client_id,outcome
4,1068680340,1
6,422084751,1
7,123316323,1
9,1861869059,1
14,122279077,1
...,...,...
8838,1903776674,1
8839,887694228,1
8840,1182886062,1
8845,1208330461,1


In [50]:
joined = pd.merge(df, outcome, on=['client_id','client_id'], how="inner", indicator=True)
joined.drop(columns=['outcome', '_merge'], inplace=True)
joined

Unnamed: 0,client_id,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency,age_group,balance_change_y
0,1068680340,63.0,admin.,married,secondary,F,yes,no,n,yes,0.972130,77.806737,-104.261740,182.068477,CZK,60,128401.966138
1,422084751,30.0,technician,single,secondary,M,yes,no,n,no,2561.223346,2634.111167,2486.790068,147.321099,CZK,30,110648.005976
2,123316323,36.0,blue-collar,divorced,secondary,F,yes,no,n,no,2841.068900,2937.134176,2720.984988,216.149188,CZK,30,176224.042205
3,1861869059,66.0,retired,married,secondary,M,no,no,n,no,3450.227742,3551.465162,3334.598877,216.866285,CZK,60,164396.401467
4,122279077,44.0,self-employed,married,unknown,F,yes,no,n,yes,175.310695,271.779298,60.734525,211.044773,CZK,40,164022.701399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2791,1903776674,39.0,management,divorced,tertiary,M,yes,no,n,no,224.987226,357.034274,61.498673,295.535600,CZK,30,224428.088760
2792,887694228,27.0,blue-collar,single,secondary,F,yes,no,n,no,3781.777344,3868.336354,3648.086964,220.249389,CZK,20,174736.693110
2793,1182886062,46.0,admin.,divorced,secondary,F,yes,no,n,no,2213.339515,2352.019870,2104.151111,247.868759,CZK,40,153368.528820
2794,1208330461,46.0,management,married,secondary,F,yes,no,n,no,152.848535,268.701324,17.437500,251.263824,CZK,40,190203.643966


In [51]:
df1 = pd.merge(df, outcome, on=['client_id','client_id'], how="outer", indicator=True)
df1 = df1[df1['_merge'] == 'left_only']

In [52]:

depo = df1[(df1['has_deposits'] == 'yes') & (df1['has_mortgage'] == 'no')]
depo.drop(columns=['outcome', '_merge'], inplace=True)
depo

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  depo.drop(columns=['outcome', '_merge'], inplace=True)


Unnamed: 0,client_id,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency,age_group,balance_change_y
123,130223900,41.0,blue-collar,married,secondary,M,yes,yes,n,no,-194.664609,-110.328878,-329.087759,218.758882,CZK,40,55545.917217
293,167916114,34.0,admin.,single,secondary,M,yes,yes,n,no,-45.318717,58.931641,-156.220994,215.152635,CZK,30,86333.392865
326,175441990,53.0,housemaid,married,primary,M,yes,yes,n,no,118.214571,206.341138,44.951541,161.389597,CZK,50,69595.653015
376,187674401,35.0,management,single,tertiary,M,yes,yes,n,no,-311.364634,-220.912357,-382.852988,161.940631,CZK,30,69165.089537
403,193510933,51.0,services,divorced,secondary,F,yes,yes,n,no,243.498003,341.319131,80.435848,260.883283,CZK,50,247618.704421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8549,1936300610,36.0,management,single,tertiary,M,yes,yes,n,no,2042.932116,2152.242240,1902.437758,249.804482,CZK,30,157623.673605
8614,1951320551,57.0,housemaid,married,primary,F,yes,yes,n,no,617.482556,737.561611,485.243099,252.318513,CZK,50,170268.233278
8660,1960076428,44.0,unemployed,single,primary,M,yes,yes,n,no,-3.001486,119.022483,-88.188904,207.211387,CZK,40,80301.483562
8787,1983122569,54.0,self-employed,single,tertiary,M,yes,yes,n,no,378.601565,488.462126,253.540394,234.921732,CZK,50,156085.485668


In [76]:
client_base = pd.concat([joined, depo])
client_base

Unnamed: 0,client_id,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency,age_group,balance_change_y
0,1068680340,63.0,admin.,married,secondary,F,yes,no,n,yes,0.972130,77.806737,-104.261740,182.068477,CZK,60,128401.966138
1,422084751,30.0,technician,single,secondary,M,yes,no,n,no,2561.223346,2634.111167,2486.790068,147.321099,CZK,30,110648.005976
2,123316323,36.0,blue-collar,divorced,secondary,F,yes,no,n,no,2841.068900,2937.134176,2720.984988,216.149188,CZK,30,176224.042205
3,1861869059,66.0,retired,married,secondary,M,no,no,n,no,3450.227742,3551.465162,3334.598877,216.866285,CZK,60,164396.401467
4,122279077,44.0,self-employed,married,unknown,F,yes,no,n,yes,175.310695,271.779298,60.734525,211.044773,CZK,40,164022.701399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8549,1936300610,36.0,management,single,tertiary,M,yes,yes,n,no,2042.932116,2152.242240,1902.437758,249.804482,CZK,30,157623.673605
8614,1951320551,57.0,housemaid,married,primary,F,yes,yes,n,no,617.482556,737.561611,485.243099,252.318513,CZK,50,170268.233278
8660,1960076428,44.0,unemployed,single,primary,M,yes,yes,n,no,-3.001486,119.022483,-88.188904,207.211387,CZK,40,80301.483562
8787,1983122569,54.0,self-employed,single,tertiary,M,yes,yes,n,no,378.601565,488.462126,253.540394,234.921732,CZK,50,156085.485668


In [58]:
rndppl = pd.merge(df, client_base, on=['client_id'], how="outer", indicator=True)
rndppl = rndppl[rndppl['_merge'] == 'left_only']
rndppl.dropna(axis=1, inplace=True)
rndppl

Unnamed: 0,client_id,age_x,marital_x,education_x,gender_x,has_deposits_x,loan_x,has_insurance_x,has_mortgage_x,mean_balance_x,max_balance_x,min_balance_x,dif_balance_x,currency_x,age_group_x,balance_change_y_x,_merge
0,100070076,41.0,divorced,tertiary,M,no,no,n,no,203.907928,361.807400,103.460451,258.346949,CZK,40,226645.598677,left_only
1,100141910,34.0,single,tertiary,F,no,no,n,yes,2544.965438,2666.636101,2449.282039,217.354063,CZK,30,141673.231917,left_only
3,100646594,75.0,divorced,primary,F,no,no,n,no,37131.526158,37281.123426,37015.861279,265.262146,CZK,70,193556.380846,left_only
4,100741497,56.0,married,secondary,M,no,no,n,yes,228.186356,379.549653,91.039351,288.510302,CZK,50,267354.905468,left_only
5,100757123,33.0,married,secondary,F,no,no,n,yes,5783.153417,5937.708127,5685.106154,252.601973,CZK,30,179391.433785,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8855,1998762652,37.0,married,secondary,M,no,no,n,yes,477.129738,562.430159,355.603264,206.826894,CZK,30,117378.878802,left_only
8858,1999460711,53.0,divorced,secondary,M,no,yes,n,yes,215.601872,348.240956,128.394602,219.846354,CZK,50,172236.703890,left_only
8859,1999463182,34.0,married,secondary,M,no,no,n,yes,1268.089349,1376.045106,1146.101785,229.943321,CZK,30,170357.398835,left_only
8860,1999499145,31.0,single,tertiary,F,no,no,n,yes,-4.751058,104.483184,-119.430736,223.913920,CZK,30,88949.557622,left_only


In [80]:
rndppl = rndppl.sort_values(by='max_balance_x', ascending=False)
rndppl[0:4]
big_balance = rndppl[0:4]
big_balance

Unnamed: 0,client_id,age_x,marital_x,education_x,gender_x,has_deposits_x,loan_x,has_insurance_x,has_mortgage_x,mean_balance_x,max_balance_x,min_balance_x,dif_balance_x,currency_x,age_group_x,balance_change_y_x,_merge
1381,415300718,52.0,married,primary,F,no,no,n,no,66659.912413,66747.365096,66583.084323,164.280773,CZK,50,123918.451218,left_only
5306,1239608796,43.0,single,secondary,F,no,no,n,no,56825.121802,56918.339731,56683.868334,234.471397,CZK,40,159411.887637,left_only
6467,1494666538,56.0,married,tertiary,M,no,yes,n,no,51431.314882,51513.77614,51325.437741,188.338399,CZK,50,105228.273796,left_only
3,100646594,75.0,divorced,primary,F,no,no,n,no,37131.526158,37281.123426,37015.861279,265.262146,CZK,70,193556.380846,left_only


In [81]:
rndpplbb = rndppl.sort_values(by='balance_change_y_x', ascending=False)
rndpplbb = rndpplbb[rndpplbb['has_deposits_x'] == 'yes']
dep_change = rndpplbb[:115]
dep_change


Unnamed: 0,client_id,age_x,marital_x,education_x,gender_x,has_deposits_x,loan_x,has_insurance_x,has_mortgage_x,mean_balance_x,max_balance_x,min_balance_x,dif_balance_x,currency_x,age_group_x,balance_change_y_x,_merge
8486,1922188215,34.0,single,tertiary,M,yes,no,n,yes,2823.482078,2979.748234,2668.680926,311.067308,CZK,30,405667.225875,left_only
5818,1349718296,35.0,single,tertiary,M,yes,no,n,yes,161.532166,379.426913,49.910389,329.516524,CZK,30,393702.653996,left_only
6711,1550899005,31.0,single,tertiary,F,yes,no,n,yes,151.692462,297.753574,13.454011,284.299564,CZK,30,386583.202053,left_only
1755,497978154,35.0,married,secondary,F,yes,no,n,yes,5624.705745,5791.475967,5467.672199,323.803768,CZK,30,377528.952120,left_only
8274,1878319679,47.0,married,secondary,M,yes,yes,n,yes,4417.958157,4575.207522,4291.114566,284.092955,CZK,40,370264.470510,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3976,954059680,45.0,divorced,primary,F,yes,no,n,yes,-291.029318,-156.427427,-389.364524,232.937097,CZK,40,221179.925556,left_only
1872,519397078,40.0,married,primary,F,yes,no,n,yes,283.067227,456.871527,170.082694,286.788833,CZK,40,220888.666278,left_only
8849,1997889070,32.0,married,secondary,F,yes,no,n,yes,-10.920405,103.854653,-131.261945,235.116598,CZK,30,219988.587943,left_only
8127,1847607853,48.0,divorced,primary,M,yes,no,n,yes,20.396204,142.005825,-108.222443,250.228267,CZK,40,219931.893741,left_only


In [78]:
client_base = pd.concat([client_base, big_balance])
client_base = pd.concat([client_base, dep_change])


In [84]:
client_ids = client_base.client_id
client_ids.to_csv('client_ids.csv', index=False)