In [27]:
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import sqlite3
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [28]:
conn = sqlite3.connect(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\data.db')
cursor = conn.cursor()

In [31]:
query = ('''
WITH mean_deposite as (
    SELECT 
        client_id,
        AVG(balance) as mean_balance,
        MAX(balance) as max_balance,
        MIN(balance) as min_balance,
        MAX(balance) - MIN(balance) as dif_balance,
        currency
    FROM balances
    GROUP BY client_id
)
SELECT client.* , client_products.*, mean_deposite.*
FROM client
LEFT JOIN inv_campaign_eval ON client.client_id = inv_campaign_eval.client_id
LEFT JOIN client_products ON client.client_id = client_products.client_id
LEFT JOIN mean_deposite ON client.client_id = mean_deposite.client_id
WHERE inv_campaign_eval.client_id is null''')
df = pd.read_sql_query(query, conn)

df.head()

Unnamed: 0,client_id,age,job,marital,education,gender,client_id.1,has_deposits,loan,has_insurance,has_mortgage,client_id.2,mean_balance,max_balance,min_balance,dif_balance,currency
0,1222646323,46.0,services,divorced,unknown,M,1222646323,no,no,n,no,1222646323,960.562073,1054.517907,886.126906,168.391001,CZK
1,451375919,33.0,admin.,single,secondary,F,451375919,no,no,n,yes,451375919,1221.016419,1303.494818,1100.917203,202.577615,CZK
2,338972671,44.0,self-employed,married,secondary,F,338972671,no,no,y,yes,338972671,297.993265,446.676191,202.053088,244.623103,CZK
3,1472834688,36.0,blue-collar,married,primary,M,1472834688,yes,no,n,yes,1472834688,1919.318145,2011.939205,1853.387429,158.551776,CZK
4,1068680340,63.0,admin.,married,secondary,F,1068680340,yes,no,n,yes,1068680340,0.97213,77.806737,-104.26174,182.068477,CZK


In [32]:
df = df.T.drop_duplicates().T

In [33]:
df.isna().sum()

client_id          0
age              182
job              355
marital            0
education          0
gender             0
has_deposits       0
loan               0
has_insurance      0
has_mortgage       0
mean_balance       0
max_balance        0
min_balance        0
dif_balance        0
currency           0
dtype: int64

In [24]:
import joblib
class InteractionScoreTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Assuming the first two columns are the ones we're interested in
        self.max_interaction = X[:, :2].max()
        return self

    def transform(self, X):
        # Create an interaction score: a simple product of the first two columns
        interaction_score = (X[:, 0] * X[:, 1]) / self.max_interaction
        return np.hstack((X, interaction_score[:, None]))

pipeline = joblib.load(r'C:\Users\spiri\codingbootcamp\Final-project-C4Y\pipeline.pkl')

In [38]:
result = []
def prediction(X):
    predictions = pipeline.predict(X)  # Call predict method of the pipeline
    
    for i, pred in enumerate(predictions):
        result.append((X.iloc[i]['client_id'], pred))
    return result

In [39]:
prediction(df)

[(1222646323, 0),
 (451375919, 0),
 (338972671, 0),
 (1472834688, 0),
 (1068680340, 1),
 (711558356, 0),
 (422084751, 1),
 (123316323, 1),
 (1321691715, 0),
 (1861869059, 1),
 (1595517041, 0),
 (1408023785, 0),
 (1327432047, 0),
 (1147481134, 0),
 (122279077, 1),
 (1177854254, 1),
 (985873256, 1),
 (853552378, 0),
 (1687099232, 1),
 (891087466, 0),
 (1035374127, 1),
 (1037143944, 0),
 (308162265, 1),
 (1893016025, 0),
 (1854082979, 1),
 (1600875950, 0),
 (1991899634, 1),
 (638886315, 0),
 (1208147537, 0),
 (1322726978, 1),
 (1256445690, 0),
 (1652258354, 0),
 (1112221396, 1),
 (732497257, 0),
 (140285460, 0),
 (735985444, 0),
 (1344298467, 0),
 (1122977086, 0),
 (1972322805, 1),
 (512479010, 0),
 (1110057717, 0),
 (1433542674, 0),
 (1041617418, 0),
 (134507716, 0),
 (141363965, 0),
 (644584000, 0),
 (730164497, 0),
 (1000803517, 0),
 (785941673, 0),
 (1992212658, 0),
 (1425881145, 0),
 (626144979, 0),
 (1196534790, 0),
 (701373306, 0),
 (810743044, 0),
 (1031945947, 1),
 (1017336008, 0

In [42]:
res = pd.DataFrame(result)

In [43]:
res

Unnamed: 0,0,1
0,1222646323,0
1,451375919,0
2,338972671,0
3,1472834688,0
4,1068680340,1
...,...,...
8858,440873221,0
8859,543198889,0
8860,458558558,0
8861,1426334329,0


In [50]:
res_suc = res[res[1] == 1]
map = {0: 'client_id', 1: 'outcome'}
outcome = res_suc.rename(columns=map)
outcome

Unnamed: 0,client_id,outcome
4,1068680340,1
6,422084751,1
7,123316323,1
9,1861869059,1
14,122279077,1
...,...,...
8839,887694228,1
8840,1182886062,1
8845,1208330461,1
8846,1335817823,1


In [55]:
joined = outcome.join(df, how='inner', lsuffix='_outcome', rsuffix='_df')
joined

Unnamed: 0,client_id_outcome,outcome,client_id_df,age,job,marital,education,gender,has_deposits,loan,has_insurance,has_mortgage,mean_balance,max_balance,min_balance,dif_balance,currency
4,1068680340,1,1068680340,63.0,admin.,married,secondary,F,yes,no,n,yes,0.97213,77.806737,-104.26174,182.068477,CZK
6,422084751,1,422084751,30.0,technician,single,secondary,M,yes,no,n,no,2561.223346,2634.111167,2486.790068,147.321099,CZK
7,123316323,1,123316323,36.0,blue-collar,divorced,secondary,F,yes,no,n,no,2841.0689,2937.134176,2720.984988,216.149188,CZK
9,1861869059,1,1861869059,66.0,retired,married,secondary,M,no,no,n,no,3450.227742,3551.465162,3334.598877,216.866285,CZK
14,122279077,1,122279077,44.0,self-employed,married,unknown,F,yes,no,n,yes,175.310695,271.779298,60.734525,211.044773,CZK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8839,887694228,1,887694228,27.0,blue-collar,single,secondary,F,yes,no,n,no,3781.777344,3868.336354,3648.086964,220.249389,CZK
8840,1182886062,1,1182886062,46.0,admin.,divorced,secondary,F,yes,no,n,no,2213.339515,2352.01987,2104.151111,247.868759,CZK
8845,1208330461,1,1208330461,46.0,management,married,secondary,F,yes,no,n,no,152.848535,268.701324,17.4375,251.263824,CZK
8846,1335817823,1,1335817823,41.0,blue-collar,married,primary,M,yes,no,y,no,239.464578,314.273881,140.546806,173.727075,CZK


In [59]:
joined.mean_balance.median()

861.9390648255093