In [1]:
import sqlite3
import pandas as pd
from src import paths,sql_querys,converting
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split


In [2]:
# Open the project SQLite database and pull the model-training dataset.
conn = sqlite3.connect(paths.db_path)
cursor = conn.cursor()  # not used in this cell; kept because it is a notebook-level name
query = sql_querys.query_training_model
df = pd.read_sql_query(sql=query, con=conn)

# The query result can contain repeated column names; keep only the first
# occurrence of each.
is_first_occurrence = ~df.columns.duplicated()
df = df.loc[:, is_first_occurrence]

# Convert the numeric features to a numeric dtype (to_numeric raises on
# values it cannot parse).
df['max_balance'] = pd.to_numeric(df['max_balance'])
df['age'] = pd.to_numeric(df['age'])

In [3]:
# Per-currency multipliers handed to the project converter.
# NOTE(review): presumably "CZK per one unit of the currency" - confirm
# against converting.converter, whose implementation is not visible here.
rate = {
    'CZK': 1,
    'USD': 23,
    'EUR': 25,
}
df = converting.converter(df, rate)


In [4]:

# Fill missing ages with predictions from a previously fitted pipeline.
# joblib.load unpickles the file, so only point this at a trusted artifact.
age_pipeline = joblib.load(paths.age_pipeline_path)

# Boolean mask (aligned with df's index) of the rows whose age is missing.
missing_age_indices = df['age'].isnull()

if missing_age_indices.any():
    # Predict only for the rows that need it and write the results back
    # into the same rows.
    predicted_ages = age_pipeline.predict(df[missing_age_indices])
    df.loc[missing_age_indices, 'age'] = predicted_ages
else:
    # Nothing to impute. Skip predict(): scikit-learn input validation
    # normally rejects a zero-row frame with a ValueError.
    predicted_ages = []

In [5]:
# Build a per-client balance-volatility feature: the sum of squared
# row-to-row balance differences, then attach it to the training frame.
query1 = sql_querys.query_square_sum
dif_bal = pd.read_sql_query(sql=query1, con=conn)
dif_bal = dif_bal.loc[:, ~dif_bal.columns.duplicated()]  # drop repeated column names
dif_bal = converting.converter(dif_bal, rate)

# Difference between consecutive rows of the same client; the first row of
# each client has no predecessor and gets 0.
# NOTE(review): this relies on the rows arriving in the intended order
# within each client (presumably by date) - confirm the query's ORDER BY.
balance_by_client = dif_bal.groupby('client_id')['balance']
dif_bal['balance_diff'] = balance_by_client.diff().fillna(0)

# Broadcast each client's sum of squared differences to all of its rows.
dif_bal['balance_diff_square_sum'] = (
    dif_bal
    .groupby('client_id')['balance_diff']
    .transform(lambda diffs: diffs.pow(2).sum())
)

# Remove the row-level columns, then keep one row per client before the
# left join so the merge does not multiply rows in df.
dif_bal = dif_bal.drop(columns=['date', 'balance', 'balance_diff', 'poutcome', 'currency'])
per_client = dif_bal.drop_duplicates(subset=['client_id'])
df = df.merge(per_client, how='left', on='client_id')


In [6]:
# Target: the 'poutcome' column, label-encoded to integer class ids.
enc = LabelEncoder()
y = enc.fit_transform(df['poutcome'])

# Features: every remaining column of the frame.
X = df.drop(columns='poutcome')

# Hold out 20% of the rows for evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Columns consumed by the model; any other column of X is dropped by the
# ColumnTransformer (its default remainder policy).
num_features = ['age', 'balance_diff_square_sum', 'max_balance']
labeled_features = ['has_deposits', 'loan', 'has_mortgage', 'education']

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'no', then map each
# category to an integer code. Categories that were not present when the
# encoder was fitted are encoded as -1 instead of raising, so predict()
# does not fail on a value that only occurs outside the training split.
labeled_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='no')),
    ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('label', labeled_transformer, labeled_features),
    ])

# Full pipeline: preprocessing followed by a random-forest classifier with
# fixed hyperparameters and a fixed seed for reproducibility.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        random_state=42,
        n_estimators=150,
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='sqrt',
        max_depth=10,
        criterion='entropy',
        bootstrap=False,
    )),
])

In [8]:
# Fit on the training split, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions (recall_score uses its default binary
# averaging here).
accuracy, recall = (
    metric(y_test, y_pred) for metric in (accuracy_score, recall_score)
)

# Report both metrics as percentages with two decimals.
accuracy_text = "Accuracy: {:.2f}%".format(accuracy * 100)
recall_text = "Recall: {:.2f}%".format(recall * 100)
print(accuracy_text, recall_text)

Accuracy: 72.17% Recall: 76.26%
