In [1]:
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
import sqlite3
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from src import paths, sql_querys, converting
from sklearn.metrics import mean_absolute_error
import joblib

In [2]:
# Load the age-model training data from the project's SQLite database.
# The query text lives in the project module `sql_querys`; the database
# location comes from `paths.db_path`.
query = sql_querys.query_age_model
conn = sqlite3.connect(paths.db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle as soon as the frame is in memory; nothing
    # else in this notebook reads from the connection, and no cursor is needed
    # because pandas drives the query itself.
    conn.close()
df.head()

Unnamed: 0,client_id,age,job,marital,education,gender,client_id.1,has_deposits,loan,has_insurance,has_mortgage,client_id.2,mean_balance,max_balance,min_balance,dif_balance,currency
0,249789938,38.0,services,married,secondary,M,249789938,yes,no,n,no,249789938,2669.532734,2775.615374,2515.500059,260.115315,CZK
1,1222646323,46.0,services,divorced,unknown,M,1222646323,no,no,n,no,1222646323,960.562073,1054.517907,886.126906,168.391001,CZK
2,451375919,33.0,admin.,single,secondary,F,451375919,no,no,n,yes,451375919,1221.016419,1303.494818,1100.917203,202.577615,CZK
3,338972671,44.0,self-employed,married,secondary,F,338972671,no,no,y,yes,338972671,297.993265,446.676191,202.053088,244.623103,CZK
4,1472834688,36.0,blue-collar,married,primary,M,1472834688,yes,no,n,yes,1472834688,1919.318145,2011.939205,1853.387429,158.551776,CZK


In [3]:
# Remove columns whose values duplicate an earlier column (the query result
# repeats client_id): transpose, drop duplicate rows, transpose back.
df = df.T.drop_duplicates().T

# The round trip through a transposed mixed-type frame leaves the columns
# as generic objects, so the balance columns are cast back to numeric here.
for balance_col in ('mean_balance', 'max_balance', 'min_balance'):
    df[balance_col] = pd.to_numeric(df[balance_col])

In [4]:
# Per-currency factors handed to the project's converter.
# NOTE(review): presumably CZK per one unit of each currency, applied to the
# balance columns using the `currency` column -- confirm in src/converting.
rate = dict(CZK=1, USD=23, EUR=25)
df = converting.converter(df, rate)

In [5]:
# Discard the identifier column and keep only rows where the target (age)
# is known; rows with a missing age cannot be used for supervised training.
df1 = df.drop(columns='client_id').dropna(subset=['age'])

# Target vector and feature matrix.
y = df1['age']
X = df1.drop(columns='age')

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


The pipeline below models age from the other client attributes; its purpose is to supply predicted values for rows where the dataset's age column is missing.

In [24]:

# Feature groups, each routed to its own preprocessing branch.
num_features = ['max_balance']
cat_features = ['job']
labeled_features = ['marital', 'education', 'gender', 'has_deposits', 'loan', 'has_insurance', 'has_mortgage']


# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical branch: missing values become the literal 'unknown' category;
# categories not seen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Ordinal branch: missing values become 'no', then each category is mapped to
# an integer code. Categories not seen during fit are mapped to -1 so that
# prediction on new data does not fail, matching the tolerance the one-hot
# branch already has.
labeled_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='no')),
    ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])


# Apply each branch to its columns; columns not listed above are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
        ('label', labeled_transformer, labeled_features)
            ])

# Full pipeline: preprocessing followed by the gradient-boosting regressor.
# The step keeps its historical name 'classifier' so that existing lookups
# such as pipeline.named_steps['classifier'] continue to work.
# random_state is fixed because max_features='sqrt' samples candidate features
# at every split; without a seed each fit yields a slightly different model
# and a slightly different error.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=8,
                max_features='sqrt', min_samples_leaf=10, min_samples_split=10, n_estimators=129,
                random_state=42))])

In [25]:
# Fit on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Mean absolute difference between the true and the predicted ages.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Report the hold-out error.
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 6.566775882894331


In [23]:
# Serialise the pipeline object, in whatever state it has when this cell runs,
# to 'age_pipeline.pkl' in the current working directory. Run this only after
# the fitting cell so that the saved pipeline is a trained one.
joblib.dump(value=pipeline, filename='age_pipeline.pkl')

Mean Absolute Error (MAE): 6.567035702399732
