In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('train_loan_prediction.csv')


df.head().T

In [None]:
column_names = [cols for cols in df]
print(column_names)

In [None]:
num_underscore_present_columns = [cols for cols in column_names if '_' not in cols]
num_underscore_present_columns

In [None]:
cols_mappings = {}
for cols in num_underscore_present_columns:
    uppercase_in_cols = [val.isupper() for val in cols]
    num_uppercase_letters = sum(uppercase_in_cols)

    cols_mappings[cols] = {
        "is_uppercase_letter": uppercase_in_cols,
        "num_uppercase_letters": num_uppercase_letters,
        "needs_underscore": (num_uppercase_letters > 1)
    }

In [None]:
for key in cols_mappings.keys():
    if cols_mappings[key]['needs_underscore']:
        print()
        print(f'{key} need the underscore at location ', cols_mappings[key]['is_uppercase_letter'].index(True, 1)) 

In [None]:
'ApplicantIncome'[:9] + '_' + 'ApplicantIncome'[9:]

In [None]:
cols_mappings = {}
for cols in num_underscore_present_columns:
    uppercase_in_cols = [val.isupper() for val in cols]
    num_uppercase_letters = sum(uppercase_in_cols)
    
    if num_uppercase_letters > 1:
        underscore_index = uppercase_in_cols.index(True, 1)
        updated_column_name = cols[:underscore_index] + "_" + cols[underscore_index:]
    else:
        updated_column_name = cols

    cols_mappings[cols] = {
        "is_uppercase_letter": uppercase_in_cols,
        "num_uppercase_letters": num_uppercase_letters,
        "needs_underscore": (num_uppercase_letters > 1),
        "updated_column_name": updated_column_name
    }
    if cols_mappings[cols]['needs_underscore']:
        print(f"{cols} will be renamed to {cols_mappings[cols]['updated_column_name']}")
        
        
column_mappings = {key: cols_mappings[key]["updated_column_name"] for key in cols_mappings.keys()}
column_mappings

In [None]:
df = df.rename(columns=column_mappings)
column_names = [cols for cols in df]
print(column_names)

In [None]:
print([cols.lower() for cols in df])

In [None]:
df.columns = [cols.lower() for cols in df]
print(df.columns)

In [None]:
id_col = 'loan_id'
target = 'loan_status'

cat_cols = [cols for cols in df if df[cols].dtype == 'object' and cols not in [id_col, target]]
cat_cols

In [None]:
for cols in cat_cols:
    print(cols)
    print(df[cols].unique())
    print()

In [None]:
df_consistent = df.copy()
for col in cat_cols:

    df_consistent[col] = df_consistent[col].apply(lambda val: val.lower() if isinstance(val, str) else val)

    df_consistent[col] = df_consistent[col].apply(lambda val: val.replace(' ','_') if isinstance(val, str) else val)

for cols in cat_cols:
    print(cols)
    print(df_consistent[cols].unique())
    print()

In [None]:
df_consistent.dependents = df_consistent.dependents.apply(lambda val: float(val.replace('+','')) if isinstance(val, str) else float(val))

In [None]:
for cols in ['married', 'self_employed']:
    df_consistent[cols] = df_consistent[cols].map({"yes": 1, "no": 0})
    
df_consistent.education = df_consistent.education.map({
    'graduate': 1,
    'not_graduate': 0
})


df_consistent.gender = df_consistent.gender.map({
    'male': 1,
    'female': 0
})

for cols in cat_cols:
    print(cols)
    print(df_consistent[cols].unique())
    print()

In [None]:
def make_data_consistent(df, cat_cols) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with normalised labels and numeric encodings.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified. Must contain the columns
        ``dependents``, ``married``, ``self_employed``, ``education`` and
        ``gender`` in addition to everything listed in ``cat_cols``.
    cat_cols : iterable of str
        Columns whose string values are lower-cased and have spaces replaced
        by underscores. Non-string values (e.g. NaN) are left untouched.

    Returns
    -------
    pd.DataFrame
        The cleaned copy. ``dependents`` is converted to float after removing
        any ``'+'`` characters; the four binary columns are mapped to 1/0, and
        any label not present in the mapping becomes NaN (``Series.map``).
    """

    def _normalise_label(value):
        # Only strings are normalised; missing values pass through unchanged.
        if isinstance(value, str):
            return value.lower().replace(' ', '_')
        return value

    def _dependents_to_float(value):
        # Strip the '+' (as in "3+") so that the value can be parsed as a number.
        if isinstance(value, str):
            value = value.replace('+', '')
        return float(value)

    # Applied in this order: married, self_employed, education, gender.
    binary_encodings = {
        'married': {"yes": 1, "no": 0},
        'self_employed': {"yes": 1, "no": 0},
        'education': {'graduate': 1, 'not_graduate': 0},
        'gender': {'male': 1, 'female': 0},
    }

    cleaned = df.copy()

    for name in cat_cols:
        cleaned[name] = cleaned[name].apply(_normalise_label)

    cleaned['dependents'] = cleaned['dependents'].apply(_dependents_to_float)

    for name, encoding in binary_encodings.items():
        cleaned[name] = cleaned[name].map(encoding)

    return cleaned

In [None]:
df_consistent = df.copy()
df_consistent = make_data_consistent(df=df_consistent, cat_cols=cat_cols)

for cols in cat_cols:
    print(cols)
    print(df_consistent[cols].unique())
    print()

In [None]:
df.loan_id.nunique(), df.shape[0]

In [None]:
df[['applicant_income', 'coapplicant_income', 'loan_amount']].value_counts().reset_index(name='count')

In [None]:
df[(df.applicant_income == 4333) & (df.coapplicant_income == 2451) & (df.loan_amount == 110)]

In [None]:
df.gender.value_counts(normalize=True)

In [None]:
remaining_rows = df_consistent.dropna(axis=0).shape[0]
total_records = df_consistent.shape[0]
perc_dropped = ((total_records - remaining_rows)/total_records)*100

print("By dropping all missing data, only {:,} records will be left out of {:,}, a reduction by {:,.3f}%".format(remaining_rows, total_records, perc_dropped))

In [None]:
id_col = 'loan_id'
target = 'loan_status'

feature_cols = [cols for cols in df_consistent if cols not in [id_col, target]]
binary_cols = [cols for cols in feature_cols if df_consistent[cols].nunique() == 2]
cat_cols = [cols for cols in feature_cols if (df_consistent[cols].dtype == 'object' or df_consistent[cols].nunique() <= 15)]
num_cols = [cols for cols in feature_cols if cols not in cat_cols]


In [None]:
cat_cols

In [None]:
binary_cols

In [None]:
num_cols

In [None]:
df_consistent.info()

In [None]:
df_consistent.isnull().sum()

In [None]:
def missing_data_percentage(df: pd.DataFrame):
    """Summarise, per column, the percentage of rows whose value is missing.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; it is only read.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with two columns: ``columm_name`` (the
        source column label; the spelling is part of the existing output
        schema) and ``perc_missing`` (share of null rows in percent, rounded
        to three decimals).
    """
    n_rows = len(df)
    null_share = df.isnull().sum().div(n_rows).mul(100).round(3)

    return pd.DataFrame(
        data={
            'columm_name': null_share.index,
            'perc_missing': null_share.values,
        }
    )


missing_data_percentage(df_consistent[feature_cols]).sort_values(by='perc_missing', ascending=False)

In [None]:
msno.matrix(df_consistent[feature_cols], figsize=(35, 15))

In [None]:
msno.heatmap(df_consistent[feature_cols], labels=True)

In [None]:
missing_cols = [cols for cols in feature_cols if df_consistent[cols].isnull().sum() > 0]

In [None]:
msno.dendrogram(df_consistent[missing_cols])

In [None]:
cat_missing = [cols for cols in cat_cols if df_consistent[cols].isnull().sum() > 0]

def cat_missing_association_with_outcome(data, missing_data_column, outcome):
    """Bar-plot the missing-value rate of a categorical column per outcome class.

    A 0/1 indicator named ``<missing_data_column>_is_missing`` is added to a
    copy of ``data`` (the input is not modified), averaged within each value
    of ``outcome`` and drawn with ``DataFrame.plot.bar``. Nothing is returned.
    """
    indicator_name = f"{missing_data_column}_is_missing"
    is_missing = data[missing_data_column].isnull().astype(int)

    # ``assign`` returns a new frame, so the caller's data stays untouched.
    flagged = data.assign(**{indicator_name: is_missing})

    missing_rate_by_outcome = flagged.groupby([outcome]).agg({indicator_name: 'mean'})
    missing_rate_by_outcome.plot.bar()
    
for cols in cat_missing:
    cat_missing_association_with_outcome(df_consistent, cols, target)

In [None]:
num_missing = [cols for cols in num_cols if df_consistent[cols].isnull().sum() > 0]

def num_missing_association_with_outcome(data, missing_data_column, outcome):
    """Bar-plot the missing-value rate of a numerical column per outcome class.

    Works on a copy of ``data``: a 0/1 indicator column named
    ``<missing_data_column>_is_missing`` is created, its mean is computed for
    each value of ``outcome`` and the result is shown as a bar chart.
    Nothing is returned.
    """
    flag_column = f"{missing_data_column}_is_missing"

    working = data.copy()
    null_mask = working[missing_data_column].isnull()
    working[flag_column] = null_mask.astype(int)

    grouped = working.groupby([outcome])
    grouped.agg({flag_column: 'mean'}).plot.bar()


# Plot from df_consistent (the frame num_missing was derived from) rather than
# the raw `df`, which is rebound to unrelated data further down the notebook.
for cols in num_missing:
    num_missing_association_with_outcome(df_consistent, cols, target)

In [None]:
# Density of the observed loan amounts versus the same column after median
# imputation. The median is taken from df_consistent so this cell does not
# depend on the raw `df`, which is rebound later in the notebook.
df_consistent.loan_amount.plot.kde(color='orange', label='loan_amount', legend=True)
df_consistent.loan_amount.fillna(value=df_consistent.loan_amount.median()).plot.kde(color='b', label='loan_amount_imputed', alpha=0.5, figsize=(9,7), legend=True)

In [None]:
round(df_consistent.loan_amount.std(),2), round(df_consistent.loan_amount.fillna(value=df_consistent.loan_amount.median()).std(),2)

In [None]:
df_consistent[num_cols].corr()

In [None]:
# Random-sample imputation of loan_amount: every missing value is replaced by
# one observed loan amount, drawn with the row's applicant_income as the seed
# so the draw is reproducible per row.
missing_mask = df_consistent['loan_amount'].isnull()
observation = df_consistent[missing_mask]

# Pool of observed values to draw from; computed once instead of per row.
observed_loan_amounts = df_consistent.loc[~missing_mask, 'loan_amount']

imputed_values = []
for idx in observation.index:
    # Scalar lookup (no list around the column label) so int() receives a
    # number, not a one-element Series.
    seed = int(observation.loc[idx, 'applicant_income'])
    # .iloc[0] extracts the sampled scalar; the column is then filled with
    # plain numbers rather than Series objects.
    imputed_values.append(observed_loan_amounts.sample(1, random_state=seed).iloc[0])

# Start from the original column, then overwrite only the missing positions.
# Both masks come from df_consistent, never from the raw `df`.
df_consistent['loan_amount_random_imputed'] = df_consistent['loan_amount']
df_consistent.loc[missing_mask, 'loan_amount_random_imputed'] = imputed_values

In [None]:
df_consistent.loan_amount.plot.kde(color='orange', label='loan_amount', legend=True, linewidth=2)
df_consistent.loan_amount_random_imputed.plot.kde(color='g', label='loan_amount_random_imputed', legend=True, linewidth=2)
df_consistent.loan_amount.fillna(value=df_consistent.loan_amount.median()).plot.kde(color='b', label='loan_amount_median_imputed', linewidth=1, alpha=0.5, figsize=(9,7), legend=True)

In [None]:
round(df_consistent.loan_amount.std(),2), round(df_consistent.loan_amount_random_imputed.std(),2), round(df_consistent.loan_amount.fillna(value=df_consistent.loan_amount.median()).std(),2)

In [None]:
df_consistent['loan_amount_median_imputed'] = df_consistent['loan_amount'].fillna(value=df_consistent['loan_amount'].median())
df_consistent[['loan_amount', 'loan_amount_median_imputed','loan_amount_random_imputed', 'applicant_income']].corr()

In [None]:
df_consistent.credit_history.value_counts(normalize=True)

In [None]:
df_consistent.credit_history.fillna(value=df_consistent.credit_history.mode()[0]).value_counts(normalize=True)

In [None]:
from sklearn.impute import KNNImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

In [None]:
num_cols = [cols for cols in df_consistent if df_consistent[cols].nunique() > 15 and cols not in [id_col, target] and not cols.endswith('imputed')]

In [None]:
df_num = df_consistent[num_cols].copy()
df_num.head()

In [None]:
def scale_data(df, scaler, columns):
    """Fit ``scaler`` on the selected columns and return the scaled copy.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; it is copied and left unchanged.
    scaler : object
        Any object exposing ``fit_transform``; it is fitted as a side effect.
    columns : list of str or None
        Columns to scale. A falsy value (``None`` or an empty list) means
        "all columns of ``df``".

    Returns
    -------
    tuple
        ``(scaled_frame, scaler)`` - the scaled copy and the fitted scaler.
    """
    scaled = df.copy()
    selected = columns if columns else list(scaled)
    scaled[selected] = scaler.fit_transform(scaled[selected])
    return scaled, scaler

In [None]:
scaler = StandardScaler()
df_scaled, scaler = scale_data(df_num, scaler=scaler, columns=num_cols)

In [None]:
knn_imputer = SklearnTransformerWrapper(
    transformer = KNNImputer(n_neighbors=10, weights='distance'),
    variables = num_cols
)

In [None]:
df_imputed = knn_imputer.fit_transform(df_scaled)

In [None]:
df_imputed = pd.DataFrame(columns=num_cols, data=scaler.inverse_transform(df_imputed))
df_imputed.head()

In [None]:
df_imputed['loan_amount'].plot.kde(color='orange', label='loan_amount_knn_imputed',linewidth=2, legend=True)
df_consistent['loan_amount'].plot.kde(color='b', label='loan_amount', legend=True, linewidth=2, figsize=(9,7), alpha=0.5)

In [None]:
round(df_consistent.loan_amount.std(),2), round(df_consistent.loan_amount_random_imputed.std(),2), round(df_consistent.loan_amount_median_imputed.std(),2), round(df_imputed.loan_amount.std(),2)

In [None]:
df_consistent['loan_amount_knn_imputed'] = df_imputed.loan_amount
df_consistent[['loan_amount', 'loan_amount_median_imputed','loan_amount_random_imputed', 'loan_amount_knn_imputed', 'applicant_income']].corr()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from feature_engine.encoding import OneHotEncoder

In [None]:
ohe_cols = [cols for cols in cat_cols if df_consistent[cols].dtype == 'object']
ohe_cols

In [None]:
df_ohe_encoded = df_consistent.copy()
ohe = OneHotEncoder(variables=ohe_cols)
df_ohe_encoded = ohe.fit_transform(df_ohe_encoded)

In [None]:
df_ohe_encoded[[cols for cols in df_ohe_encoded if 'property_area' in cols]].head()

In [None]:
cat_cols = [cols for cols in df_ohe_encoded if df_ohe_encoded[cols].nunique() <= 15 and cols not in [id_col, target]]

In [None]:
cat_cols

In [None]:
miss_forest_classifier = IterativeImputer(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                                        random_state=1,
                                                                        bootstrap=True, 
                                                                        n_jobs=-1),
                           max_iter=10,
                           random_state=1,
                           add_indicator=True,
                           initial_strategy='median')

df_cat_imputed = miss_forest_classifier.fit_transform(df_ohe_encoded[cat_cols])

In [None]:
df_cat_imputed = pd.DataFrame(columns=miss_forest_classifier.get_feature_names_out(), 
                               data=df_cat_imputed, 
                               index=df_ohe_encoded.index)
df_cat_imputed.head()

In [None]:
for cols in cat_cols:
    print(cols)
    print(df_cat_imputed[cols].unique())
    print()

In [None]:
num_cols = [cols for cols in df_consistent if cols not in df_cat_imputed and cols not in [id_col, target] + ohe_cols 
            and not cols.endswith("imputed")]

df_combined = pd.concat([df_consistent[num_cols], df_cat_imputed], axis=1)
feature_cols = [cols for cols in df_combined]

In [None]:
feature_cols

In [None]:
miss_forest_regressor = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=100, 
                                                                       random_state=1, 
                                                                       bootstrap=True, 
                                                                       n_jobs=-1),
                           max_iter=10,
                           random_state=1,
                           add_indicator=True,
                           initial_strategy='median')

df_imputed = miss_forest_regressor.fit_transform(df_combined[feature_cols])

In [None]:
df_imputed = pd.DataFrame(data=df_imputed, 
                           columns=miss_forest_regressor.get_feature_names_out(),
                           index=df_combined.index)

In [None]:
df_imputed.isnull().sum()

In [None]:
df_imputed['loan_amount'].plot.kde(color='orange', label='loan_amount_miss_forest_imputed',linewidth=2, legend=True)
df_consistent['loan_amount'].plot.kde(color='b', label='loan_amount', legend=True, linewidth=2, figsize=(9,7), alpha=0.5)

In [None]:
round(df_consistent.loan_amount.std(),2), round(df_consistent.loan_amount_random_imputed.std(),2), round(df_consistent.loan_amount_median_imputed.std(),2), round(df_imputed.loan_amount.std(),2)

In [None]:
df_consistent['loan_amount_miss_forest_imputed'] = df_imputed.loan_amount
df_consistent[['loan_amount', 'loan_amount_median_imputed','loan_amount_random_imputed', 'loan_amount_miss_forest_imputed', 'applicant_income']].corr()

In [None]:
df_consistent.drop([cols for cols in df_consistent if cols.endswith('imputed')], axis=1, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from typing import List
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
feature_cols = [cols for cols in df_consistent if cols not in [target, id_col]]
X_train, X_test, y_train, y_test = train_test_split(df_consistent[feature_cols],
                                                    df_consistent[target].map({'Y':1, 'N':0}), 
                                                    test_size=0.1, 
                                                    random_state=1, 
                                                    stratify=df_consistent[target].map({'Y':1, 'N':0}))

In [None]:
feature_cols

In [None]:
cat_cols = [cols for cols in X_train if X_train[cols].nunique() <= 15]
num_cols = [cols for cols in X_train if cols not in cat_cols]

In [None]:
def miss_forest_categorical_transformer():
    """Build the two-step pipeline used for the categorical features.

    Step ``one_hot_encoding`` one-hot encodes the columns listed in the
    module-level ``ohe_cols``; step ``miss_forest_classifier`` is an
    ``IterativeImputer`` driven by an ``ExtraTreesClassifier`` that also
    appends missing-indicator columns. A new, unfitted pipeline is returned
    on every call.
    """
    encoder = OneHotEncoder(variables=ohe_cols)

    forest = ExtraTreesClassifier(
        n_estimators=100,
        random_state=1,
        bootstrap=True,
        n_jobs=-1,
    )
    imputer = IterativeImputer(
        estimator=forest,
        max_iter=10,
        random_state=1,
        initial_strategy='median',
        add_indicator=True,
    )

    return Pipeline(
        steps=[
            ("one_hot_encoding", encoder),
            ("miss_forest_classifier", imputer),
        ]
    )

In [None]:
def miss_forest_numerical_transformer():
    """Build the single-step pipeline used for the numerical features.

    The only step, ``miss_forest``, is an ``IterativeImputer`` whose estimator
    is an ``ExtraTreesRegressor``; missing values are initialised with the
    median and missing-indicator columns are appended. A new, unfitted
    pipeline is returned on every call.
    """
    forest = ExtraTreesRegressor(
        n_estimators=100,
        random_state=1,
        bootstrap=True,
        n_jobs=-1,
    )
    imputer = IterativeImputer(
        estimator=forest,
        max_iter=10,
        random_state=1,
        initial_strategy='median',
        add_indicator=True,
    )

    return Pipeline(steps=[("miss_forest", imputer)])

In [None]:
cat_transformer = miss_forest_categorical_transformer()
num_transformer = miss_forest_numerical_transformer()


X_train_cat_imputed = cat_transformer.fit_transform(X_train[cat_cols])
X_test_cat_imputed = cat_transformer.transform(X_test[cat_cols])

X_train_cat_imputed_df = pd.DataFrame(data=X_train_cat_imputed, 
                                      columns=cat_transformer.get_feature_names_out(),
                                      index=X_train.index) 

X_test_cat_imputed_df = pd.DataFrame(data=X_test_cat_imputed, 
                                     columns=cat_transformer.get_feature_names_out(),
                                     index=X_test.index)

X_train_cat_imputed_df = pd.concat([X_train_cat_imputed_df, X_train[num_cols]], axis=1)
X_test_cat_imputed_df = pd.concat([X_test_cat_imputed_df, X_test[num_cols]], axis=1)


X_train_imputed = num_transformer.fit_transform(X_train_cat_imputed_df)
X_test_imputed = num_transformer.transform(X_test_cat_imputed_df)

X_train_transformed = pd.DataFrame(data=X_train_imputed, 
                                   columns=num_transformer.get_feature_names_out(),
                                   index=X_train.index)

X_test_transformed = pd.DataFrame(data=X_test_imputed, 
                                  columns=num_transformer.get_feature_names_out(),
                                  index=X_test.index)

In [None]:
X_train_transformed.head()

In [None]:
X_test_transformed.head()

In [None]:
y_train.mean(), y_test.mean()

In [None]:
d_param_grid = {
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,10,20],
    'min_samples_leaf' : [1,3,5,8,10,12,15],
    'min_samples_split': [2,6,10,16,20,24,30],
    'criterion' : ['gini', 'entropy'],
    'random_state' : [1], 
    'class_weight' : ['balanced']
}
d_clf = DecisionTreeClassifier()

In [None]:
def train_custom_classifier(X_train, y_train, X_test, y_test, clf, params):
    """Tune ``clf`` with a grid search and report train/test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels used for the 10-fold grid search
        and for the final fit.
    X_test, y_test
        Hold-out features and labels, used only for scoring.
    clf
        Unfitted classifier to tune. It must support ``predict_proba``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(model, testing_predictions, training_roc_auc, testing_roc_auc,
        training_acc, testing_acc)`` where ``model`` is the classifier with
        the best grid parameters, fitted on all of ``X_train``.
    """
    # Use the estimator and grid that were passed in, not the notebook
    # globals, so the function honours its own arguments.
    grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=10, scoring='roc_auc')
    grid_search.fit(X_train, y_train)

    print("Decision tree optimised")

    best_params = grid_search.best_params_

    print(f"Getting the best params which are {best_params}")

    # With the default refit=True, best_estimator_ is a clone of ``clf`` with
    # the best parameters, already refitted on the full training data. This is
    # what the manual re-fit did, without hard-coding the estimator class.
    model = grid_search.best_estimator_

    training_predictions_prob = model.predict_proba(X_train)
    testing_predictions_prob = model.predict_proba(X_test)

    training_predictions = model.predict(X_train)
    testing_predictions = model.predict(X_test)

    # Column 1 of predict_proba is the probability of the positive class.
    training_roc_auc = roc_auc_score(y_train, training_predictions_prob[:,1])
    testing_roc_auc = roc_auc_score(y_test, testing_predictions_prob[:,1])

    training_acc = accuracy_score(y_train, training_predictions)
    testing_acc = accuracy_score(y_test, testing_predictions)

    print(f"Training roc is {training_roc_auc}, and testing roc is {testing_roc_auc} \n \
            training accuracy is {training_acc}, testing_acc as {testing_acc}")
    
    return model, testing_predictions, training_roc_auc, testing_roc_auc, training_acc, testing_acc

In [None]:
model, test_predictions, train_roc, test_roc, train_acc, test_acc  = train_custom_classifier(
    X_train=X_train_transformed, 
    y_train=y_train, 
    X_test=X_test_transformed, 
    y_test=y_test, 
    clf=d_clf, 
    params=d_param_grid
    
)

In [None]:
cm = confusion_matrix(y_test, test_predictions, labels=model.classes_, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()


In [None]:
cat_transformer = Pipeline(
    steps=[
        ("one_hot_encoding", 
         OneHotEncoder(variables=ohe_cols)
        )
    ]
)

impute_transformer = Pipeline(
    steps=[
        ("simple_imputer", 
         SimpleImputer(strategy='median',
                       add_indicator=True)
        )
    ]
)

X_train_ohe = cat_transformer.fit_transform(X_train)
X_test_ohe = cat_transformer.transform(X_test)


X_train_imputed = impute_transformer.fit_transform(X_train_ohe)
X_test_imputed = impute_transformer.transform(X_test_ohe)


X_train_transformed = pd.DataFrame(data=X_train_imputed, 
                                   columns=impute_transformer.get_feature_names_out(),
                                   index=X_train.index)

X_test_transformed = pd.DataFrame(data=X_test_imputed, 
                                  columns=impute_transformer.get_feature_names_out(),
                                  index=X_test.index)

In [None]:
model, test_predictions, train_roc, test_roc, train_acc, test_acc = train_custom_classifier(
    X_train=X_train_transformed, 
    y_train=y_train, 
    X_test=X_test_transformed, 
    y_test=y_test, 
    clf=d_clf, 
    params=d_param_grid
)

In [None]:
cm = confusion_matrix(y_test, test_predictions, labels=model.classes_, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
data_cutoff_points = np.linspace(start=0.1, stop=1, num=10)
data_cutoff_points

In [None]:
# Learning-curve experiment: for each fraction in data_cutoff_points, fit the
# imputation pipelines and the classifier on that share of the training data
# and record train/test ROC-AUC and accuracy. The test set is always the same.
scores = []
for cutoff in data_cutoff_points:
    if cutoff < 1.0:
        # Stratified sub-sample of the training data; the remainder is unused.
        X_train_subset, X_train_rem, y_train_subset, y_train_rem = train_test_split(X_train, 
                                                                                y_train, 
                                                                                random_state=1, 
                                                                                train_size=cutoff, 
                                                                                stratify=y_train)
    else:
        # Last cutoff: use the full training set as-is.
        X_train_subset = X_train.copy()
        y_train_subset = y_train.copy()
    
    print(f"Model will be trained on {X_train_subset.shape[0]} rows out of {X_train.shape[0]}")
    

    # Fresh, unfitted pipelines for every cutoff so nothing leaks between runs.
    cat_transformer = miss_forest_categorical_transformer()
    num_transformer = miss_forest_numerical_transformer()
    
    # Stage 1: encode + impute the categorical columns (fit on the subset only).
    X_train_cat_imputed = cat_transformer.fit_transform(X_train_subset[cat_cols])
    X_test_cat_imputed = cat_transformer.transform(X_test[cat_cols])

    X_train_cat_imputed_df = pd.DataFrame(data=X_train_cat_imputed, 
                                          columns=cat_transformer.get_feature_names_out(),
                                          index=X_train_subset.index)

    X_test_cat_imputed_df = pd.DataFrame(data=X_test_cat_imputed, 
                                         columns=cat_transformer.get_feature_names_out(),
                                         index=X_test.index)

    # Re-attach the (still un-imputed) numerical columns next to the imputed categoricals.
    X_train_cat_imputed_df = pd.concat([X_train_cat_imputed_df, X_train_subset[num_cols]], axis=1)
    X_test_cat_imputed_df = pd.concat([X_test_cat_imputed_df, X_test[num_cols]], axis=1)

    # Stage 2: impute the numerical columns using all features as predictors.
    X_train_imputed = num_transformer.fit_transform(X_train_cat_imputed_df)
    X_test_imputed = num_transformer.transform(X_test_cat_imputed_df)

    X_train_transformed = pd.DataFrame(data=X_train_imputed, 
                                       columns=num_transformer.get_feature_names_out(),
                                       index=X_train_subset.index)

    X_test_transformed = pd.DataFrame(data=X_test_imputed, 
                                      columns=num_transformer.get_feature_names_out(),
                                      index=X_test.index)
    
    # Tune and score the classifier on this subset.
    model, test_predictions, train_roc, test_roc, train_acc, test_acc = train_custom_classifier(
        X_train=X_train_transformed, 
        y_train=y_train_subset, 
        X_test=X_test_transformed, 
        y_test=y_test, 
        clf=d_clf, 
        params=d_param_grid)
    
    scores.append((cutoff, train_roc, test_roc, train_acc, test_acc))

In [None]:
df = pd.DataFrame(data=scores, columns=['data_size', 'training_roc', 'testing_roc', "training_acc", "testing_acc"])

In [None]:
plt.plot(df.data_size, df.training_roc, label='training_roc')
plt.plot(df.data_size, df.testing_roc, label='testing_roc')
plt.xlabel("Data Size")
plt.ylabel("ROC")
plt.title("Error Analysis")
plt.legend()

In [None]:
plt.plot(df.data_size, df.training_acc, label='training_acc')
plt.plot(df.data_size, df.testing_acc, label='testing_acc')
plt.xlabel("Data Size")
plt.ylabel("Accuracy")
plt.title("Error Analysis")
plt.legend()

In [None]:
income_variables = ['applicant_income', 'coapplicant_income']
loan_variable = ['loan_amount']
loan_term_variable = ['loan_amount_term']

In [None]:
from feature_engine.creation.math_features import MathFeatures
from feature_engine.creation.relative_features import RelativeFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from feature_engine.selection import DropFeatures

In [None]:
class MultiplyColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that multiplies selected DataFrame columns by a constant.

    Parameters
    ----------
    multiply_by : number, default 1
        Factor applied to every selected column.
    variables : list of str or None, default None
        Columns to scale. When falsy, ``transform`` returns its input unchanged.
    """
    
    def __init__(self, multiply_by=1, variables=None):
        self.multiply_by = multiply_by
        self.variables = variables
    
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for chaining."""
        return self
    
    def transform(self, X, y=None):
        """Return ``X`` with the configured columns multiplied by ``multiply_by``.

        The caller's frame is never modified: when there is something to
        scale, the work is done on a copy. Otherwise re-running a cell would
        multiply the same columns again.
        """
        if not self.variables:
            return X

        X = X.copy()
        X[self.variables] = X[self.variables] * self.multiply_by
        return X

In [None]:
cat_transformer = miss_forest_categorical_transformer()
num_transformer = miss_forest_numerical_transformer()
        

feature_transformer = Pipeline(
    steps=[
        ("multiply_by_thousand",
         MultiplyColumns(
             multiply_by=1000,
             variables=loan_variable
         )
        ),
        ("add_columns",
         MathFeatures(
             variables=income_variables,
             func='sum'
         )   
        ),
        ("income_to_loan_ratio",
         RelativeFeatures(variables=[f"sum_{income_variables[0]}_{income_variables[1]}"],
                          reference=loan_variable,
                          func=["div"]
                         )
        ),
        ("emi",
         RelativeFeatures(variables=loan_variable,
                          reference=loan_term_variable,
                          func=["div"])
        ),
        ("drop_features",
         DropFeatures(features_to_drop=income_variables
          ))
    ]
)


X_train_cat_imputed = cat_transformer.fit_transform(X_train[cat_cols])
X_test_cat_imputed = cat_transformer.transform(X_test[cat_cols])

X_train_cat_imputed_df = pd.DataFrame(data=X_train_cat_imputed, 
                                      columns=cat_transformer.get_feature_names_out(),
                                      index=X_train.index) 

X_test_cat_imputed_df = pd.DataFrame(data=X_test_cat_imputed, 
                                     columns=cat_transformer.get_feature_names_out(),
                                     index=X_test.index)

X_train_cat_imputed_df = pd.concat([X_train_cat_imputed_df, X_train[num_cols]], axis=1)
X_test_cat_imputed_df = pd.concat([X_test_cat_imputed_df, X_test[num_cols]], axis=1)


X_train_imputed = num_transformer.fit_transform(X_train_cat_imputed_df)
X_test_imputed = num_transformer.transform(X_test_cat_imputed_df)

X_train_imputed_df = pd.DataFrame(data=X_train_imputed, 
                                   columns=num_transformer.get_feature_names_out(),
                                   index=X_train.index)

X_test_imputed_df = pd.DataFrame(data=X_test_imputed, 
                                  columns=num_transformer.get_feature_names_out(),
                                  index=X_test.index)


X_train_transformed = feature_transformer.fit_transform(X_train_imputed_df)
X_test_transformed = feature_transformer.transform(X_test_imputed_df)

In [None]:
model, test_predictions, train_roc, test_roc, train_acc, test_acc = train_custom_classifier(
    X_train=X_train_transformed, 
    y_train=y_train, 
    X_test=X_test_transformed, 
    y_test=y_test, 
    clf=d_clf, 
    params=d_param_grid)

In [None]:
cm = confusion_matrix(y_test, test_predictions, labels=model.classes_, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
import imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter
print(imblearn.__version__)

In [None]:
# Grid over ADASYN settings: for every (sampling strategy, neighbour count)
# pair, oversample the training data, retrain the classifier and record
# its train/test metrics.
results = []
over_sampling = [0.65,0.7, 0.75, 0.8, 'auto']
n_neighbours = [1,3,5,7,9,10]

# Class balance of the original training labels; it does not change per iteration.
counts_before = Counter(y_train)

# The loop variable is called `sampling_strategy` rather than `os` so it does
# not shadow the standard-library module name in the notebook namespace.
for sampling_strategy in over_sampling:
    for k in n_neighbours:
        oversample = ADASYN(random_state=1, sampling_strategy=sampling_strategy, n_neighbors=k)
        counter = counts_before
        print(f"data size before applying smote technique is {counter}")
        X_train_synthetic, y_train_synthetic = oversample.fit_resample(X_train_transformed, y_train)

        counter = Counter(y_train_synthetic)
        print(f"data size after applying smote technique is {counter}")

        model, test_predictions, train_roc, test_roc, train_acc, test_acc = train_custom_classifier(
            X_train=X_train_synthetic,
            y_train=y_train_synthetic,
            X_test=X_test_transformed,
            y_test=y_test,
            clf=d_clf,
            params=d_param_grid)

        results.append((sampling_strategy, k, train_roc, test_roc, train_acc, test_acc))

synthetic_df = pd.DataFrame(columns=['os_strategy', "n_neighbours", "train_roc", "test_roc", "train_acc", "test_acc"], data=results)

In [None]:
synthetic_df.sort_values(by="test_acc", ascending=False)

In [None]:
# Name of the oversampling technique used in the log messages below. It was
# never defined before, so this cell raised NameError on a fresh kernel.
tech_name = "ADASYN"

counter = Counter(y_train)
print(f"data size before applying smote technique {tech_name} is {counter}")
# transform the dataset
oversample = ADASYN(random_state=1, n_neighbors=7, sampling_strategy=0.75)
X_train_synthetic, y_train_synthetic = oversample.fit_resample(X_train_transformed, y_train)

counter = Counter(y_train_synthetic)
print(f"data size after applying smote technique {tech_name} is {counter}")

# Retrain on the oversampled training data; the test set is left untouched.
model, test_predictions, train_roc, test_roc, train_acc, test_acc = train_custom_classifier(
    X_train=X_train_synthetic,
    y_train=y_train_synthetic,
    X_test=X_test_transformed,
    y_test=y_test,
    clf=d_clf,
    params=d_param_grid)


In [None]:
np.random.seed(1)
data = {
    "id": np.linspace(start=1, stop=10, num=10, dtype=int),
    "population" : np.random.randint(low=1000, high=100000, size=10),
    "property_area": ["urban"]*4 + ["semi_urban"]*5 + ["rural"]*1
}

df = pd.DataFrame(data=data)

In [None]:
df.head()

In [None]:
df.property_area.value_counts(normalize=True)

In [None]:
df.property_area.isin(['rural', 'urban']) == False

In [None]:
sum(df.property_area.isin(['rural', 'urban']) == False) / df.shape[0]

In [None]:
df['true_property_area'] = df.population.apply(lambda value: 'rural' if value <= 20000 else 'urban')

In [None]:
df[['true_property_area', 'property_area', 'population']]

In [None]:
sum(df.property_area == df.true_property_area) / df.shape[0]

In [None]:
accuracy_score(y_pred=df.property_area, y_true=df.true_property_area)

In [None]:
from datetime import datetime, timedelta
import warnings

In [None]:
numdays = 100
base = datetime.today() 
date_list = [base - timedelta(days=day) for day in range(numdays)] # Subtracting offsets of 0 to numdays-1 (0..99) days from today's date, newest first

In [None]:
[date.date().strftime('%Y-%m-%d') for date in date_list[0:10]]

In [None]:
np.random.seed(1) 
data = {
    "id": np.linspace(start=1, stop=100, num=100, dtype=int),
    "population" : np.random.randint(low=1000, high=100000, size=100),
    "property_area": ["urban"]*40 + ["semi_urban"]*50 + ["rural"]*10,
    "date_loaded": date_list
}

df = pd.DataFrame(data=data)

In [None]:
df.head()

In [None]:
(datetime.now() - df.date_loaded.max()).days

In [None]:
def check_data_recency_days(df: pd.DataFrame, loaded_at_column: str, warning_at: int=5, error_at: int=10):
    """Report how stale a dataset is, based on its most recent load timestamp.

    Parameters
    ----------
    df : pd.DataFrame
        Data to check; it is only read.
    loaded_at_column : str
        Column holding the load timestamps; its maximum is compared to now.
    warning_at : int, default 5
        Age in whole days from which a warning is issued.
    error_at : int, default 10
        Age in whole days from which the data is rejected.

    Raises
    ------
    ValueError
        If the age is neither below ``warning_at`` nor below ``error_at``.
    """
    latest_load = df[loaded_at_column].max()
    age_in_days = (datetime.now() - latest_load).days

    # Fresh enough: just report.
    if age_in_days < warning_at:
        print(f"Data is fresh and is {age_in_days} days old")
        return

    # Past the warning threshold but not yet at the error threshold.
    if age_in_days < error_at:
        warnings.warn(f"Warning: Data is not fresh, and is {age_in_days} days old")
        return

    raise ValueError(f"Date provided is too old and stale, please contact source provider: {age_in_days} days old")

In [None]:
check_data_recency_days(df, "date_loaded")

In [None]:
df_filter_6_days = df[df.date_loaded <= (datetime.today() -  timedelta(days=6))] 
df_filter_12_days = df[df.date_loaded <= (datetime.today() -  timedelta(days=12))]

In [None]:
check_data_recency_days(df_filter_6_days, "date_loaded")

In [None]:
import alibi
from alibi_detect.cd import TabularDrift

In [None]:
cd = TabularDrift(x_ref=X_train_transformed.to_numpy(), p_val=.05 )

In [None]:
preds = cd.predict(X_test_transformed.to_numpy())
labels = ['No', 'Yes']
print('Drift: {}'.format(labels[preds['data']['is_drift']]))

In [None]:
X_test_transformed.head()

In [None]:
# Simulate drift on the test features: inflate loan amounts by 50% and total
# income by 20%, then recompute the two ratio features derived from them.
# NOTE(review): assumes both ratio columns already exist in X_test_transformed
# (created by the feature pipeline) - confirm the names match.
# Bracket assignment always writes a DataFrame column; attribute-style
# assignment would silently create a plain Python attribute if the column
# were missing.
X_test_transformed['loan_amount'] = X_test_transformed['loan_amount']*1.5
X_test_transformed['sum_applicant_income_coapplicant_income'] = X_test_transformed['sum_applicant_income_coapplicant_income']*1.2
X_test_transformed['sum_applicant_income_coapplicant_income_div_loan_amount'] = X_test_transformed['sum_applicant_income_coapplicant_income']/X_test_transformed['loan_amount']
X_test_transformed['loan_amount_div_loan_amount_term'] = X_test_transformed['loan_amount']/X_test_transformed['loan_amount_term']

In [None]:
preds = cd.predict(X_test_transformed.to_numpy())
labels = ['No', 'Yes']
print('Drift: {}'.format(labels[preds['data']['is_drift']]))

In [None]:
testing_predictions_prob = model.predict_proba(X_test_transformed)
testing_predictions = model.predict(X_test_transformed)

testing_roc_auc = roc_auc_score(y_test, testing_predictions_prob[:,1])
testing_acc = accuracy_score(y_test, testing_predictions)

print(f"Testing roc is {testing_roc_auc} and testing_acc as {testing_acc}")