In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2_contingency
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split, GridSearchCV
import imblearn
plt.rcParams["patch.force_edgecolor"] = True
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy import stats

In [None]:
# Read the raw train and test CSV files into dataframes
train_original = pd.read_csv('data/train.csv')
test_original = pd.read_csv('data/Test.csv')

In [None]:
# Preview the first rows of the raw test data
test_original.head()

In [None]:
# Preview the first rows of the raw training data
train_original.head()

In [None]:
# Make bins for household size in train.
# pd.cut intervals are right-closed, so with a left edge of 1 a one-person
# household (household_size == 1) would fall outside every bin and become NaN.
# include_lowest=True closes the first interval on the left so it is labelled
# 'single' instead.
household_bins = pd.cut(
    train_original['household_size'],
    [1, 3, 7, 10, 25],
    labels=['single', 'small', 'average', 'big'],
    include_lowest=True,
)
household_bins.name = 'household_sizes'

# Attach the binned column by index and drop the numeric original
train = train_original.join(household_bins, how='inner')
train = train.drop('household_size', axis=1)
train.head()

In [None]:
# Look at the raw training frame again; the binning above was written to `train`, not to `train_original`
train_original.head()

In [None]:
# Make bins for household size in test.
# pd.cut intervals are right-closed, so with a left edge of 1 a one-person
# household (household_size == 1) would fall outside every bin and become NaN.
# include_lowest=True closes the first interval on the left so it is labelled
# 'single' instead.
household_bins = pd.cut(
    test_original['household_size'],
    [1, 3, 7, 10, 25],
    labels=['single', 'small', 'average', 'big'],
    include_lowest=True,
)
household_bins.name = 'household_sizes'

# Attach the binned column by index and drop the numeric original
test = test_original.join(household_bins, how='inner')
test = test.drop('household_size', axis=1)
test.head()

In [None]:
# Look at the raw test frame again; the binning above was written to `test`, not to `test_original`
test_original.head()

In [None]:
# Make bins for age in train
age_bins = pd.cut(train_original['age_of_respondent'], [0, 16, 45, 75, 100], labels=['child', 'adult', 'elder', 'old'])
age_bins.name = 'ages'

train = train_original.join(age_bins, how='inner')
train = train.drop('age_of_respondent', axis=1)
train.head()

In [None]:
# Make bins for age in test
age_bins = pd.cut(test_original['age_of_respondent'], [0, 16, 45, 75, 100], labels=['child', 'adult', 'elder', 'old'])
age_bins.name = 'ages'

test = test_original.join(age_bins, how='inner')
test = test.drop('age_of_respondent', axis=1)
test.head()

In [None]:
# Remove the 'uniqueid' column from both the train and the test frame
train = train.drop(columns='uniqueid')
test = test.drop(columns='uniqueid')

In [None]:
# One-hot encode the training frame; drop_first=True omits the first level of every encoded column
train = pd.get_dummies(data=train, drop_first=True)
train.head().transpose()

In [None]:
# Make dummy variables for test
test = pd.get_dummies(test, drop_first=True)

# Encoding train and test separately can yield different dummy columns when a
# category occurs in only one of the files. Align test to the training feature
# columns (everything except the target): columns missing in test are added as
# all-zero, columns unknown to the training data are dropped, and the column
# order matches what the models are fitted on.
train_feature_columns = [col for col in train.columns if col != 'bank_account_Yes']
test = test.reindex(columns=train_feature_columns, fill_value=0)
test.head().T

In [None]:
# Train-test Split

# Predictors are all columns except the target 'bank_account_Yes'
features = train.columns.drop('bank_account_Yes').tolist()
y = train['bank_account_Yes']
X = train[features]

# Shuffled 70/30 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y,
)

# Report the shape of each part
for label, part in (("X_train", X_train), ("y_train", y_train), ("X_test", X_test), ("y_test", y_test)):
    print(f"{label}:", part.shape)

In [None]:
# Balance the training split by random undersampling of the majority class
undersampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X=X_train, y=y_train)

In [None]:
# Balance the training split by oversampling the minority class with SMOTE
oversampler = SMOTE(random_state=42)
X_train_over, y_train_over = oversampler.fit_resample(X=X_train, y=y_train)

## Decision Tree

In [None]:
# Create Decision Tree
# random_state is pinned (42, like the split and the resamplers) so the fitted
# tree and its metrics are reproducible after a kernel restart.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

In [None]:
# Evaluate the tree on the held-out split: confusion matrix, then per-class report
y_pred_tree = dtree.predict(X_test)
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred_tree))

In [None]:
# Create Decision Tree with undersampling
# random_state is pinned (42, like the split and the resamplers) so the fitted
# tree and its metrics are reproducible after a kernel restart.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_under, y_train_under)

In [None]:
# Evaluate the undersampling-trained tree on the held-out split
y_pred_tree = dtree.predict(X_test)
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred_tree))

In [None]:
# Create Decision Tree with oversampling
# random_state is pinned (42, like the split and the resamplers) so the fitted
# tree and its metrics are reproducible after a kernel restart.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_over, y_train_over)

In [None]:
# Evaluate the oversampling-trained tree on the held-out split
y_pred_tree = dtree.predict(X_test)
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred_tree))

---

## Random Forest

In [None]:
# Create Random Forest
# random_state is pinned (42, like the split and the resamplers) so the fitted
# forest and its metrics are reproducible after a kernel restart.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

In [None]:
# Predict
# Evaluate `forest` (not `dtree`) so the metrics below describe the random forest.
y_pred_forest = forest.predict(X_test)
print(confusion_matrix(y_test, y_pred_forest))
print(classification_report(y_test, y_pred_forest))

In [None]:
# Create Random Forest with undersampling
# random_state is pinned (42, like the split and the resamplers) so the fitted
# forest and its metrics are reproducible after a kernel restart.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train_under, y_train_under)

In [None]:
# Predict with undersampling
# Evaluate `forest` (not `dtree`) so the metrics below describe the random
# forest trained on the undersampled data.
y_pred_forest = forest.predict(X_test)
print(confusion_matrix(y_test, y_pred_forest))
print(classification_report(y_test, y_pred_forest))

In [None]:
# Create Random Forest with oversampling
# random_state is pinned (42, like the split and the resamplers) so the fitted
# forest and its metrics are reproducible after a kernel restart.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train_over, y_train_over)

In [None]:
# Predict with oversampling
# Evaluate `forest` (not `dtree`) so the metrics below describe the random
# forest trained on the oversampled data. The raw prediction array is not
# printed; the confusion matrix and report summarise it.
y_pred_forest = forest.predict(X_test)
print(confusion_matrix(y_test, y_pred_forest))
print(classification_report(y_test, y_pred_forest))

---

### Decision Tree without over- or undersampling has the best precision
__Predict on the test data with the chosen model__

In [None]:
# Preview the encoded test frame, transposed so each column is shown as a row
test.head().T

In [None]:
# Defining X for test
features = test.columns.tolist()
X = test[features]

# The heading above selects the decision tree trained WITHOUT resampling, but
# `dtree` at this point is whatever the last fitting cell assigned (the tree
# trained on the oversampled data). Fit a plain tree on the original training
# split here so the predictions come from the intended model.
dtree_plain = DecisionTreeClassifier(random_state=42)
dtree_plain.fit(X_train, y_train)

# Predict
y_pred_tree = dtree_plain.predict(X)
# Index the predictions like `test` so the assignment below aligns row by row
series = pd.Series(y_pred_tree, index=test.index).astype(int)
series.name = 'bank_account'

# Cast only the prediction to bool. Casting the whole frame with
# .astype(bool) would turn every non-zero numeric feature into True.
test_new = test.assign(bank_account=series.astype(bool))
test_new.head().T

In [None]:
# Bank-account counts in the training data, split by the gender dummy
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.countplot(data=train, x='bank_account_Yes', hue='gender_of_respondent_Male');

In [None]:
# Predicted bank-account counts in the test data, split by the gender dummy
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.countplot(data=test_new, x='bank_account', hue='gender_of_respondent_Male');

### KNN Model

In [None]:
# Inspect the encoded training frame. Only the last expression (the column
# index) is rendered as the cell output; the head() result is discarded.
train.head()
train.columns

Train the KNN model on the training dataset

In [None]:
# Separate the target column from the predictors
y = train['bank_account_Yes']
X = train.drop(columns='bank_account_Yes')

# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 7-nearest-neighbours classifier on the training part
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Predict the held-out rows and print the per-class metrics
y_pred = knn.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

In [None]:
import pickle

# Use every column of the encoded test frame as model input
features = list(test.columns)
X = test[features]

# Persist the fitted KNN model to disk
with open('/tmp/knn.pickle', 'wb') as model_file:
    pickle.dump(knn, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# Predict and keep the result as a boolean Series named 'bank_account'
y_pred_final = knn.predict(X)
series_knn = pd.Series(y_pred_final).astype(int).astype(bool)
series_knn.name = 'bank_account'

In [None]:
# Attach the KNN predictions to the raw test frame as column 'bank_account'.
# assign() overwrites the column if it already exists, so re-running this cell
# does not append a second 'bank_account' column the way pd.concat(axis=1) does.
test_original = test_original.assign(bank_account=series_knn)
test_original.info()

In [None]:
# Combine the raw training frame with the raw test frame (which carries the
# predicted 'bank_account' column at this point).
# No `on=` is passed, so merge() joins on all columns the two frames have in
# common, not on the unique id alone; how='outer' keeps unmatched rows from
# both sides.
final_df = train_original.merge(test_original, how='outer') 
final_df.info()
final_df.tail()

In [None]:
# Show the last rows of the combined frame
final_df.tail()

In [None]:
# Convert boolean cell values to "Yes"/"No"
def bool2yes(boolean):
    """Map a boolean to "Yes"/"No" and pass any other value through unchanged.

    Parameters
    ----------
    boolean : object
        A single cell value of any type.

    Returns
    -------
    object
        "Yes" for a true boolean, "No" for a false boolean; every
        non-boolean input (strings, numbers, None, ...) is returned as is.
    """
    # np.bool_ is not a subclass of Python's bool, so it has to be listed
    # explicitly. Plain integers such as 0/1 are not booleans and stay untouched.
    if isinstance(boolean, (bool, np.bool_)):
        return "Yes" if boolean else "No"
    return boolean
    
# Apply bool2yes to every cell of the combined frame.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas (2.1+) in
# favour of DataFrame.map; confirm the target pandas version before switching.
final_df = final_df.applymap(bool2yes)

In [None]:
# List the distinct values of 'location_type' in the combined frame
final_df['location_type'].unique()

In [None]:
# Export the dataframe into a csv.
# This is live code rather than a disabled string literal because the next
# cell reads 'data/final_df.csv'; with the export disabled, that read fails on
# a checkout where the file has never been written.
# The path is spelled out directly: joining the folder 'data/' and the file
# name with another '/' would give 'data//final_df.csv'.
file_path = 'data/final_df.csv'

# index=False keeps the row index out of the file
final_df.to_csv(file_path, index=False)

In [None]:
# Load the exported frame and keep only the rows whose bank_account is 'Yes'
df = pd.read_csv('data/final_df.csv')
has_account = df['bank_account'] == 'Yes'
df_ba_yes = df.loc[has_account]
df_ba_yes.head()

# Histogram of country, one dodged bar group per job type.
# stat='density' together with common_norm=False normalises each hue group
# on its own, which keeps groups of very different size comparable.
hist_kwargs = dict(x="country", stat='density', common_norm=False, hue='job_type', multiple="dodge", shrink=.8)
ax = sns.histplot(data=df_ba_yes, **hist_kwargs)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1));

In [None]:
# Histogram of country for account holders, one dodged bar group per education level
ax = sns.histplot(
    data=df_ba_yes,
    x="country",
    hue='education_level',
    stat='density',
    common_norm=False,
    multiple="dodge",
    shrink=.8,
)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1));

In [None]:
# Count how often each marital status occurs in the combined frame
final_df['marital_status'].value_counts()