In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score



warnings.filterwarnings(action='ignore')

class Colors:
    Gray = "#5d5d5d"
    LightGray = "#fafafa"
    Black = "#000000"
    White = "#FFFFFF"
    Teal = "#008080"
    Aquamarine = "#76c8c8"
    Blue = "#2596be"
    LightCyan = "#badbdb"
    WhiteSmoke = "#dedad2"
    Cream = "#e4bcad"
    PeachPuff = "#df979e"
    HotPink = "#d7658b"
    DeepPink = "#c80064"
    LightSeaGreen = "#20B2AA"
    DarkGray = "#464144"

In [4]:
# read dataset
def read_banking_data(filename) -> pd.DataFrame:
    data = pd.read_csv(filename, delimiter=';')

    data = data[data['age'] != 'age']

    numeric = ['age', 'balance', 'duration', 'campaign']
    for att in numeric:
        data[att] = pd.to_numeric(data[att])

    return data


df = read_banking_data(filename='Data_train.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,subscribed
0,58,management,married,tertiary,no,2143,yes,no,unknown,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,198,1,no


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   age         45211 non-null  int64 
 1   job         45211 non-null  object
 2   marital     45211 non-null  object
 3   education   45211 non-null  object
 4   default     45211 non-null  object
 5   balance     45211 non-null  int64 
 6   housing     45211 non-null  object
 7   loan        45211 non-null  object
 8   contact     45211 non-null  object
 9   duration    45211 non-null  int64 
 10  campaign    45211 non-null  int64 
 11  subscribed  45211 non-null  object
dtypes: int64(4), object(8)
memory usage: 4.1+ MB


In [6]:
# subset outliers function
def handling_outlier() -> pd.DataFrame:
    cols = [
        "age",
        "balance",
        "duration",
        "campaign",
    ]
    Q1 = df[cols].quantile(0.25)
    Q3 = df[cols].quantile(0.75)
    IQR = Q3 - Q1
    df2 = df[
        ~((df[cols] < (Q1 - 1.5 * IQR)) | (df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)
    ]
    return df2


df2 = handling_outlier()

In [7]:
# Data Prep

def data_prep(*var_name, dataframe) -> pd.DataFrame:
    data = dataframe.groupby([*var_name]).size().to_frame(name = 'count').reset_index()
    data['percentage'] = data['count'].apply(lambda x: x / sum(data['count']) * 100)
    
    return data

In [8]:
col_names = ['job', 'education', 'contact']

for col in col_names:
    print(f'Before | {col} = {df2[col].unique()}')
    df2[col].replace(['unknown'], df2[col].mode(), inplace=True)
    print(f'After | {col} = {df2[col].unique()}\n')


Before | job = ['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
After | job = ['management' 'technician' 'entrepreneur' 'blue-collar' 'retired' 'admin.'
 'services' 'self-employed' 'unemployed' 'housemaid' 'student']

Before | education = ['tertiary' 'secondary' 'unknown' 'primary']
After | education = ['tertiary' 'secondary' 'primary']

Before | contact = ['unknown' 'cellular' 'telephone']
After | contact = ['cellular' 'telephone']



In [9]:
#from sklearn.preprocessing import LabelEncoder
class Encoding:
    def __init__(self, data: pd.DataFrame):
        self.dataframe = data
        self.data = None
    
    def encoding_binary_data(self, binary_data: list):
        for col in binary_data:
            if col == 'contact':
                self.dataframe[col] = self.dataframe[col].map({
                    'telephone': 1,
                    'cellular' : 0
                })

            else:
                self.dataframe[col] = self.dataframe[col].map({
                    'yes': 1,
                    'no' : 0
                })
        
        self.data = self.dataframe
        return self
    
    def encoding_nonbinary_data(self, var: list):
        for col in var:
            self.dataframe[col] = LabelEncoder().fit_transform(self.dataframe[col])
        
        self.data = self.dataframe
        return self
    
    def fetch(self) -> pd.DataFrame:
        return self.data
    
encoding = Encoding(data = df2)\
    .encoding_binary_data(['default', 'housing', 'loan', 'contact'])\
    .encoding_nonbinary_data(['job', 'marital', 'education','subscribed'])

In [10]:
df2 = encoding.fetch()
df2.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,subscribed
0,58,4,1,2,0,2143,1,0,0,261,1,0
1,44,9,2,1,0,29,1,0,0,151,1,0
2,33,2,1,1,0,2,1,1,0,76,1,0
3,47,1,1,1,0,1506,1,0,0,92,1,0
4,33,1,2,1,0,1,0,0,0,198,1,0


In [11]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34719 entries, 0 to 45210
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   age         34719 non-null  int64
 1   job         34719 non-null  int32
 2   marital     34719 non-null  int32
 3   education   34719 non-null  int32
 4   default     34719 non-null  int64
 5   balance     34719 non-null  int64
 6   housing     34719 non-null  int64
 7   loan        34719 non-null  int64
 8   contact     34719 non-null  int64
 9   duration    34719 non-null  int64
 10  campaign    34719 non-null  int64
 11  subscribed  34719 non-null  int32
dtypes: int32(4), int64(8)
memory usage: 2.9 MB


In [12]:
#from sklearn.model_selection import train_test_split, cross_val_score

x = df2.drop('subscribed', axis = 1)
y = df2.loc[:, 'subscribed']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [13]:
#from imblearn.over_sampling import SMOTE

oversampling = SMOTE(random_state = 1)
X_train_oversampling, y_train_oversampling = oversampling.fit_resample(X_train, y_train)

In [14]:
#from sklearn.naive_bayes import GaussianNB
#from sklearn.model_selection import GridSearchCV


nb_params = {"var_smoothing": np.logspace(0, -9, num = 100)}

gcv_nb = GridSearchCV(GaussianNB(), 
                   param_grid = nb_params, 
                   cv = 10, 
                   scoring = 'f1')


nb_gcv = gcv_nb.fit(X_train_oversampling, y_train_oversampling)
print(f'Best Parameter: {nb_gcv.best_params_}')
print(f'Best Score: {nb_gcv.best_score_}')

Best Parameter: {'var_smoothing': 1.232846739442066e-07}
Best Score: 0.8143327625314871


In [15]:
#from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

nb_pred = nb_gcv.best_estimator_.predict(X_test)

print(classification_report(y_test, nb_pred))
print('Accuracy Score: ',accuracy_score(y_test,nb_pred))
print(f'F1 Score: {f1_score(y_test,nb_pred)}\n')

              precision    recall  f1-score   support

           0       0.97      0.68      0.80      9564
           1       0.17      0.74      0.28       852

    accuracy                           0.69     10416
   macro avg       0.57      0.71      0.54     10416
weighted avg       0.90      0.69      0.76     10416

Accuracy Score:  0.6865399385560675
F1 Score: 0.27781464277814644



In [16]:
#df2.to_csv('raw_data.csv', index=False)

In [17]:
input_data = (55,7,1,1,0,2476,1,0,0,579,1)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

prediction = nb_gcv.predict(input_data_reshape)
print(prediction)

if (prediction[0]==0):
    print('Tidak Berlangganan Deposito Berjangka')
else:
    print('Berlangganan Deposito Berjangka')


[1]
Berlangganan Deposito Berjangka
