In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score

In [2]:
# Read the bank dataset from a CSV path relative to the working directory,
# then preview the first ten rows.
DATA_PATH = "bank-full-dataset.csv"
data = pd.read_csv(DATA_PATH)
data.head(n=10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no
5,35,management,married,tertiary,no,231,yes,no,,5,may,139,1,-1,0,,no
6,28,management,single,tertiary,no,447,yes,yes,,5,may,217,1,-1,0,,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,,5,may,380,1,-1,0,,no
8,58,retired,married,primary,no,121,yes,no,,5,may,50,1,-1,0,,no
9,43,technician,single,secondary,no,593,yes,no,,5,may,55,1,-1,0,,no


In [3]:
# Count missing values per column.
data.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64

In [4]:
# Drop the two columns with the highest missing-value counts
# (see the per-column null counts printed above).
data = data.drop(columns=["contact", "poutcome"])

In [5]:
# Convert the nominal columns (and the target "y") to integer labels.
# Each column is cast to str first, so missing entries become the literal
# text 'nan' and receive their own label instead of breaking the encoder.
features_to_convert = ["job","marital","default","housing","loan","month","y"]
enc = LabelEncoder()

for column in features_to_convert:
    as_text = data[column].astype('str')
    # fit_transform refits the shared encoder on every column, so the
    # label mapping of earlier columns is not retained in `enc`.
    data[column] = enc.fit_transform(as_text)

In [6]:
# Ordinal-encode education: the position of each label in `encoder` is its
# integer code, and missing values are treated as 'unknown' (code 0).
encoder = ['unknown','primary','secondary','tertiary']

# Work on a local series (cast to str once) so `data` is only touched after
# every label has been replaced and the integer cast has succeeded.
education = data['education'].fillna('unknown').astype('str')

for code, label in enumerate(encoder):
    # regex=False pins literal matching; the default for `regex` differs
    # between pandas versions, so do not rely on it.
    education = education.str.replace(label, str(code), regex=False)

# The cast raises if any value was not one of the labels listed in `encoder`.
data['education'] = education.astype('int64')
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,y
0,58,4,1,3,0,2143,1,0,5,8,261,1,-1,0,0
1,44,10,2,2,0,29,1,0,5,8,151,1,-1,0,0
2,33,2,1,2,0,2,1,1,5,8,76,1,-1,0,0
3,47,1,1,0,0,1506,1,0,5,8,92,1,-1,0,0
4,33,5,2,0,0,1,0,0,5,8,198,1,-1,0,0


In [7]:
# For every column, count the values lying more than three standard
# deviations from the column mean. Each entry of `outliers` maps
# column name -> [outlier count, total row count].
outliers = {}
n_rows = data.shape[0]

for column in data.columns:
    values = data[column]
    # Compute the statistics once per column instead of once per bound.
    center = values.mean()
    spread = values.std()
    min_t = center - (3 * spread)
    max_t = center + (3 * spread)

    # Vectorized comparison replaces the per-element Python loop;
    # int() keeps the count a plain Python integer in the printed dict.
    outside = (values < min_t) | (values > max_t)
    outliers[column] = [int(outside.sum()), n_rows]

print(outliers)

{'age': [381, 45211], 'job': [0, 45211], 'marital': [0, 45211], 'education': [0, 45211], 'default': [815, 45211], 'balance': [745, 45211], 'housing': [0, 45211], 'loan': [0, 45211], 'day': [0, 45211], 'month': [0, 45211], 'duration': [963, 45211], 'campaign': [840, 45211], 'pdays': [1723, 45211], 'previous': [582, 45211], 'y': [0, 45211]}


In [8]:
# Separate the target column "y" from the feature matrix.
Y = data["y"]
X = data.drop(columns="y")

In [9]:
# Hold out 20% of the rows as the test set, then carve a dev set of the
# same size out of the remainder; what is left is the training set.
X_new, X_test, Y_new, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# An integer test_size is an absolute number of samples, so the dev set is
# guaranteed to have exactly as many rows as the test set. A float ratio
# (n_test / n_remaining) goes through floating-point rounding and could
# come out one row off.
test_size = X_test.shape[0]
X_train, X_dev, Y_train, Y_dev = train_test_split(X_new, Y_new, test_size=test_size, random_state=0)

print(X_train.shape, Y_train.shape, X_dev.shape, Y_dev.shape, X_test.shape, Y_test.shape)

(27125, 14) (27125,) (9043, 14) (9043,) (9043, 14) (9043,)


### First attempt

In [10]:
# Baseline decision tree: default hyperparameters, fixed seed.
model_tree = DecisionTreeClassifier(random_state=2)
model_tree.fit(X=X_train, y=Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=2, splitter='best')

In [11]:
# Baseline multilayer perceptron: default hyperparameters, fixed seed.
# NOTE(review): the features are passed in unscaled; consider standardizing
# them before fitting the network — confirm whether that is intended.
model_NN = MLPClassifier(random_state=2)
model_NN.fit(X=X_train, y=Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=2, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [12]:
# Precision of the decision tree on the train, dev and test splits (in that order).
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

precision = []
for features, labels in zip(X_sets, Y_sets):
    predicted = model_tree.predict(features)
    precision.append(precision_score(labels, predicted))

print(precision)

[1.0, 0.43909348441926344, 0.4208059981255858]

In [13]:
# Precision of the neural network on the train, dev and test splits (in that order).
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

precision = []
for features, labels in zip(X_sets, Y_sets):
    predicted = model_NN.predict(features)
    precision.append(precision_score(labels, predicted))

print(precision)

[0.35577647236029525, 0.35199283475145543, 0.3470483005366726]

### Second attempt

In [14]:
# Second decision tree: require at least 100 samples per leaf and cap the
# depth at 100, keeping the same seed as the first attempt.
model_tree = DecisionTreeClassifier(
    random_state=2,
    min_samples_leaf=100,
    max_depth=100,
)
model_tree.fit(X=X_train, y=Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=100, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=2, splitter='best')

In [15]:
# Second neural network: five hidden layers and a higher iteration cap,
# keeping the same seed as the first attempt.
model_NN = MLPClassifier(
    random_state=2,
    max_iter=1000,
    hidden_layer_sizes=[100,100,50,25,25],
    tol=1e-4,
)
model_NN.fit(X=X_train, y=Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=[100, 100, 50, 25, 25],
              learning_rate='constant', learning_rate_init=0.001, max_fun=15000,
              max_iter=1000, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5, random_state=2,
              shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
              verbose=False, warm_start=False)

In [16]:
# Precision of the refitted decision tree on the train, dev and test splits (in that order).
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

precision = []
for features, labels in zip(X_sets, Y_sets):
    predicted = model_tree.predict(features)
    precision.append(precision_score(labels, predicted))

print(precision)

[0.6073670992046881, 0.5691158156911582, 0.5448113207547169]

In [17]:
# Precision of the refitted neural network on the train, dev and test splits (in that order).
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

precision = []
for features, labels in zip(X_sets, Y_sets):
    predicted = model_NN.predict(features)
    precision.append(precision_score(labels, predicted))

print(precision)

[0.759941089837997, 0.5920398009950248, 0.5509259259259259]