## Week 7: Machine Learning & Data Mining

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

### Q1. Predicting the class ‘Survived’ with Decision tree, KNN, Na¨ıve Bayes classifiers.

In [2]:
# loading Titanic data

titanic = pd.read_csv('./titanic.csv')
print("Number of points in original data: {}".format(len(titanic.index)))

columns = titanic.columns
print("Features present in dataset: \n", list(columns))
titanic.head(5)

Number of points in original data: 887
Features present in dataset: 
 ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


#### Converting continuous attaribute into classes

In [3]:
conditions = [(titanic['Age'] < 25.0),(titanic['Age'] > 45.0), 
              (titanic['Age'] > 25.0) & (titanic['Age'] < 45.0)]

values = [1, 3, 2]
titanic['New_age'] = np.select(conditions, values)
#titanic
conditions = [(titanic['Fare'] < 15),(titanic['Fare'] > 50), 
              (titanic['Fare'] > 15) & (titanic['Fare'] < 50)]

values = [1, 3, 2]
titanic['New_Fare'] = np.select(conditions, values)

#titanic.loc[titanic['Siblings/Spouses Aboard'] == 1 , 'Siblings/Spouses Aboard'] = 'True'
#titanic

titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 1
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 0

#### Removing continuous attributes after creating new attribute of same column 

In [4]:
titanic.drop(columns=['Age','Fare'], axis = 1, inplace = True)
titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Siblings/Spouses Aboard,Parents/Children Aboard,New_age,New_Fare
0,0,3,Mr. Owen Harris Braund,1,1,0,1,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,0,1,0,2,3
2,1,3,Miss. Laina Heikkinen,0,0,0,2,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,1,0,2,3
4,0,3,Mr. William Henry Allen,1,0,0,2,1
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,1,0,0,2,1
883,1,1,Miss. Margaret Edith Graham,0,0,0,1,2
884,0,3,Miss. Catherine Helen Johnston,0,1,2,1,2
885,1,1,Mr. Karl Howell Behr,1,0,0,2,2


In [5]:
le = preprocessing.LabelEncoder()

pd.set_option('display.max_colwidth', None)
x = titanic[["Pclass","Sex","Siblings/Spouses Aboard","Parents/Children Aboard", "New_age","New_Fare"]]
y = le.fit(titanic["Survived"])
y = le.transform(titanic["Survived"])


# set the random state 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.35, random_state=0)

print("No of training samples: {}".format(x_train.shape))
print("No of test samples    : {}".format(x_test.shape))
print("y training samples    : {}".format(y_train.shape))
print("y test samples        : {}".format(y_test.shape))
x_train.head(5)

No of training samples: (576, 6)
No of test samples    : (311, 6)
y training samples    : (576,)
y test samples        : (311,)


Unnamed: 0,Pclass,Sex,Siblings/Spouses Aboard,Parents/Children Aboard,New_age,New_Fare
243,3,1,0,0,2,1
518,3,1,0,0,1,1
35,1,1,1,0,2,3
81,3,0,0,0,2,1
159,3,1,0,1,2,2


#### a. Accuracy of the classifies with 5-fold CV

In [6]:
k = 5
dt_kfold = KFold(n_splits = k, random_state = None)
dt_model = DecisionTreeClassifier()
dt_acc_score = []
#print(x_train.shape)
dt_precision = []
dt_recall = []
dt_f1 = []

for train_index, test_index in dt_kfold.split(x):
    #print(train_index.shape, test_index.shape)
    dt_x_train, dt_x_test = x.iloc[train_index,:], x.iloc[test_index,:]
    dt_y_train, dt_y_test = y[train_index], y[test_index]
    
    dt_model.fit(dt_x_train,dt_y_train)
    dt_predict = dt_model.predict(dt_x_test)
    dt_acc = accuracy_score(dt_predict, dt_y_test)
    dt_acc_score.append(dt_acc)
    
    precision_tree = precision_score(dt_y_test, dt_predict)
    dt_precision.append(precision_tree)
    
    recall_tree = recall_score(dt_y_test, dt_predict)
    dt_recall.append(recall_tree)
    
    f1_tree = f1_score(dt_y_test, dt_predict)
    dt_f1.append(f1_tree)

avg_dt_acc_score = sum(dt_acc_score)/k
print("Accuracy of decision tree of each fold is:\n {}".format(dt_acc_score))
print("Average accuracy of decision tree is {:.2f}%".format(100*avg_dt_acc_score))

Accuracy of decision tree of each fold is:
 [0.7752808988764045, 0.7921348314606742, 0.807909604519774, 0.768361581920904, 0.8248587570621468]
Average accuracy of decision tree is 79.37%


#### Precision, recall, f1 of decion tree classifier

In [7]:
avg_dt_precision = sum(dt_precision)/k
print("Precision of decision tree of each fold is:\n{}".format(dt_precision))
print("precision accuracy of decision tree is:    {:.2f}% \n".format(100*avg_dt_precision))

avg_dt_recall = sum(dt_recall)/k
print("Recall of decision tree of each fold is:\n{}".format(dt_recall))
print("Recall accuracy of decision tree is:       {:.2f}% \n".format(100*avg_dt_recall))

avg_dt_f1 = sum(dt_f1)/k
print("F1 of decision tree of each fold is:\n{} ".format(dt_f1))
print("F1 accuracy of decision tree is:           {:.2f}% \n".format(100*avg_dt_f1))

Precision of decision tree of each fold is:
[0.6727272727272727, 0.8088235294117647, 0.78125, 0.7586206896551724, 0.7758620689655172]
precision accuracy of decision tree is:    75.95% 

Recall of decision tree of each fold is:
[0.6271186440677966, 0.6962025316455697, 0.7142857142857143, 0.6197183098591549, 0.7142857142857143]
Recall accuracy of decision tree is:       67.43% 

F1 of decision tree of each fold is:
[0.6491228070175439, 0.7482993197278912, 0.7462686567164178, 0.6821705426356589, 0.7438016528925621] 
F1 accuracy of decision tree is:           71.39% 



#### Finding accuracy of K-fold, precision, recall and f1-score of KNN 

In [8]:
k = 5
knn_kfold = KFold(n_splits = k, random_state = None)
knn_model = KNeighborsClassifier()
knn_acc_score = []
#print(x_train.shape)
knn_precision = []
knn_recall = []
knn_f1 = []

for train_index_knn, test_index_knn in knn_kfold.split(x):
    #print(train_index.shape, test_index.shape)
    knn_x_train, knn_x_test = x.iloc[train_index_knn,:], x.iloc[test_index_knn,:]
    knn_y_train, knn_y_test = y[train_index_knn], y[test_index_knn]
    
    knn_model.fit(knn_x_train,knn_y_train)
    knn_predict = knn_model.predict(knn_x_test)
    knn_acc = accuracy_score(knn_predict, knn_y_test)
    knn_acc_score.append(knn_acc)
    
    precision_knn = precision_score(knn_y_test, knn_predict)
    knn_precision.append(precision_knn)
    
    recall_knn = recall_score(knn_y_test, knn_predict)
    knn_recall.append(recall_knn)
    
    f1_knn = f1_score(knn_y_test, knn_predict)
    knn_f1.append(f1_knn)

avg_knn_acc_score = sum(knn_acc_score)/k
print("Accuracy of KNN of each fold is:\n{}".format(knn_acc_score))
print("Average accuracy of knn is:       {:.2f}%".format(100*avg_knn_acc_score))

Accuracy of KNN of each fold is:
[0.7528089887640449, 0.7921348314606742, 0.8022598870056498, 0.7966101694915254, 0.8305084745762712]
Average accuracy of knn is:       79.49%


In [9]:
avg_knn_precision = sum(knn_precision)/k
print("Precision of KNN of each fold is:\n{}".format(knn_precision))
print("precision accuracy of KNN is:    {:.2f}% \n".format(100*avg_knn_precision))

avg_knn_recall = sum(knn_recall)/k
print("Recall of KNN of each fold is:\n{}".format(knn_recall))
print("Recall accuracy of KNN is:       {:.2f}% \n".format(100*avg_knn_recall))

avg_knn_f1 = sum(knn_f1)/k
print("F1 of KNN of each fold is:\n{} ".format(knn_f1))
print("F1 accuracy of KNN is:           {:.2f}% \n".format(100*avg_knn_f1))

Precision of KNN of each fold is:
[0.6119402985074627, 0.7763157894736842, 0.7536231884057971, 0.8571428571428571, 0.8666666666666667]
precision accuracy of KNN is:    77.31% 

Recall of KNN of each fold is:
[0.6949152542372882, 0.7468354430379747, 0.7428571428571429, 0.5915492957746479, 0.6190476190476191]
Recall accuracy of KNN is:       67.90% 

F1 of KNN of each fold is:
[0.6507936507936508, 0.7612903225806452, 0.748201438848921, 0.7000000000000001, 0.7222222222222222] 
F1 accuracy of KNN is:           71.65% 



#### After calculated KNN classifier by direct devide the data into training and test got accuracy 79.10% and by deviding the training data into 5 folds got an average accuracy of 79.49% I observed that both accuracies are almost same.

In [10]:
knnclassifier = KNeighborsClassifier(n_neighbors = 3, metric = 'cosine')
knnclassifier.fit(x_train.values, y_train)
knn_y_pred = knnclassifier.predict(x_test.values)

knn_acc = accuracy_score(y_test, knn_y_pred)
print("KNN Accuracy   : {:.2f}%".format(100*knn_acc))

knn_precision = precision_score(y_test, knn_y_pred)
print("KNN precision  : {:.2f}%".format(100*knn_precision))

knn_recall = recall_score(y_test, knn_y_pred)
print("KNN Accuracy   : {:.2f}%".format(100*knn_recall))

knn_f1 = f1_score(y_test, knn_y_pred)
print("KNN Accuracy   : {:.2f}%".format(100*knn_f1))

print("KNN confusion_matrix :\n{}".format(confusion_matrix(y_test,knn_y_pred)))

KNN_report = classification_report(y_test, knn_y_pred)
print(KNN_report)

KNN Accuracy   : 79.10%
KNN precision  : 75.00%
KNN Accuracy   : 69.42%
KNN Accuracy   : 72.10%
KNN confusion_matrix :
[[162  28]
 [ 37  84]]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       190
           1       0.75      0.69      0.72       121

    accuracy                           0.79       311
   macro avg       0.78      0.77      0.78       311
weighted avg       0.79      0.79      0.79       311



#### Finding accuracy of K-fold, precision, recall and f1-score of Naive bays classifier

In [11]:
k = 5
g_kfold = KFold(n_splits = k, random_state = None)
g_model = GaussianNB()
g_acc_score = []
#print(x_train.shape)
g_precision = []
g_recall = []
g_f1 = []

for train_index_g, test_index_g in g_kfold.split(x):
    #print(train_index.shape, test_index.shape)
    g_x_train, g_x_test = x.iloc[train_index_g,:], x.iloc[test_index_g,:]
    g_y_train, g_y_test = y[train_index_g], y[test_index_g]
    
    g_model.fit(g_x_train,g_y_train)
    g_predict = g_model.predict(g_x_test)
    g_acc = accuracy_score(g_predict, g_y_test)
    g_acc_score.append(g_acc)
    
    precision_g = precision_score(g_y_test, g_predict)
    g_precision.append(precision_g)
    
    recall_g = recall_score(g_y_test, g_predict)
    g_recall.append(recall_g)
    
    f1_g = f1_score(g_y_test, g_predict)
    g_f1.append(f1_g)

avg_g_acc_score = sum(g_acc_score)/k
print("Accuracy of Gaussian of each fold is:\n{}".format(g_acc_score))
print("Average accuracy of Gaussian is:       {:.2f}%".format(100*avg_g_acc_score))

Accuracy of Gaussian of each fold is:
[0.7134831460674157, 0.7640449438202247, 0.7740112994350282, 0.7796610169491526, 0.807909604519774]
Average accuracy of Gaussian is:       76.78%


In [12]:
avg_g_precision = sum(g_precision)/k
print("Precision of Gaussian of each fold is:\n{}".format(g_precision))
print("precision accuracy of Gaussian is:    {:.2f}% \n".format(100*avg_g_precision))

avg_g_recall = sum(g_recall)/k
print("Recall of Gaussian of each fold is:\n{}".format(g_recall))
print("Recall accuracy of Gaussian is:       {:.2f}% \n".format(100*avg_g_recall))

avg_g_f1 = sum(g_f1)/k
print("F1 of Gaussian of each fold is:\n{} ".format(g_f1))
print("F1 accuracy of Gaussian is:           {:.2f}% \n".format(100*avg_g_f1))

Precision of Gaussian of each fold is:
[0.5487804878048781, 0.7176470588235294, 0.6666666666666666, 0.7424242424242424, 0.7230769230769231]
precision accuracy of Gaussian is:    67.97% 

Recall of Gaussian of each fold is:
[0.7627118644067796, 0.7721518987341772, 0.8571428571428571, 0.6901408450704225, 0.746031746031746]
Recall accuracy of Gaussian is:       76.56% 

F1 of Gaussian of each fold is:
[0.6382978723404256, 0.7439024390243902, 0.75, 0.7153284671532847, 0.7343749999999999] 
F1 accuracy of Gaussian is:           71.64% 



### Q2. Building Decision tree, KNN, Naıve Bayes models with selected stock using all attributes to predict ‘daily returns’

#### loading data

In [13]:
df = pd.read_csv('./IBM.txt', delimiter = " ")
df_raw = df
print("Number of rows in original data: {}".format(len(df.index)))
print("Features: ", list(df.columns))


Number of rows in original data: 3692
Features:  ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted']


In [14]:
pd.options.mode.chained_assignment = None

df['Daily_returns'] = 100*((df['Close'] - df['Close'].shift())/ df['Close'].shift())
conditions = [(df['Daily_returns'] >= 0.0),(df['Daily_returns'] < 0.0)]
# 1 for UP. -1 for Down

values1 = [1, -1]
df['Decision'] = np.select(conditions, values1)
df['Decision(next_day)'] = df['Decision'].shift(-1)
print("Number of rows in processed data: {}".format(len(df.index)))

df_new = df[1:-2]
df_new['Decision(next_day)'] = df_new['Decision(next_day)'].astype('int32')
df_new.head(8)


Number of rows in processed data: 3692


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,97.25,98.790001,96.879997,98.309998,10524500,63.802544,1.06919,1,-1
2,2007-01-05,97.599998,97.949997,96.910004,97.419998,7221300,63.22493,-0.9053,-1,1
3,2007-01-08,98.5,99.5,98.349998,98.900002,10340000,64.185463,1.519199,1,1
4,2007-01-09,99.080002,100.330002,99.07,100.07,11108200,64.944771,1.183011,1,-1
5,2007-01-10,98.5,99.050003,97.93,98.889999,8744800,64.178978,-1.179176,-1,-1
6,2007-01-11,99.0,99.900002,98.5,98.650002,8000700,64.023201,-0.242691,-1,1
7,2007-01-12,98.989998,99.690002,98.5,99.339996,6636500,64.471024,0.699436,1,1
8,2007-01-16,99.400002,100.839996,99.300003,100.82,9602200,65.431503,1.489837,1,-1


In [15]:
pd.options.mode.chained_assignment = None

df_new.loc[df_new['Open'] < 100, 'Open'] = 100
df_new.loc[(df_new['Open'] > 100)&(df_new['Open'] < 120), 'Open'] = 120
df_new.loc[df_new['Open'] > 120, 'Open'] = 150

df_new.loc[df_new['High'] < 100, 'High'] = 100
df_new.loc[((df_new['High'] > 100)&(df_new['High'] < 120)), 'High'] = 120
df_new.loc[df_new['High'] > 120, 'High'] = 150

df_new.loc[df_new['Low'] < 100, 'Low'] = 100
df_new.loc[((df_new['Low'] > 100)&(df_new['Low'] < 120)), 'Low'] = 120
df_new.loc[df_new['Low'] > 120, 'Low'] = 150

df_new.loc[df_new['Adjusted'] < 100, 'Adjusted'] = 100
df_new.loc[((df_new['Adjusted'] > 100)&(df_new['Adjusted'] < 120)), 'Adjusted'] = 120
df_new.loc[df_new['Adjusted'] > 120, 'Adjusted'] = 150

df_new.loc[df_new['Close'] < 100, 'Close'] = 100
df_new.loc[((df_new['Close'] > 100)&(df_new['Close'] < 120)), 'Close'] = 120
df_new.loc[df_new['Close'] > 120, 'Close'] = 150

df_new.loc[df_new['Volume'] <= 3039600, 'Volume'] = 100
df_new.loc[((df_new['Volume'] > 3039600)&(df_new['Volume'] < 10000000)), 'Volume'] = 120
df_new.loc[df_new['Volume'] > 10000000, 'Volume'] = 150
df_new

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,100.0,100.0,100.0,100.0,150,100.0,1.069190,1,-1
2,2007-01-05,100.0,100.0,100.0,100.0,120,100.0,-0.905300,-1,1
3,2007-01-08,100.0,100.0,100.0,100.0,150,100.0,1.519199,1,1
4,2007-01-09,100.0,120.0,100.0,120.0,150,100.0,1.183011,1,-1
5,2007-01-10,100.0,100.0,100.0,100.0,120,100.0,-1.179176,-1,-1
...,...,...,...,...,...,...,...,...,...,...
3685,2021-08-23,150.0,150.0,150.0,150.0,100,150.0,0.366612,1,1
3686,2021-08-24,150.0,150.0,150.0,150.0,100,150.0,0.157571,1,1
3687,2021-08-25,150.0,150.0,150.0,150.0,100,150.0,0.014306,1,-1
3688,2021-08-26,150.0,150.0,150.0,150.0,100,150.0,-0.772202,-1,1


#### Split the data. Last 100 rows as test

In [16]:
df_new_IBM = df_new.copy()
xd_IBM = df_new_IBM[[ "Open", "High", "Low", "Close","Volume", "Adjusted"]]
le = preprocessing.LabelEncoder()
decision = le.fit(df_new_IBM["Decision(next_day)"])
decision = le.transform(df_new_IBM["Decision(next_day)"])

xd_train_dt =  xd_IBM[:-102]
xd_test_dt  =  xd_IBM[-102:-2]

yd_train_dt =  decision[:-102]
yd_test_dt  =  decision[-102:-2]

print("No of training samples : {}".format(xd_train_dt.shape))
print("No of test samples     : {}\n".format(xd_test_dt.shape))
print("y training samples     : {}".format(yd_train_dt.shape))
print("y test samples         : {}\n".format(yd_test_dt.shape))

No of training samples : (3587, 6)
No of test samples     : (100, 6)

y training samples     : (3587,)
y test samples         : (100,)



In [27]:
import warnings
warnings.filterwarnings('ignore')

k = 10
dt_kfold_ibm = KFold(n_splits = k, random_state = None)
dt_model_ibm = DecisionTreeClassifier()
dt_acc_score_ibm = []
#print(x_train.shape)
dt_precision_ibm = []
dt_recall_ibm = []
dt_f1_ibm = []

for train_index_ibm, test_index_ibm in dt_kfold_ibm.split(xd_IBM):
    #print(train_index.shape, test_index.shape)
    dt_x_train_ibm, dt_x_test_ibm = xd_IBM.iloc[train_index_ibm,:], xd_IBM.iloc[test_index_ibm,:]
    dt_y_train_ibm, dt_y_test_ibm = decision[train_index_ibm], decision[test_index_ibm]
    
    dt_model_ibm.fit(dt_x_train_ibm, dt_y_train_ibm)
    dt_predict_ibm = dt_model_ibm.predict(dt_x_test_ibm)
    dt_acc_ibm = accuracy_score(dt_predict_ibm, dt_y_test_ibm)
    dt_acc_score_ibm.append(dt_acc_ibm)
    
    precision_tree_ibm = precision_score(dt_y_test_ibm, dt_predict_ibm)
    #print(precision_tree_ibm)
    dt_precision_ibm.append(precision_tree_ibm)
    
    recall_tree_ibm = recall_score(dt_y_test_ibm, dt_predict_ibm)
    dt_recall_ibm.append(recall_tree_ibm)
    
    f1_tree_ibm = f1_score(dt_y_test_ibm, dt_predict_ibm)
    dt_f1_ibm.append(f1_tree_ibm)

avg_dt_acc_score_ibm = sum(dt_acc_score_ibm)/k
print("Accuracy of IBM decision tree of each fold is:\n {}".format(dt_acc_score_ibm))
print("Average accuracy of IBM decision tree is:        {:.2f}%".format(100*avg_dt_acc_score_ibm))

Accuracy of IBM decision tree of each fold is:
 [0.5176151761517616, 0.46070460704607047, 0.46883468834688347, 0.5176151761517616, 0.46612466124661245, 0.48509485094850946, 0.5338753387533876, 0.5230352303523035, 0.5176151761517616, 0.5434782608695652]
Average accuracy of IBM decision tree is:        50.34%


In [28]:
avg_dt_precision_ibm = sum(dt_precision_ibm)/k
print("Precision of IBM decision tree of each fold is:\n{}".format(dt_precision_ibm))
print("precision accuracy of IBM decision tree is:    {:.2f}% \n".format(100*avg_dt_precision_ibm))

avg_dt_recall_ibm = sum(dt_recall_ibm)/k
print("Recall of IBM decision tree of each fold is:\n{}".format(dt_recall_ibm))
print("Recall accuracy of IBM decision tree is:       {:.2f}% \n".format(100*avg_dt_recall_ibm))

avg_dt_f1_ibm = sum(dt_f1_ibm)/k
print("F1 of IBM decision tree of each fold is:\n{} ".format(dt_f1_ibm))
print("F1 accuracy of IBM decision tree is:           {:.2f}% \n".format(100*avg_dt_f1_ibm))

Precision of IBM decision tree of each fold is:
[0.5363984674329502, 0.45493562231759654, 0.5202702702702703, 0.5625, 0.3620689655172414, 0.4835164835164835, 0.5486381322957199, 0.5681818181818182, 0.546875, 0.5679611650485437]
precision accuracy of IBM decision tree is:    51.51% 

Recall of IBM decision tree of each fold is:
[0.7106598984771574, 0.5955056179775281, 0.3811881188118812, 0.28421052631578947, 0.11602209944751381, 0.9887640449438202, 0.7157360406091371, 0.26595744680851063, 0.5357142857142857, 0.5969387755102041]
Recall accuracy of IBM decision tree is:       51.91% 

F1 of IBM decision tree of each fold is:
[0.611353711790393, 0.5158150851581509, 0.44, 0.3776223776223776, 0.17573221757322177, 0.6494464944649446, 0.6211453744493393, 0.36231884057971014, 0.5412371134020619, 0.5820895522388059] 
F1 accuracy of IBM decision tree is:           48.77% 



#### Finding accuracy of K-fold, precision, recall and f1-score of KNN classifier

In [30]:
k = 10
knn_kfold_ibm = KFold(n_splits = k, random_state = None)
knn_model_ibm = KNeighborsClassifier()
knn_acc_score_ibm = []
#print(x_train.shape)
knn_precision_ibm = []
knn_recall_ibm = []
knn_f1_ibm = []

for train_index_ibm, test_index_ibm in knn_kfold_ibm.split(xd_IBM):
    #print(train_index.shape, test_index.shape)
    knn_x_train_ibm, knn_x_test_ibm = xd_IBM.iloc[train_index_ibm,:], xd_IBM.iloc[test_index_ibm,:]
    knn_y_train_ibm, knn_y_test_ibm = decision[train_index_ibm], decision[test_index_ibm]
    
    knn_model_ibm.fit(knn_x_train_ibm, knn_y_train_ibm)
    knn_predict_ibm = knn_model_ibm.predict(knn_x_test_ibm)
    knn_acc_ibm = accuracy_score(knn_predict_ibm, knn_y_test_ibm)
    knn_acc_score_ibm.append(knn_acc_ibm)
    
    precision_tree_ibm_knn = precision_score(knn_y_test_ibm, knn_predict_ibm)
    #print(precision_tree_ibm)
    knn_precision_ibm.append(precision_tree_ibm_knn)
    
    recall_tree_ibm_knn = recall_score(knn_y_test_ibm, knn_predict_ibm)
    knn_recall_ibm.append(recall_tree_ibm_knn)
    
    f1_tree_ibm_knn = f1_score(knn_y_test_ibm, knn_predict_ibm)
    knn_f1_ibm.append(f1_tree_ibm_knn)

avg_knn_acc_score_ibm = sum(knn_acc_score_ibm)/k
print("Accuracy of IBM KNN of each fold is:\n {}".format(knn_acc_score_ibm))
print("Average accuracy of IBM KNN is:        {:.2f}%".format(100*avg_knn_acc_score_ibm))

Accuracy of IBM KNN of each fold is:
 [0.5203252032520326, 0.4634146341463415, 0.46883468834688347, 0.5176151761517616, 0.5094850948509485, 0.5338753387533876, 0.5013550135501355, 0.5040650406504065, 0.4905149051490515, 0.5543478260869565]
Average accuracy of IBM KNN is:        50.64%


In [32]:
avg_knn_precision_ibm = sum(knn_precision_ibm)/k
print("Precision of IBM KNN of each fold is:\n{}".format(knn_precision_ibm))
print("precision accuracy of IBM KNN is:    {:.2f}% \n".format(100*avg_knn_precision_ibm))

avg_knn_recall_ibm = sum(knn_recall_ibm)/k
print("Recall of IBM KNN of each fold is:\n{}".format(knn_recall_ibm))
print("Recall accuracy of IBM KNN is:       {:.2f}% \n".format(100*avg_knn_recall_ibm))

avg_knn_f1_ibm = sum(knn_f1_ibm)/k
print("F1 of IBM KNN of each fold is:\n{} ".format(knn_f1_ibm))
print("F1 accuracy of IBM KNN is:           {:.2f}% \n".format(100*avg_knn_f1_ibm))

Precision of IBM KNN of each fold is:
[0.5485436893203883, 0.4618320610687023, 0.52, 0.5625, 0.0, 0.5340909090909091, 0.5414012738853503, 0.5581395348837209, 0.5487804878048781, 0.5747663551401869]
precision accuracy of IBM KNN is:    48.50% 

Recall of IBM KNN of each fold is:
[0.5736040609137056, 0.6797752808988764, 0.38613861386138615, 0.28421052631578947, 0.0, 0.2640449438202247, 0.43147208121827413, 0.1276595744680851, 0.22959183673469388, 0.6275510204081632]
Recall accuracy of IBM KNN is:       36.04% 

F1 of IBM KNN of each fold is:
[0.5607940446650124, 0.55, 0.4431818181818182, 0.3776223776223776, 0.0, 0.3533834586466165, 0.480225988700565, 0.20779220779220778, 0.3237410071942446, 0.6000000000000001] 
F1 accuracy of IBM KNN is:           38.97% 



#### Finding accuracy of K-fold, precision, recall and f1-score of Naive bays classifier

In [33]:
k = 10
g_kfold_ibm = KFold(n_splits = k, random_state = None)
g_model_ibm = GaussianNB()
g_acc_score_ibm = []
#print(x_train.shape)
g_precision_ibm = []
g_recall_ibm = []
g_f1_ibm = []

for train_index_ibm, test_index_ibm in g_kfold_ibm.split(xd_IBM):
    #print(train_index.shape, test_index.shape)
    g_x_train_ibm, g_x_test_ibm = xd_IBM.iloc[train_index_ibm,:], xd_IBM.iloc[test_index_ibm,:]
    g_y_train_ibm, g_y_test_ibm = decision[train_index_ibm], decision[test_index_ibm]
    
    g_model_ibm.fit(g_x_train_ibm, g_y_train_ibm)
    g_predict_ibm = g_model_ibm.predict(g_x_test_ibm)
    g_acc_ibm = accuracy_score(g_predict_ibm, g_y_test_ibm)
    g_acc_score_ibm.append(g_acc_ibm)
    
    precision_tree_ibmg = precision_score(g_y_test_ibm, g_predict_ibm)
    #print(precision_tree_ibm)
    g_precision_ibm.append(precision_tree_ibmg)
    
    recall_tree_ibm_g = recall_score(g_y_test_ibm, g_predict_ibm)
    g_recall_ibm.append(recall_tree_ibm_g)
    
    f1_tree_ibm_g = f1_score(g_y_test_ibm, g_predict_ibm)
    g_f1_ibm.append(f1_tree_ibm_g)

avg_g_acc_score_ibm = sum(g_acc_score_ibm)/k
print("Accuracy of IBM Gaussian classifier of each fold is:\n {}".format(g_acc_score_ibm))
print("Average accuracy of IBM Gaussian classifier is:        {:.2f}%".format(100*avg_g_acc_score_ibm))

Accuracy of IBM Gaussian classifier of each fold is:
 [0.5257452574525745, 0.4823848238482385, 0.5447154471544715, 0.5176151761517616, 0.46612466124661245, 0.48509485094850946, 0.4905149051490515, 0.5040650406504065, 0.5149051490514905, 0.5271739130434783]
Average accuracy of IBM Gaussian classifier is:        50.58%


In [34]:
avg_g_precision_ibm = sum(g_precision_ibm)/k
print("Precision of IBM Gaussian classifier of each fold is:\n{}".format(g_precision_ibm))
print("precision accuracy of IBM Gaussian classifier is:    {:.2f}% \n".format(100*avg_g_precision_ibm))

avg_g_recall_ibm = sum(g_recall_ibm)/k
print("Recall of IBM Gaussian classifier of each fold is:\n{}".format(g_recall_ibm))
print("Recall accuracy of IBM Gaussian classifier is:       {:.2f}% \n".format(100*avg_g_recall_ibm))

avg_g_f1_ibm = sum(g_f1_ibm)/k
print("F1 of IBM Gaussian classifier of each fold is:\n{} ".format(g_f1_ibm))
print("F1 accuracy of IBM Gaussian classifier is:           {:.2f}% \n".format(100*avg_g_f1_ibm))

Precision of IBM Gaussian classifier of each fold is:
[0.5381944444444444, 0.481994459833795, 0.5494186046511628, 0.5625, 0.3620689655172414, 0.48342541436464087, 0.5294117647058824, 0.5581395348837209, 0.5858585858585859, 0.53125]
precision accuracy of IBM Gaussian classifier is:    51.82% 

Recall of IBM Gaussian classifier of each fold is:
[0.7868020304568528, 0.9775280898876404, 0.9356435643564357, 0.28421052631578947, 0.11602209944751381, 0.9831460674157303, 0.41116751269035534, 0.1276595744680851, 0.29591836734693877, 0.9540816326530612]
Recall accuracy of IBM Gaussian classifier is:       58.72% 

F1 of IBM Gaussian classifier of each fold is:
[0.6391752577319587, 0.6456400742115027, 0.6923076923076924, 0.3776223776223776, 0.17573221757322177, 0.6481481481481481, 0.4628571428571429, 0.20779220779220778, 0.39322033898305087, 0.6824817518248175] 
F1 accuracy of IBM Gaussian classifier is:           49.25% 

