## Week 7: Machine Learning & Data Mining

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import f1_score

#from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

### Q1. Taking the titanic dataset and using all attributes to predict the class ‘Survived’

In [2]:
# loading Titanic data

titanic = pd.read_csv('./titanic.csv')
print("Number of points in original data: {}".format(len(titanic.index)))

columns = titanic.columns
print("Features present in dataset: \n", list(columns))
titanic.head(5)

Number of points in original data: 887
Features present in dataset: 
 ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


#### Converting continuous attaribute into classes

In [3]:
conditions = [(titanic['Age'] < 25.0),(titanic['Age'] > 45.0), 
              (titanic['Age'] > 25.0) & (titanic['Age'] < 45.0)]

values = [1, 3, 2]
titanic['New_age'] = np.select(conditions, values)
#titanic
conditions = [(titanic['Fare'] < 15),(titanic['Fare'] > 50), 
              (titanic['Fare'] > 15) & (titanic['Fare'] < 50)]

values = [1, 3, 2]
titanic['New_Fare'] = np.select(conditions, values)

#titanic.loc[titanic['Siblings/Spouses Aboard'] == 1 , 'Siblings/Spouses Aboard'] = 'True'
#titanic

titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 1
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 0

#### Removing continuous attributes after creating new attribute of same column 

In [4]:
titanic.drop(columns=['Age','Fare'], axis = 1, inplace = True)
titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Siblings/Spouses Aboard,Parents/Children Aboard,New_age,New_Fare
0,0,3,Mr. Owen Harris Braund,1,1,0,1,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,0,1,0,2,3
2,1,3,Miss. Laina Heikkinen,0,0,0,2,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,1,0,2,3
4,0,3,Mr. William Henry Allen,1,0,0,2,1
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,1,0,0,2,1
883,1,1,Miss. Margaret Edith Graham,0,0,0,1,2
884,0,3,Miss. Catherine Helen Johnston,0,1,2,1,2
885,1,1,Mr. Karl Howell Behr,1,0,0,2,2


In [25]:
le = preprocessing.LabelEncoder()

pd.set_option('display.max_colwidth', None)
x = titanic[["Pclass","Sex","Siblings/Spouses Aboard","Parents/Children Aboard", "New_age","New_Fare"]]
y = le.fit(titanic["Survived"])
y = le.transform(titanic["Survived"])


# set the random state 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.35, random_state=0)

print("No of training samples: {}".format(x_train.shape))
print("No of test samples    : {}".format(x_test.shape))
print("y training samples    : {}".format(y_train.shape))
print("y test samples        : {}".format(y_test.shape))
x_train.head(5)

No of training samples: (576, 6)
No of test samples    : (311, 6)
y training samples    : (576,)
y test samples        : (311,)


Unnamed: 0,Pclass,Sex,Siblings/Spouses Aboard,Parents/Children Aboard,New_age,New_Fare
243,3,1,0,0,2,1
518,3,1,0,0,1,1
35,1,1,1,0,2,3
81,3,0,0,0,2,1
159,3,1,0,1,2,2


#### A. Choose Three classifiers and evaluate their performance using all attributes

In [32]:
clf_gaussian = GaussianNB()
clf_gaussian.fit(x_train, y_train)
gaussian_pred = clf_gaussian.predict(x_test)

nb_acc = accuracy_score(y_test, gaussian_pred)
print("Gaussian Accuracy : {:.2f}%".format(nb_acc*100))
print("Gaussian confusion matrix :\n{} ".format(confusion_matrix(y_test, gaussian_pred)))

Gaussian Accuracy : 79.74%
Gaussian confusion matrix :
[[157  33]
 [ 30  91]] 


In [33]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(x_train, y_train)
tree_pred = clf_tree.predict(x_test)

tree_acc = accuracy_score(y_test, tree_pred)
print("Decision tree Accuracy : {:.2f}%".format(tree_acc*100))
print("Decision tree confusion matrix :\n{} ".format(confusion_matrix(y_test, tree_pred)))

Decision tree Accuracy : 76.85%
Decision tree confusion matrix :
[[153  37]
 [ 35  86]] 


In [34]:
clf_knn = KNeighborsClassifier()
clf_knn.fit(x_train, y_train)
knn_pred = clf_knn.predict(x_test)

knn_acc = accuracy_score(y_test, knn_pred)
print("Decision tree Accuracy : {:.2f}%".format(knn_acc*100))
print("Decision tree confusion matrix :\n{} ".format(confusion_matrix(y_test, knn_pred)))

Decision tree Accuracy : 78.14%
Decision tree confusion matrix :
[[156  34]
 [ 34  87]] 


#### B. Define a feature selection method and use it on all the classifiers

In [42]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

best_features =  SelectKBest(score_func = chi2, k=5)
fit_features = best_features.fit(x_train, y_train)
train_features = fit_features.transform(x_train)
test_features = fit_features.transform(x_test)

clf_gaussian.fit(train_features,y_train)
y_pred_g = clf_gaussian.predict(test_features)

g_acc = accuracy_score(y_test, y_pred_g)

print("Gaussian selected features accuracy : {:.2f}%".format(g_acc*100))
print("Gaussian selected features confusion matrix :\n{} ".format(confusion_matrix(y_test, y_pred_g)))

Gaussian selected features accuracy : 79.74%
Gaussian selected features confusion matrix :
[[157  33]
 [ 30  91]] 


In [43]:
clf_tree.fit(train_features,y_train)
y_pred_tree = clf_tree.predict(test_features)

t_acc = accuracy_score(y_test, y_pred_tree)

print("Decision tree selected features accuracy : {:.2f}%".format(t_acc*100))
print("Decision tree selected features confusion matrix :\n{} ".format(confusion_matrix(y_test, y_pred_tree)))

Decision tree selected features accuracy : 77.49%
Decision tree selected features confusion matrix :
[[159  31]
 [ 39  82]] 


In [44]:
clf_knn.fit(train_features,y_train)
y_pred_knn = clf_knn.predict(test_features)

k_acc = accuracy_score(y_test, y_pred_knn)

print("Decision tree selected features accuracy : {:.2f}%".format(k_acc*100))
print("Decision tree selected features confusion matrix :\n{} ".format(confusion_matrix(y_test, y_pred_knn)))

Decision tree selected features accuracy : 75.56%
Decision tree selected features confusion matrix :
[[143  47]
 [ 29  92]] 


#### C. Compare the classifiers and explain the differences observed

Accuracy is reduced. Because accuracy is depend on which features selected, if good features are selected the accuracy is high otherwise accuracynis low.

### Q2. Build Decision tree model with your selected stock / market index using all attributes to predict ‘daily returns’

#### loading data

In [13]:
df = pd.read_csv('./IBM.txt', delimiter = " ")
df_raw = df
print("Number of rows in original data: {}".format(len(df.index)))
print("Features: ", list(df.columns))


Number of rows in original data: 3692
Features:  ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted']


In [14]:
pd.options.mode.chained_assignment = None

df['Daily_returns'] = 100*((df['Close'] - df['Close'].shift())/ df['Close'].shift())
conditions = [(df['Daily_returns'] >= 0.0),(df['Daily_returns'] < 0.0)]
# 1 for UP. -1 for Down

values1 = [1, -1]
df['Decision'] = np.select(conditions, values1)
df['Decision(next_day)'] = df['Decision'].shift(-1)
print("Number of rows in processed data: {}".format(len(df.index)))

df_new = df[1:-2]
df_new['Decision(next_day)'] = df_new['Decision(next_day)'].astype('int32')
df_new.head(8)


Number of rows in processed data: 3692


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,97.25,98.790001,96.879997,98.309998,10524500,63.802544,1.06919,1,-1
2,2007-01-05,97.599998,97.949997,96.910004,97.419998,7221300,63.22493,-0.9053,-1,1
3,2007-01-08,98.5,99.5,98.349998,98.900002,10340000,64.185463,1.519199,1,1
4,2007-01-09,99.080002,100.330002,99.07,100.07,11108200,64.944771,1.183011,1,-1
5,2007-01-10,98.5,99.050003,97.93,98.889999,8744800,64.178978,-1.179176,-1,-1
6,2007-01-11,99.0,99.900002,98.5,98.650002,8000700,64.023201,-0.242691,-1,1
7,2007-01-12,98.989998,99.690002,98.5,99.339996,6636500,64.471024,0.699436,1,1
8,2007-01-16,99.400002,100.839996,99.300003,100.82,9602200,65.431503,1.489837,1,-1


In [15]:
pd.options.mode.chained_assignment = None

df_new.loc[df_new['Open'] < 100, 'Open'] = 100
df_new.loc[(df_new['Open'] > 100)&(df_new['Open'] < 120), 'Open'] = 120
df_new.loc[df_new['Open'] > 120, 'Open'] = 150

df_new.loc[df_new['High'] < 100, 'High'] = 100
df_new.loc[((df_new['High'] > 100)&(df_new['High'] < 120)), 'High'] = 120
df_new.loc[df_new['High'] > 120, 'High'] = 150

df_new.loc[df_new['Low'] < 100, 'Low'] = 100
df_new.loc[((df_new['Low'] > 100)&(df_new['Low'] < 120)), 'Low'] = 120
df_new.loc[df_new['Low'] > 120, 'Low'] = 150

df_new.loc[df_new['Adjusted'] < 100, 'Adjusted'] = 100
df_new.loc[((df_new['Adjusted'] > 100)&(df_new['Adjusted'] < 120)), 'Adjusted'] = 120
df_new.loc[df_new['Adjusted'] > 120, 'Adjusted'] = 150

df_new.loc[df_new['Close'] < 100, 'Close'] = 100
df_new.loc[((df_new['Close'] > 100)&(df_new['Close'] < 120)), 'Close'] = 120
df_new.loc[df_new['Close'] > 120, 'Close'] = 150

df_new.loc[df_new['Volume'] <= 3039600, 'Volume'] = 100
df_new.loc[((df_new['Volume'] > 3039600)&(df_new['Volume'] < 10000000)), 'Volume'] = 120
df_new.loc[df_new['Volume'] > 10000000, 'Volume'] = 150
df_new

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,100.0,100.0,100.0,100.0,150,100.0,1.069190,1,-1
2,2007-01-05,100.0,100.0,100.0,100.0,120,100.0,-0.905300,-1,1
3,2007-01-08,100.0,100.0,100.0,100.0,150,100.0,1.519199,1,1
4,2007-01-09,100.0,120.0,100.0,120.0,150,100.0,1.183011,1,-1
5,2007-01-10,100.0,100.0,100.0,100.0,120,100.0,-1.179176,-1,-1
...,...,...,...,...,...,...,...,...,...,...
3685,2021-08-23,150.0,150.0,150.0,150.0,100,150.0,0.366612,1,1
3686,2021-08-24,150.0,150.0,150.0,150.0,100,150.0,0.157571,1,1
3687,2021-08-25,150.0,150.0,150.0,150.0,100,150.0,0.014306,1,-1
3688,2021-08-26,150.0,150.0,150.0,150.0,100,150.0,-0.772202,-1,1


#### Split the data. Last 100 rows as test

In [16]:
df_new_IBM = df_new.copy()
xd_IBM = df_new_IBM[[ "Open", "High", "Low", "Close","Volume", "Adjusted"]]
le = preprocessing.LabelEncoder()
decision = le.fit(df_new_IBM["Decision(next_day)"])
decision = le.transform(df_new_IBM["Decision(next_day)"])

xd_train_dt =  xd_IBM[:-102]
xd_test_dt  =  xd_IBM[-102:-2]

yd_train_dt =  decision[:-102]
yd_test_dt  =  decision[-102:-2]

print("No of training samples : {}".format(xd_train_dt.shape))
print("No of test samples     : {}\n".format(xd_test_dt.shape))
print("y training samples     : {}".format(yd_train_dt.shape))
print("y test samples         : {}\n".format(yd_test_dt.shape))

No of training samples : (3587, 6)
No of test samples     : (100, 6)

y training samples     : (3587,)
y test samples         : (100,)



#### A. Choose Three feature selection methods to evaluate the model

In [53]:
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

best_features =  SelectKBest(score_func = chi2, k=5)
fit_features = best_features.fit(xd_train_dt, yd_train_dt)
train_features_dt = fit_features.transform(xd_train_dt)
test_features_dt = fit_features.transform(xd_test_dt)

clf_tree_chi2 = DecisionTreeClassifier()
clf_tree_chi2.fit(train_features_dt,yd_train_dt)
y_pred_dt = clf_tree_chi2.predict(test_features_dt)

dt_acc = accuracy_score(yd_test_dt, y_pred_dt)

print("Decision tree selected features accuracy : {:.2f}%".format(dt_acc*100))
print("Decision tree selected features confusion matrix :\n{} ".format(confusion_matrix(yd_test_dt, y_pred_dt)))

Decision tree selected features accuracy : 44.00%
Decision tree selected features confusion matrix :
[[44  0]
 [56  0]] 


In [55]:
from sklearn.feature_selection import RFE

rfe =  RFE(clf_tree, 5)
fit_rfe = rfe.fit(xd_train_dt, yd_train_dt)
train_features_dt_rfe = fit_rfe.transform(xd_train_dt)
test_features_dt_rfe = fit_rfe.transform(xd_test_dt)

clf_tree_rfe = DecisionTreeClassifier()
clf_tree_rfe.fit(train_features_dt_rfe, yd_train_dt)
y_pred_dt_rfe = clf_tree_rfe.predict(test_features_dt_rfe)

dt_acc_rfe = accuracy_score(yd_test_dt, y_pred_dt_rfe)

print("feature ranking : {}".format(fit.ranking_))
print("Decision tree selected features accuracy : {:.2f}%".format(dt_acc_rfe * 100))
print("Decision tree selected features confusion matrix :\n{} ".format(confusion_matrix(yd_test_dt, y_pred_dt_rfe)))

feature ranking : [1 1 1 1 1 1]
Decision tree selected features accuracy : 44.00%
Decision tree selected features confusion matrix :
[[44  0]
 [56  0]] 


In [56]:
from sklearn.decomposition import PCA

pca =  PCA(n_components = 3)
fit_pca = pca.fit(xd_train_dt)
train_features_dt_pca = fit_pca.transform(xd_train_dt)
test_features_dt_pca = fit_pca.transform(xd_test_dt)

clf_tree_pca = DecisionTreeClassifier()
clf_tree_pca.fit(train_features_dt_pca, yd_train_dt)
y_pred_dt_pca = clf_tree_pca.predict(test_features_dt_pca)

dt_acc_pca = accuracy_score(yd_test_dt, y_pred_dt_pca)

#print("feature ranking : {}".format(fit.ranking_))
print("Decision tree selected features accuracy : {:.2f}%".format(dt_acc_pca * 100))
print("Decision tree selected features confusion matrix :\n{} ".format(confusion_matrix(yd_test_dt, y_pred_dt_pca)))

feature ranking : [1 1 1 1 1 1]
Decision tree selected features accuracy : 50.00%
Decision tree selected features confusion matrix :
[[36  8]
 [42 14]] 


#### Compare the feature selection methods and explain the differences observed

I observed that chi2 and RFE feature methods accuracy is same but PCA featue method accuracy is high as compared to othermethods