### Week5: Machine Learning & Data Mining

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression, chi2, mutual_info_classif 

In [129]:
# Show full cell contents (passenger names are long) instead of truncating them.
pd.set_option('display.max_colwidth', None)

# Load the Titanic passenger table from the notebook's working directory.
titanic = pd.read_csv('./titanic.csv')
print("Number of samples in original data: {}\n".format(len(titanic.index)))

columns = titanic.columns
print("Features present in dataset: \n", columns)

# Encode 'Sex' numerically (male -> 1, female -> 0) in a single vectorised step.
# Writing integers into the string column through .loc leaves it with dtype
# object; mapping produces a numeric column that downstream estimators can use
# without an implicit conversion. Any value other than 'male'/'female' maps to NaN.
titanic['Sex'] = titanic['Sex'].map({'male': 1, 'female': 0})
print(titanic.head(5))

Number of samples in original data: 887

Features present in dataset: 
 Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')
   Survived  Pclass                                                Name Sex  \
0         0       3                              Mr. Owen Harris Braund   1   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cumings   0   
2         1       3                               Miss. Laina Heikkinen   0   
3         1       1         Mrs. Jacques Heath (Lily May Peel) Futrelle   0   
4         0       3                             Mr. William Henry Allen   1   

    Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0  22.0                        1                        0   7.2500  
1  38.0                        1                        0  71.2833  
2  26.0                        0                        0   7.9250  
3  35.0                        1  

### Q1. Predict the class ‘Survived’ with a k-nearest neighbours classifier with 3 distance metrics

In [130]:
# Label encoder for the target and a 3-nearest-neighbour classifier that
# measures distance with the Manhattan (L1) metric.
le = preprocessing.LabelEncoder()

knnclassifier = KNeighborsClassifier(n_neighbors=3, metric='manhattan')

# Predictor columns used to classify 'Survived'.
# NOTE(review): the features are on very different scales (e.g. Fare vs Pclass)
# and are not standardised before the distance computation - confirm intended.
xd = titanic[["Pclass", "Sex", "Age", "Siblings/Spouses Aboard", "Parents/Children Aboard", "Fare"]]

# Fit the encoder and encode the target in one call, so `yd` only ever holds
# the encoded label array (never the encoder object itself).
yd = le.fit_transform(titanic["Survived"])

# Positional hold-out split: everything except the final 102 rows is used for
# training; the 100 rows before the final two are used for testing.
# NOTE(review): the last two rows end up in neither set - confirm intended.
xd_train = xd[:-102]
xd_test = xd[-102:-2]

yd_train = yd[:-102]
yd_test = yd[-102:-2]

print("No of training samples: {}".format(xd_train.shape))
print("No of test samples: {}".format(xd_test.shape))
print("y training samples: {}".format(yd_train.shape))
print("y test samples: {}".format(yd_test.shape))

No of training samples: (785, 6)
No of test samples: (100, 6)
y training samples: (785,)
y test samples: (100,)


In [131]:
# Train the Manhattan-distance classifier on the training rows and predict the
# held-out rows in one chained call (fit returns the estimator itself).
y_pred_m = knnclassifier.fit(xd_train, yd_train).predict(xd_test)

# Report the test accuracy followed by the confusion matrix
# (rows: true class, columns: predicted class).
acc_manhattan = accuracy_score(yd_test, y_pred_m)
print(acc_manhattan)
print(confusion_matrix(yd_test, y_pred_m))

0.73
[[51 14]
 [13 22]]


In [132]:
# Same 3-neighbour setup as before, but with the Euclidean (L2) metric.
knnclassifier_e = KNeighborsClassifier(metric='euclidean', n_neighbors=3)
knnclassifier_e = knnclassifier_e.fit(xd_train, yd_train)

# Predict the held-out rows, then print accuracy and the confusion matrix.
y_pred_e = knnclassifier_e.predict(xd_test)
acc_euclidien = accuracy_score(yd_test, y_pred_e)

print(acc_euclidien)
print(confusion_matrix(yd_test, y_pred_e))


0.75
[[52 13]
 [12 23]]


In [133]:

# Third variant: 3 nearest neighbours under the cosine distance.
knnclassifier_c = KNeighborsClassifier(
    n_neighbors=3,
    metric='cosine',
).fit(xd_train, yd_train)

# Evaluate on the held-out rows: accuracy first, confusion matrix second.
y_pred_c = knnclassifier_c.predict(xd_test)
acc_cosine = accuracy_score(yd_test, y_pred_c)
print(acc_cosine)
print(confusion_matrix(yd_test, y_pred_c))


0.77
[[56  9]
 [14 21]]


### Which distance do you think is the best distance measure, and why?

Comparing the accuracies obtained above (Manhattan 0.73, Euclidean 0.75, cosine 0.77), cosine is the best distance measure for this data because it gives the highest accuracy, 77%.

### Q2. Determine the number of attributes that gives the best prediction.

In [95]:
# Load the IBM daily price table (space-delimited text file).
df = pd.read_csv('./IBM.txt', delimiter=" ")

# Keep an independent snapshot of the data as loaded. A plain `df_raw = df`
# would only alias the same object, so columns added to `df` later would also
# appear in the "raw" frame.
df_raw = df.copy()

print("Number of rows in original data: {}".format(len(df.index)))
print("Features: ", df.columns)


Number of rows in original data: 3692
Features:  Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted'], dtype='object')


In [96]:
 df['Daily_returns'] = 100*((df['Close'] - df['Close'].shift())/ df['Close'].shift())
conditions = [(df['Daily_returns'] >= 0.0),(df['Daily_returns'] < 0.0)]
# 1 for UP. -1 for Down
values = [1, -1]
df['Decision'] = np.select(conditions, values)
#df['Decision(next_day)'] = df['Decision'].shift(-1)
print("Number of rows in processed data: {}".format(len(df.index)))

df_new = df[1:-2]
#df_new['Decision(next_day)'] = df_new['Decision(next_day)'].astype('int32')
df_new.head(8)


Number of rows in processed data: 3692


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision
1,2007-01-04,97.25,98.790001,96.879997,98.309998,10524500,63.802544,1.06919,1
2,2007-01-05,97.599998,97.949997,96.910004,97.419998,7221300,63.22493,-0.9053,-1
3,2007-01-08,98.5,99.5,98.349998,98.900002,10340000,64.185463,1.519199,1
4,2007-01-09,99.080002,100.330002,99.07,100.07,11108200,64.944771,1.183011,1
5,2007-01-10,98.5,99.050003,97.93,98.889999,8744800,64.178978,-1.179176,-1
6,2007-01-11,99.0,99.900002,98.5,98.650002,8000700,64.023201,-0.242691,-1
7,2007-01-12,98.989998,99.690002,98.5,99.339996,6636500,64.471024,0.699436,1
8,2007-01-16,99.400002,100.839996,99.300003,100.82,9602200,65.431503,1.489837,1


In [97]:
# Work on an independent copy of the labelled frame. Defining it here keeps
# the cell runnable on a fresh kernel (Restart & Run All).
df_new_IBM = df_new.copy()

# Candidate predictor columns for the UP/DOWN decision.
xd_IBM = df_new_IBM[["Open", "High", "Low", "Close", "Volume", "Adjusted"]]
print(xd_IBM.head(5))

# Encode the -1/1 decision labels as consecutive integers (LabelEncoder sorts
# the classes, so -1 -> 0 and 1 -> 1). fit_transform fits and encodes in one
# call, so `decision` only ever holds the encoded label array.
le = preprocessing.LabelEncoder()
decision = le.fit_transform(df_new_IBM["Decision"])
print(decision.shape)


# Positional hold-out split: all but the final 102 rows for training, the 100
# rows before the final two for testing.
# NOTE(review): the last two rows end up in neither set - confirm intended.
xd_train_knn = xd_IBM[:-102]
xd_test_knn = xd_IBM[-102:-2]

yd_train_knn = decision[:-102]
yd_test_knn = decision[-102:-2]

print("No of training samples : {}".format(xd_train_knn.shape))
print("No of test samples     : {}\n".format(xd_test_knn.shape))
print("y training samples : {}".format(yd_train_knn.shape))
print("y test samples     : {}\n".format(yd_test_knn.shape))

        Open        High        Low       Close    Volume   Adjusted
1  97.250000   98.790001  96.879997   98.309998  10524500  63.802544
2  97.599998   97.949997  96.910004   97.419998   7221300  63.224930
3  98.500000   99.500000  98.349998   98.900002  10340000  64.185463
4  99.080002  100.330002  99.070000  100.070000  11108200  64.944771
5  98.500000   99.050003  97.930000   98.889999   8744800  64.178978
(3689,)
No of training samples : (3587, 6)
No of test samples     : (100, 6)

y training samples : (3587,)
y test samples     : (100,)



In [134]:
# Sweep the odd neighbourhood sizes 1, 3, ..., 13 with the cosine metric and
# report the test accuracy and confusion matrix for each size.
for k in range(1, 15, 2):
    knnclassifier_IBM = KNeighborsClassifier(
        metric='cosine',
        n_neighbors=k,
    ).fit(xd_train_knn, yd_train_knn)

    y_pred_knn = knnclassifier_IBM.predict(xd_test_knn)
    acc_knn = accuracy_score(yd_test_knn, y_pred_knn)

    print("------------")
    print("Accuracy of {} neighbors is: {}".format(k, acc_knn))
    print("confusion_matrix \n", confusion_matrix(yd_test_knn, y_pred_knn))

------------
Accuracy of 1 neighbors is: 0.5
confusion_matrix 
 [[ 8 36]
 [14 42]]
------------
Accuracy of 3 neighbors is: 0.48
confusion_matrix 
 [[ 8 36]
 [16 40]]
------------
Accuracy of 5 neighbors is: 0.5
confusion_matrix 
 [[11 33]
 [17 39]]
------------
Accuracy of 7 neighbors is: 0.5
confusion_matrix 
 [[ 7 37]
 [13 43]]
------------
Accuracy of 9 neighbors is: 0.52
confusion_matrix 
 [[ 9 35]
 [13 43]]
------------
Accuracy of 11 neighbors is: 0.51
confusion_matrix 
 [[ 6 38]
 [11 45]]
------------
Accuracy of 13 neighbors is: 0.46
confusion_matrix 
 [[ 5 39]
 [15 41]]


#### Using *f_classif* feature selection, the feature "Volume" receives the highest score

In [139]:
# Score every predictor against the training labels with the ANOVA F-test
# (k=6 keeps all six columns, so this is used purely for ranking).
selector = SelectKBest(score_func=f_classif, k=6)
features = selector.fit(xd_train_knn, yd_train_knn)

# Pair each column name with its score and print them from highest to lowest.
feature_scores = pd.DataFrame(
    {"Feature": xd_train_knn.columns, "Score": features.scores_}
)
print(feature_scores.nlargest(6, "Score"))

    Feature     Score
4    Volume  4.265510
0      Open  1.504121
5  Adjusted  0.633138
3     Close  0.394569
1      High  0.133339
2       Low  0.063128


In [136]:
# Rank the six predictors with the univariate regression F-test.
# NOTE(review): f_regression targets continuous responses; the labels here are
# the encoded UP/DOWN classes - confirm this scorer adds information beyond
# f_classif for this target.
features = SelectKBest(score_func=f_regression, k=6)
features = features.fit(xd_train_knn, yd_train_knn)

# Tabulate (feature name, score) pairs and print them in descending order.
score_rows = list(zip(xd_train_knn.columns, features.scores_))
feature_scores = pd.DataFrame(score_rows, columns=["Feature", "Score"])
print(feature_scores.nlargest(6, "Score"))

    Feature     Score
4    Volume  4.265510
0      Open  1.504121
5  Adjusted  0.633138
3     Close  0.394569
1      High  0.133339
2       Low  0.063128


In [137]:
# Rank the six predictors with the chi-squared statistic.
# NOTE(review): chi2 is documented for non-negative features such as counts or
# frequencies; on raw price/volume columns the score depends on the feature's
# scale - confirm this comparison is intended.
chi2_selector = SelectKBest(chi2, k=6)
features = chi2_selector.fit(xd_train_knn, yd_train_knn)

# Build the (Feature, Score) table column by column and print it sorted by score.
feature_scores = (
    pd.Series(xd_train_knn.columns, name="Feature")
    .to_frame()
    .assign(Score=features.scores_)
)
print(feature_scores.nlargest(6, "Score"))

    Feature         Score
4    Volume  7.994296e+06
0      Open  9.952064e+00
5  Adjusted  3.605536e+00
3     Close  2.608075e+00
1      High  8.737601e-01
2       Low  4.220623e-01


In [138]:
# mutual_info_classif adds small random noise to continuous features, so its
# scores change from run to run unless the seed is fixed. Pin random_state so
# the ranking is reproducible under Restart & Run All.
MI_RANDOM_STATE = 0


def _seeded_mutual_info(X, y):
    """Return mutual-information scores for X against y with a fixed seed."""
    return mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)


# Score every predictor (k=6 keeps all six columns; used purely for ranking).
features = SelectKBest(_seeded_mutual_info, k=6).fit(xd_train_knn, yd_train_knn)

# Pair each column name with its score and print them from highest to lowest.
feature_scores = pd.DataFrame(
    {"Feature": xd_train_knn.columns, "Score": features.scores_}
)
print(feature_scores.nlargest(6, "Score"))

    Feature     Score
3     Close  0.008422
0      Open  0.000000
1      High  0.000000
2       Low  0.000000
4    Volume  0.000000
5  Adjusted  0.000000
