## Week 5: Machine Learning & Data Mining

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression, chi2, mutual_info_classif 

In [5]:
# Show long text columns (e.g. passenger names) in full instead of truncating them.
pd.set_option('display.max_colwidth', None)

titanic = pd.read_csv('./titanic.csv')
print("Number of points in original data: {}".format(len(titanic.index)))

columns = titanic.columns
print("Features present in dataset: \n", list(columns))

# Encode Sex numerically: male -> 1, female -> 0.
# Mapping the whole column and casting yields a real integer dtype; writing ints
# into the existing string column through .loc leaves it as dtype=object.
sex_codes = {'male': 1, 'female': 0}
unexpected_sex = set(titanic['Sex'].unique()) - set(sex_codes)
assert not unexpected_sex, "Unexpected values in 'Sex': {}".format(unexpected_sex)
titanic['Sex'] = titanic['Sex'].map(sex_codes).astype('int64')

titanic.head(5)

Number of points in original data: 887
Features present in dataset: 
 ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,1,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,0,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,0,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,1,35.0,0,0,8.05


### Q1. Predict the class ‘Survived’ with a k-nearest neighbours classifier with 3 distance metrics and  k=3

#### Loading data

In [17]:
le = preprocessing.LabelEncoder()

# Feature matrix: all columns except the passenger name and the target.
feature_columns = ["Pclass", "Sex", "Age", "Siblings/Spouses Aboard", "Parents/Children Aboard", "Fare"]
xd = titanic[feature_columns]

# Encode the target in one step. The previous fit() + transform() pair first
# bound the encoder object itself to `yd` before overwriting it.
yd = le.fit_transform(titanic["Survived"])

# Hold out 25% of the rows for testing; random_state is fixed so the split is
# the same on every run.
xd_train, xd_test, yd_train, yd_test = train_test_split(xd, yd, test_size=0.25, random_state=0)

print("No of training samples: {}".format(xd_train.shape))
print("No of test samples    : {}".format(xd_test.shape))
print("y training samples    : {}".format(yd_train.shape))
print("y test samples        : {}".format(yd_test.shape))

No of training samples: (665, 6)
No of test samples    : (222, 6)
y training samples    : (665,)
y test samples        : (222,)


#### Train kNN with Manhattan distance

In [95]:
# k=3 nearest-neighbour classifier using the Manhattan distance.
# fit() returns the estimator itself, so construction and training are chained.
knnclassifier = KNeighborsClassifier(metric='manhattan', n_neighbors=3).fit(xd_train.values, yd_train)

# Predict the held-out test split and score the predictions.
y_pred_m = knnclassifier.predict(xd_test.values)
acc_manhattan = accuracy_score(yd_test, y_pred_m)
cm_manhattan = confusion_matrix(yd_test, y_pred_m)

print("Accuracy on test set: {:.2f}".format(100*acc_manhattan))
print("Confusion Matrix:" , cm_manhattan)


Accuracy on test set: 76.58
Confusion Matrix: [[121  21]
 [ 31  49]]


#### Train kNN with Euclidean distance

In [94]:
# Same k=3 classifier, this time with the Euclidean metric.
knnclassifier_e = KNeighborsClassifier(metric='euclidean', n_neighbors=3)
knnclassifier_e.fit(xd_train.values, yd_train)

# Evaluate on the held-out test split.
y_pred_e = knnclassifier_e.predict(xd_test.values)
acc_euclidien = accuracy_score(yd_test, y_pred_e)

accuracy_line_e = "Accuracy on test set: {:.2f}".format(100*acc_euclidien)
print(accuracy_line_e)
print("Confusion Matrix:\n" ,confusion_matrix(yd_test,y_pred_e))


Accuracy on test set: 69.82
Confusion Matrix:
 [[115  27]
 [ 40  40]]


#### Train kNN with Cosine distance

In [89]:

# k=3 classifier with the cosine metric.
knn_params_c = {"n_neighbors": 3, "metric": 'cosine'}
knnclassifier_c = KNeighborsClassifier(**knn_params_c)
knnclassifier_c.fit(xd_train.values, yd_train)

# Evaluate on the held-out test split.
y_pred_c = knnclassifier_c.predict(xd_test.values)
acc_cosine = accuracy_score(yd_test, y_pred_c)
cm_cosine = confusion_matrix(yd_test, y_pred_c)

print("Accuracy on test set: {:.2f}".format(100*acc_cosine))
print("Confusion Matrix:\n" ,cm_cosine)


Accuracy on test set: 77.03
Confusion Matrix:
 [[120  22]
 [ 29  51]]


#### Which distance do you think is the best distance measure, and why?


#### Answer.
Comparing the test accuracies above (Manhattan 76.58%, Euclidean 69.82%, cosine 77.03%), we observe that cosine distance is the best distance measure here, with an accuracy of about 77%.

#### Why? 
#### Answer.   
Because the features are not normalized, absolute-distance measures are not ideal: the features have very different ranges of values (for example, Age lies roughly in 0-80 while Pclass lies in 1-3), so the large-valued features dominate the distance. Cosine distance depends only on the angle between the feature vectors, which makes it better suited than absolute distances to the current unnormalised features.

### Q2. Determine the number of attributes that gives the best prediction.

#### Loading data

In [23]:
# IBM.txt is parsed as a space-delimited table.
df = pd.read_csv('./IBM.txt', delimiter=" ")

# Keep an independent snapshot of the data as loaded. A plain `df_raw = df`
# would only alias the same object, so columns added to `df` later would show
# up in `df_raw` as well.
df_raw = df.copy()

print("Number of rows in original data: {}".format(len(df.index)))
print("Features: ", list(df.columns))


Number of rows in original data: 3692
Features:  ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted']


#### Calculate the decision for the NEXT day

In [97]:
 df['Daily_returns'] = 100*((df['Close'] - df['Close'].shift())/ df['Close'].shift())
conditions = [(df['Daily_returns'] >= 0.0),(df['Daily_returns'] < 0.0)]
# 1 for UP. -1 for Down

values = [1, -1]
df['Decision'] = np.select(conditions, values)
df['Decision(next_day)'] = df['Decision'].shift(-1)
print("Number of rows in processed data: {}".format(len(df.index)))

df_new = df[1:-2]
df_new['Decision(next_day)'] = df_new['Decision(next_day)'].astype('int32')
df_new.head(8)


Number of rows in processed data: 3692


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,97.25,98.790001,96.879997,98.309998,10524500,63.802544,1.06919,1,-1
2,2007-01-05,97.599998,97.949997,96.910004,97.419998,7221300,63.22493,-0.9053,-1,1
3,2007-01-08,98.5,99.5,98.349998,98.900002,10340000,64.185463,1.519199,1,1
4,2007-01-09,99.080002,100.330002,99.07,100.07,11108200,64.944771,1.183011,1,-1
5,2007-01-10,98.5,99.050003,97.93,98.889999,8744800,64.178978,-1.179176,-1,-1
6,2007-01-11,99.0,99.900002,98.5,98.650002,8000700,64.023201,-0.242691,-1,1
7,2007-01-12,98.989998,99.690002,98.5,99.339996,6636500,64.471024,0.699436,1,1
8,2007-01-16,99.400002,100.839996,99.300003,100.82,9602200,65.431503,1.489837,1,-1


#### Split the data. Last 100 rows as test

In [26]:
df_new_IBM = df_new.copy()
xd_IBM = df_new_IBM[["Open", "High", "Low", "Close", "Volume", "Adjusted"]]

# Encode the next-day decision as class indices in one step. The previous
# fit() + transform() pair first bound the encoder object itself to `decision`.
le = preprocessing.LabelEncoder()
decision = le.fit_transform(df_new_IBM["Decision(next_day)"])

# Positional split in row order: 100 rows for testing, everything before them
# for training.
# NOTE(review): both slices stop 2 rows before the end, so the final two rows of
# df_new_IBM are in neither split even though the heading says "last 100 rows".
# Kept as-is so the recorded sample counts and accuracies stay valid - confirm intent.
xd_train_knn = xd_IBM[:-102]
xd_test_knn = xd_IBM[-102:-2]

yd_train_knn = decision[:-102]
yd_test_knn = decision[-102:-2]

print("No of training samples : {}".format(xd_train_knn.shape))
print("No of test samples     : {}\n".format(xd_test_knn.shape))
print("y training samples     : {}".format(yd_train_knn.shape))
print("y test samples         : {}\n".format(yd_test_knn.shape))

No of training samples : (3587, 6)
No of test samples     : (100, 6)

y training samples     : (3587,)
y test samples         : (100,)



### We used kNN classifier with k=5, 10, 15, 20, 25 and 30  and selected ALL POSSIBLE COMBINATIONS of 6 attributes present in the data

### Best configuration: k=30, attributes = [Open, High, Low], test accuracy: 63%. More analysis is in the cell outputs below.

In [84]:
from itertools import chain, combinations

feat_names = np.array(["Open", "High", "Low", "Close", "Volume", "Adjusted"])

# Every non-empty subset of column positions, ordered by subset size.
# Starting r at 1 skips the empty subset directly, and the positions are derived
# from feat_names rather than a hard-coded 6.
s = list(range(len(feat_names)))
all_combinations = list(
    chain.from_iterable(combinations(s, r) for r in range(1, len(s) + 1))
)

# Best configuration seen so far. All three names are initialised up front so
# they exist even if no run scores above 0.0.
# NOTE(review): the winner is chosen by accuracy on the test split itself, so the
# reported best accuracy is an optimistic estimate.
best_acc = 0.0
best_attributes = None
best_k = None

for k in range(5, 35, 5):

    print("-----------------------------------------------------------------------------------------------")
    print("Evaluating k={} NN classifier\n".format(k))

    for attributes in all_combinations:
        # Tuple of positions -> list, usable for both .iloc and numpy indexing.
        attributes = list(attributes)

        # NOTE(review): with a single positive-valued attribute, the cosine
        # distance between any two samples is 0, so the neighbours carry no
        # information. This is consistent with the identical 56.00% recorded
        # for every 1-attribute run.
        knnclassifier_IBM = KNeighborsClassifier(n_neighbors=k, metric='cosine')

        xd_train_knn_i = xd_train_knn.iloc[:, attributes]
        xd_test_knn_i = xd_test_knn.iloc[:, attributes]

        knnclassifier_IBM.fit(xd_train_knn_i.values, yd_train_knn)
        y_pred_knn = knnclassifier_IBM.predict(xd_test_knn_i.values)
        acc_knn = accuracy_score(yd_test_knn, y_pred_knn)
        print("k={} neighbors, selecting {}/6 attributes: {:40s} Accuracy: {:.2f}%"
              .format(k, len(attributes), ",".join(feat_names[attributes]), 100*acc_knn))

        # Strict comparison: on ties the earliest configuration is kept.
        if best_acc < acc_knn:
            best_attributes = feat_names[attributes]
            best_acc = acc_knn
            best_k = k



-----------------------------------------------------------------------------------------------
Evaluating k=5 NN classifier

k=5 neighbors, selecting 1/6 attributes: Open                                     Accuracy: 56.00%
k=5 neighbors, selecting 1/6 attributes: High                                     Accuracy: 56.00%
k=5 neighbors, selecting 1/6 attributes: Low                                      Accuracy: 56.00%
k=5 neighbors, selecting 1/6 attributes: Close                                    Accuracy: 56.00%
k=5 neighbors, selecting 1/6 attributes: Volume                                   Accuracy: 56.00%
k=5 neighbors, selecting 1/6 attributes: Adjusted                                 Accuracy: 56.00%
k=5 neighbors, selecting 2/6 attributes: Open,High                                Accuracy: 52.00%
k=5 neighbors, selecting 2/6 attributes: Open,Low                                 Accuracy: 47.00%
k=5 neighbors, selecting 2/6 attributes: Open,Close                               

k=10 neighbors, selecting 3/6 attributes: Open,Close,Volume                        Accuracy: 45.00%
k=10 neighbors, selecting 3/6 attributes: Open,Close,Adjusted                      Accuracy: 50.00%
k=10 neighbors, selecting 3/6 attributes: Open,Volume,Adjusted                     Accuracy: 46.00%
k=10 neighbors, selecting 3/6 attributes: High,Low,Close                           Accuracy: 59.00%
k=10 neighbors, selecting 3/6 attributes: High,Low,Volume                          Accuracy: 41.00%
k=10 neighbors, selecting 3/6 attributes: High,Low,Adjusted                        Accuracy: 51.00%
k=10 neighbors, selecting 3/6 attributes: High,Close,Volume                        Accuracy: 50.00%
k=10 neighbors, selecting 3/6 attributes: High,Close,Adjusted                      Accuracy: 49.00%
k=10 neighbors, selecting 3/6 attributes: High,Volume,Adjusted                     Accuracy: 47.00%
k=10 neighbors, selecting 3/6 attributes: Low,Close,Volume                         Accuracy: 52.00%


k=15 neighbors, selecting 4/6 attributes: Open,Close,Volume,Adjusted               Accuracy: 53.00%
k=15 neighbors, selecting 4/6 attributes: High,Low,Close,Volume                    Accuracy: 49.00%
k=15 neighbors, selecting 4/6 attributes: High,Low,Close,Adjusted                  Accuracy: 53.00%
k=15 neighbors, selecting 4/6 attributes: High,Low,Volume,Adjusted                 Accuracy: 55.00%
k=15 neighbors, selecting 4/6 attributes: High,Close,Volume,Adjusted               Accuracy: 55.00%
k=15 neighbors, selecting 4/6 attributes: Low,Close,Volume,Adjusted                Accuracy: 52.00%
k=15 neighbors, selecting 5/6 attributes: Open,High,Low,Close,Volume               Accuracy: 46.00%
k=15 neighbors, selecting 5/6 attributes: Open,High,Low,Close,Adjusted             Accuracy: 55.00%
k=15 neighbors, selecting 5/6 attributes: Open,High,Low,Volume,Adjusted            Accuracy: 56.00%
k=15 neighbors, selecting 5/6 attributes: Open,High,Close,Volume,Adjusted          Accuracy: 61.00%


k=25 neighbors, selecting 2/6 attributes: High,Adjusted                            Accuracy: 45.00%
k=25 neighbors, selecting 2/6 attributes: Low,Close                                Accuracy: 58.00%
k=25 neighbors, selecting 2/6 attributes: Low,Volume                               Accuracy: 47.00%
k=25 neighbors, selecting 2/6 attributes: Low,Adjusted                             Accuracy: 51.00%
k=25 neighbors, selecting 2/6 attributes: Close,Volume                             Accuracy: 51.00%
k=25 neighbors, selecting 2/6 attributes: Close,Adjusted                           Accuracy: 52.00%
k=25 neighbors, selecting 2/6 attributes: Volume,Adjusted                          Accuracy: 44.00%
k=25 neighbors, selecting 3/6 attributes: Open,High,Low                            Accuracy: 61.00%
k=25 neighbors, selecting 3/6 attributes: Open,High,Close                          Accuracy: 56.00%
k=25 neighbors, selecting 3/6 attributes: Open,High,Volume                         Accuracy: 48.00%


k=30 neighbors, selecting 3/6 attributes: High,Volume,Adjusted                     Accuracy: 56.00%
k=30 neighbors, selecting 3/6 attributes: Low,Close,Volume                         Accuracy: 44.00%
k=30 neighbors, selecting 3/6 attributes: Low,Close,Adjusted                       Accuracy: 55.00%
k=30 neighbors, selecting 3/6 attributes: Low,Volume,Adjusted                      Accuracy: 53.00%
k=30 neighbors, selecting 3/6 attributes: Close,Volume,Adjusted                    Accuracy: 57.00%
k=30 neighbors, selecting 4/6 attributes: Open,High,Low,Close                      Accuracy: 52.00%
k=30 neighbors, selecting 4/6 attributes: Open,High,Low,Volume                     Accuracy: 47.00%
k=30 neighbors, selecting 4/6 attributes: Open,High,Low,Adjusted                   Accuracy: 49.00%
k=30 neighbors, selecting 4/6 attributes: Open,High,Close,Volume                   Accuracy: 46.00%
k=30 neighbors, selecting 4/6 attributes: Open,High,Close,Adjusted                 Accuracy: 53.00%


### Best accu: 63.00% with 3 attributes: ['Open' 'High' 'Low'] with k=30 nearest_neighbours


In [85]:
# Report the best configuration recorded by the search loop.
best_summary_template = "\nBest accu: {:.2f}% with {} attributes: {} with k={} nearest_neighbours"
best_summary = best_summary_template.format(
    best_acc*100, len(best_attributes), best_attributes, best_k
)
print(best_summary)


Best accu: 63.00% with 3 attributes: ['Open' 'High' 'Low'] with k=30 nearest_neighbours


## Using SelectKBest to select attributes

#### With *f_classif* feature selection, the feature "Volume" receives the highest score

In [139]:
# Score each attribute with f_classif. k=6 keeps all six attributes, so the
# selector is used here only to obtain the per-feature scores.
features = SelectKBest(score_func=f_classif, k=6)
features.fit(xd_train_knn, yd_train_knn)

# One row per attribute: its name next to its score.
feature_scores = pd.DataFrame({
    "Feature": list(xd_train_knn.columns),
    "Score": features.scores_,
})

# List the attributes from highest to lowest score.
print(feature_scores.nlargest(6, "Score"))

    Feature     Score
4    Volume  4.265510
0      Open  1.504121
5  Adjusted  0.633138
3     Close  0.394569
1      High  0.133339
2       Low  0.063128


In [136]:
# Score each attribute with f_regression (k=6 keeps all six attributes).
# NOTE(review): f_regression is a regression scorer applied here to class labels.
# The scores recorded for it are identical to the f_classif ones, so this cell
# appears to add no new information - confirm it is needed.
features = SelectKBest(score_func=f_regression, k=6)
features.fit(xd_train_knn, yd_train_knn)

# One row per attribute: its name next to its score.
attribute_names = list(xd_train_knn.columns)
feature_scores = pd.DataFrame({"Feature": attribute_names, "Score": features.scores_})

# List the attributes from highest to lowest score.
print(feature_scores.nlargest(6, "Score"))

    Feature     Score
4    Volume  4.265510
0      Open  1.504121
5  Adjusted  0.633138
3     Close  0.394569
1      High  0.133339
2       Low  0.063128


In [137]:
# Score each attribute with chi2 (k=6 keeps all six attributes).
# NOTE(review): the attributes are raw prices and volumes on very different
# scales. The Volume score recorded for this cell is orders of magnitude above
# the rest, which presumably reflects its magnitude rather than its predictive
# value - confirm before relying on this ranking.
selector_chi2 = SelectKBest(score_func=chi2, k=6)
features = selector_chi2.fit(xd_train_knn, yd_train_knn)

# One row per attribute: its name next to its score.
score_table = {"Feature": list(xd_train_knn.columns), "Score": features.scores_}
feature_scores = pd.DataFrame(score_table)

# List the attributes from highest to lowest score.
print(feature_scores.nlargest(6, "Score"))

    Feature         Score
4    Volume  7.994296e+06
0      Open  9.952064e+00
5  Adjusted  3.605536e+00
3     Close  2.608075e+00
1      High  8.737601e-01
2       Low  4.220623e-01


In [138]:
# Score each attribute by its estimated mutual information with the class label
# (k=6 keeps all six attributes).
# The estimator is stochastic, so random_state is fixed; otherwise the scores,
# and possibly the ranking, change from run to run.
def seeded_mutual_info(X, y):
    """Call mutual_info_classif with a fixed seed so repeated runs agree."""
    return mutual_info_classif(X, y, random_state=0)

features = SelectKBest(seeded_mutual_info, k=6).fit(xd_train_knn, yd_train_knn)
df_scores = pd.DataFrame(features.scores_)
df_cols = pd.DataFrame(xd_train_knn.columns)

# Put attribute names and scores side by side, then list them from highest to
# lowest score.
feature_scores = pd.concat([df_cols, df_scores], axis=1)
feature_scores.columns = ["Feature", "Score"]
print(feature_scores.nlargest(6, "Score"))

    Feature     Score
3     Close  0.008422
0      Open  0.000000
1      High  0.000000
2       Low  0.000000
4    Volume  0.000000
5  Adjusted  0.000000
