## Week 6: Machine Learning & Data Mining

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

### Q1. use all attributes to predict the class ‘Survived’ with a Decision tree classifier.

In [None]:
# loading Titanic data

titanic = pd.read_csv('./titanic.csv')
print("Number of points in original data: {}".format(len(titanic.index)))

columns = titanic.columns
print("Features present in dataset: \n", list(columns))
titanic.head(5)

#### Converting continuous attaribute into classes

In [34]:
conditions = [(titanic['Age'] < 25.0),(titanic['Age'] > 45.0), 
              (titanic['Age'] > 25.0) & (titanic['Age'] < 45.0)]

values = [1, 3, 2]
titanic['New_age'] = np.select(conditions, values)
#titanic
conditions = [(titanic['Fare'] < 15),(titanic['Fare'] > 50), 
              (titanic['Fare'] > 15) & (titanic['Fare'] < 50)]

values = [1, 3, 2]
titanic['New_Fare'] = np.select(conditions, values)

#titanic.loc[titanic['Siblings/Spouses Aboard'] == 1 , 'Siblings/Spouses Aboard'] = 'True'
#titanic

titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 1
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 0

#### Removing continuous attributes after creating new attribute of same column 

In [35]:
titanic.drop(columns=['Age','Fare'], axis = 1, inplace = True)
titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Siblings/Spouses Aboard,Parents/Children Aboard,New_age,New_Fare
0,0,3,Mr. Owen Harris Braund,1,1,0,1,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,0,1,0,2,3
2,1,3,Miss. Laina Heikkinen,0,0,0,2,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,0,1,0,2,3
4,0,3,Mr. William Henry Allen,1,0,0,2,1
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,1,0,0,2,1
883,1,1,Miss. Margaret Edith Graham,0,0,0,1,2
884,0,3,Miss. Catherine Helen Johnston,0,1,2,1,2
885,1,1,Mr. Karl Howell Behr,1,0,0,2,2


In [36]:
le = preprocessing.LabelEncoder()

x = titanic[["Pclass","Sex","Siblings/Spouses Aboard","Parents/Children Aboard", "New_age","New_Fare"]]
y = le.fit(titanic["Survived"])
y = le.transform(titanic["Survived"])


# set the random state 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=0)

print("No of training samples: {}".format(x_train.shape))
print("No of test samples    : {}".format(x_test.shape))
print("y training samples    : {}".format(y_train.shape))
print("y test samples        : {}".format(y_test.shape))
x_train.head(5)

No of training samples: (665, 6)
No of test samples    : (222, 6)
y training samples    : (665,)
y test samples        : (222,)


Unnamed: 0,Pclass,Sex,Siblings/Spouses Aboard,Parents/Children Aboard,New_age,New_Fare
310,2,0,1,1,2,2
317,1,0,1,1,2,3
704,1,1,0,0,2,2
346,3,1,1,1,1,2
489,1,1,0,0,3,2


In [37]:
dt_classifier = DecisionTreeClassifier()

dt_classifier.fit(x_train, y_train)
y_pred = dt_classifier.predict(x_test)
y_pred

array([1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0], dtype=int64)

In [38]:
dt_accuracy = accuracy_score(y_test, y_pred) #confusion_matrix
print("Accuracy:{}".format(100*dt_accuracy))
print("confusion_matrix\n{}".format(confusion_matrix(y_test,y_pred)))

Accuracy:80.18018018018019
confusion_matrix
[[119  23]
 [ 21  59]]


#### a. Find the best tree depth for the model
Best depth i found is 8

In [39]:
for depth in range(1,11):
    dt_classifier_en = DecisionTreeClassifier(criterion = "entropy", max_depth = depth)
    dt_classifier_en.fit(x_train, y_train)
    y_pred_dt = dt_classifier_en.predict(x_test)
    dt_acc = accuracy_score(y_test, y_pred_dt) #confusion_matrix
    print("----------------")
    print(depth)
    print("Accuracy:{:.2f}%".format(100*dt_acc))
    #print("confusion_matrix\n{}".format(confusion_matrix(y_test,y_pred_dt)))

----------------
1
Accuracy:77.93%
----------------
2
Accuracy:77.93%
----------------
3
Accuracy:79.28%
----------------
4
Accuracy:77.93%
----------------
5
Accuracy:80.18%
----------------
6
Accuracy:79.73%
----------------
7
Accuracy:79.28%
----------------
8
Accuracy:81.53%
----------------
9
Accuracy:80.63%
----------------
10
Accuracy:80.18%


### Q2. determine the number attributes that is capable of giving the best prediction of ‘daily returns’.

#### loading data

In [40]:
df = pd.read_csv('./IBM.txt', delimiter = " ")
df_raw = df
print("Number of rows in original data: {}".format(len(df.index)))
print("Features: ", list(df.columns))


Number of rows in original data: 3692
Features:  ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted']


In [41]:
pd.options.mode.chained_assignment = None

df['Daily_returns'] = 100*((df['Close'] - df['Close'].shift())/ df['Close'].shift())
conditions = [(df['Daily_returns'] >= 0.0),(df['Daily_returns'] < 0.0)]
# 1 for UP. -1 for Down

values1 = [1, -1]
df['Decision'] = np.select(conditions, values1)
df['Decision(next_day)'] = df['Decision'].shift(-1)
print("Number of rows in processed data: {}".format(len(df.index)))

df_new = df[1:-2]
df_new['Decision(next_day)'] = df_new['Decision(next_day)'].astype('int32')
df_new.head(8)


Number of rows in processed data: 3692


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,97.25,98.790001,96.879997,98.309998,10524500,63.802544,1.06919,1,-1
2,2007-01-05,97.599998,97.949997,96.910004,97.419998,7221300,63.22493,-0.9053,-1,1
3,2007-01-08,98.5,99.5,98.349998,98.900002,10340000,64.185463,1.519199,1,1
4,2007-01-09,99.080002,100.330002,99.07,100.07,11108200,64.944771,1.183011,1,-1
5,2007-01-10,98.5,99.050003,97.93,98.889999,8744800,64.178978,-1.179176,-1,-1
6,2007-01-11,99.0,99.900002,98.5,98.650002,8000700,64.023201,-0.242691,-1,1
7,2007-01-12,98.989998,99.690002,98.5,99.339996,6636500,64.471024,0.699436,1,1
8,2007-01-16,99.400002,100.839996,99.300003,100.82,9602200,65.431503,1.489837,1,-1


In [42]:
# pd.options.mode.chained_assignment = None

# df_new.loc[df_new['Open'] < 100, 'Open'] = 100
# df_new.loc[(df_new['Open'] > 100)&(df_new['Open'] < 120), 'Open'] = 120
# df_new.loc[df_new['Open'] > 120, 'Open'] = 150

# df_new.loc[df_new['High'] < 100, 'High'] = 100
# df_new.loc[((df_new['High'] > 100)&(df_new['High'] < 120)), 'High'] = 120
# df_new.loc[df_new['High'] > 120, 'High'] = 150
# df_new

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,Daily_returns,Decision,Decision(next_day)
1,2007-01-04,100.0,100.0,96.879997,98.309998,10524500,63.802544,1.069190,1,-1
2,2007-01-05,100.0,100.0,96.910004,97.419998,7221300,63.224930,-0.905300,-1,1
3,2007-01-08,100.0,100.0,98.349998,98.900002,10340000,64.185463,1.519199,1,1
4,2007-01-09,100.0,120.0,99.070000,100.070000,11108200,64.944771,1.183011,1,-1
5,2007-01-10,100.0,100.0,97.930000,98.889999,8744800,64.178978,-1.179176,-1,-1
...,...,...,...,...,...,...,...,...,...,...
3685,2021-08-23,150.0,150.0,138.800003,139.619995,3039600,139.619995,0.366612,1,1
3686,2021-08-24,150.0,150.0,139.320007,139.839996,2365600,139.839996,0.157571,1,1
3687,2021-08-25,150.0,150.0,139.460007,139.860001,2012800,139.860001,0.014306,1,-1
3688,2021-08-26,150.0,150.0,138.710007,138.779999,2498700,138.779999,-0.772202,-1,1


#### Split the data. Last 100 rows as test

In [43]:
df_new_IBM = df_new.copy()
xd_IBM = df_new_IBM[[ "Open", "High", "Low", "Close","Volume", "Adjusted"]]
le = preprocessing.LabelEncoder()
decision = le.fit(df_new_IBM["Decision(next_day)"])
decision = le.transform(df_new_IBM["Decision(next_day)"])

xd_train_dt =  xd_IBM[:-102]
xd_test_dt  =  xd_IBM[-102:-2]

yd_train_dt =  decision[:-102]
yd_test_dt  =  decision[-102:-2]

print("No of training samples : {}".format(xd_train_dt.shape))
print("No of test samples     : {}\n".format(xd_test_dt.shape))
print("y training samples     : {}".format(yd_train_dt.shape))
print("y test samples         : {}\n".format(yd_test_dt.shape))

No of training samples : (3587, 6)
No of test samples     : (100, 6)

y training samples     : (3587,)
y test samples         : (100,)



In [44]:
feat_names = np.array(["Open", "High", "Low", "Close","Volume", "Adjusted"])
from itertools import chain, combinations
s = [x for x in range(6)]
all_combinations = chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
all_combinations = list(all_combinations)[1:]

best_acc = 0.0

for depth in range(2,9):
    for attributes in all_combinations:
            dt_classifier_IBM = DecisionTreeClassifier(criterion = "entropy", max_depth = depth)

            attributes = list(attributes)

            xd_train_dt_IBM = xd_train_dt.iloc[:, attributes]
            xd_test_dt_IBM = xd_test_dt.iloc[:,attributes]

            dt_classifier_IBM.fit(xd_train_dt_IBM.values, yd_train_dt)
            y_pred_dt = dt_classifier_IBM.predict(xd_test_dt_IBM.values)
            acc_dt = accuracy_score(yd_test_dt, y_pred_dt)
            print("Depth {} of tree selecting {}/6 attributes: {:40s} Accuracy: {:.2f}%"
                  .format(depth, len(attributes), ",".join(feat_names[attributes]), 100*acc_dt))

            #print("confusion_matrix: \n",confusion_matrix(yd_test_knn,y_pred_knn))

            if best_acc < acc_dt:
                best_attributes = feat_names[attributes]
                best_acc = acc_dt
                best_depth = depth

Depth 2 of tree selecting 1/6 attributes: Open                                     Accuracy: 56.00%
Depth 2 of tree selecting 1/6 attributes: High                                     Accuracy: 56.00%
Depth 2 of tree selecting 1/6 attributes: Low                                      Accuracy: 56.00%
Depth 2 of tree selecting 1/6 attributes: Close                                    Accuracy: 56.00%
Depth 2 of tree selecting 1/6 attributes: Volume                                   Accuracy: 55.00%
Depth 2 of tree selecting 1/6 attributes: Adjusted                                 Accuracy: 44.00%
Depth 2 of tree selecting 2/6 attributes: Open,High                                Accuracy: 56.00%
Depth 2 of tree selecting 2/6 attributes: Open,Low                                 Accuracy: 56.00%
Depth 2 of tree selecting 2/6 attributes: Open,Close                               Accuracy: 56.00%
Depth 2 of tree selecting 2/6 attributes: Open,Volume                              Accuracy: 55.00%


Depth 3 of tree selecting 3/6 attributes: Open,Volume,Adjusted                     Accuracy: 44.00%
Depth 3 of tree selecting 3/6 attributes: High,Low,Close                           Accuracy: 56.00%
Depth 3 of tree selecting 3/6 attributes: High,Low,Volume                          Accuracy: 56.00%
Depth 3 of tree selecting 3/6 attributes: High,Low,Adjusted                        Accuracy: 44.00%
Depth 3 of tree selecting 3/6 attributes: High,Close,Volume                        Accuracy: 56.00%
Depth 3 of tree selecting 3/6 attributes: High,Close,Adjusted                      Accuracy: 44.00%
Depth 3 of tree selecting 3/6 attributes: High,Volume,Adjusted                     Accuracy: 44.00%
Depth 3 of tree selecting 3/6 attributes: Low,Close,Volume                         Accuracy: 56.00%
Depth 3 of tree selecting 3/6 attributes: Low,Close,Adjusted                       Accuracy: 44.00%
Depth 3 of tree selecting 3/6 attributes: Low,Volume,Adjusted                      Accuracy: 44.00%


Depth 4 of tree selecting 4/6 attributes: Open,Low,Close,Adjusted                  Accuracy: 44.00%
Depth 4 of tree selecting 4/6 attributes: Open,Low,Volume,Adjusted                 Accuracy: 44.00%
Depth 4 of tree selecting 4/6 attributes: Open,Close,Volume,Adjusted               Accuracy: 44.00%
Depth 4 of tree selecting 4/6 attributes: High,Low,Close,Volume                    Accuracy: 55.00%
Depth 4 of tree selecting 4/6 attributes: High,Low,Close,Adjusted                  Accuracy: 44.00%
Depth 4 of tree selecting 4/6 attributes: High,Low,Volume,Adjusted                 Accuracy: 44.00%
Depth 4 of tree selecting 4/6 attributes: High,Close,Volume,Adjusted               Accuracy: 44.00%
Depth 4 of tree selecting 4/6 attributes: Low,Close,Volume,Adjusted                Accuracy: 44.00%
Depth 4 of tree selecting 5/6 attributes: Open,High,Low,Close,Volume               Accuracy: 55.00%
Depth 4 of tree selecting 5/6 attributes: Open,High,Low,Close,Adjusted             Accuracy: 44.00%


Depth 6 of tree selecting 2/6 attributes: Close,Adjusted                           Accuracy: 48.00%
Depth 6 of tree selecting 2/6 attributes: Volume,Adjusted                          Accuracy: 45.00%
Depth 6 of tree selecting 3/6 attributes: Open,High,Low                            Accuracy: 56.00%
Depth 6 of tree selecting 3/6 attributes: Open,High,Close                          Accuracy: 56.00%
Depth 6 of tree selecting 3/6 attributes: Open,High,Volume                         Accuracy: 53.00%
Depth 6 of tree selecting 3/6 attributes: Open,High,Adjusted                       Accuracy: 60.00%
Depth 6 of tree selecting 3/6 attributes: Open,Low,Close                           Accuracy: 56.00%
Depth 6 of tree selecting 3/6 attributes: Open,Low,Volume                          Accuracy: 55.00%
Depth 6 of tree selecting 3/6 attributes: Open,Low,Adjusted                        Accuracy: 46.00%
Depth 6 of tree selecting 3/6 attributes: Open,Close,Volume                        Accuracy: 55.00%


Depth 7 of tree selecting 4/6 attributes: Open,High,Low,Volume                     Accuracy: 55.00%
Depth 7 of tree selecting 4/6 attributes: Open,High,Low,Adjusted                   Accuracy: 61.00%
Depth 7 of tree selecting 4/6 attributes: Open,High,Close,Volume                   Accuracy: 55.00%
Depth 7 of tree selecting 4/6 attributes: Open,High,Close,Adjusted                 Accuracy: 50.00%
Depth 7 of tree selecting 4/6 attributes: Open,High,Volume,Adjusted                Accuracy: 47.00%
Depth 7 of tree selecting 4/6 attributes: Open,Low,Close,Volume                    Accuracy: 55.00%
Depth 7 of tree selecting 4/6 attributes: Open,Low,Close,Adjusted                  Accuracy: 50.00%
Depth 7 of tree selecting 4/6 attributes: Open,Low,Volume,Adjusted                 Accuracy: 47.00%
Depth 7 of tree selecting 4/6 attributes: Open,Close,Volume,Adjusted               Accuracy: 48.00%
Depth 7 of tree selecting 4/6 attributes: High,Low,Close,Volume                    Accuracy: 55.00%


In [30]:
print("\nBest accu: {:.2f}% with {} attributes: {} with depth={} of tree"
      .format(best_acc*100, len(best_attributes), best_attributes, best_depth))


Best accu: 61.00% with 2 attributes: ['Low' 'Adjusted'] with depth=7 of tree
