# Data Science - Assignment 5 - Comparative Experimentation

Dataset: <b>Covtype</b>     (large dataset)

Thomas Bründl

se21m032

<br>

### Approach

In this exercise I experimented with three different algorithms (KNN, Perceptron, Decission Tree).
The input parameters of the respective algorithms were varied to determine how this would affect Effectiveness and Efficiency.


### Evaluation method

For each dataset, I investigated 2 parameters to determine Efficiency:

1. Training time
2. Testing time

To determine Effectiveness I took a look at 4 different parameters:

1. Accuracy score
2. Jaccard score
3. f1 score
4. Precision score




# Imports

In [11]:
import pandas as pd
import numpy as np
import time as time
import statistics
from tabulate import tabulate
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [12]:
data = pd.read_csv(r'covtype.data', header=None)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,2959.365301,155.656807,14.103704,269.428217,46.418855,2350.146611,212.146049,223.318716,142.528263,1980.291226,...,0.090392,0.077716,0.002773,0.003255,0.000205,0.000513,0.026803,0.023762,0.01506,2.051471
std,279.984734,111.913721,7.488242,212.549356,58.295232,1559.25487,26.769889,19.768697,38.274529,1324.19521,...,0.286743,0.267725,0.052584,0.056957,0.01431,0.022641,0.161508,0.152307,0.121791,1.396504
min,1859.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2996.0,127.0,13.0,218.0,30.0,1997.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,3858.0,360.0,66.0,1397.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


# Hold Out Method

The first 53 columns (x) are used as independet variables to predict the dependent variable y which represents the column 54. 

The data (rows) is split into train and test data with a ratio of 66/33. 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data.loc[:,:53], data.loc[:,54:], test_size=0.33, random_state=1524401)

# KNN (k-nearest neighbors)

KNN was tested with the kd-tree algorithm and three different input parameters were chosen as neighbors.

In [14]:
train_times = []
test_times = []

accuracy_scores = []
jaccard_scores = []
f1_scores = []
precision_scores = []

neighbors = [3, 5, 8]

for n_neighbors in neighbors:
    # print("--[KNN]----[n_neighbors: " + str(n_neighbors) + "]---------------------------------------------")

    algo = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

    # train ----------------------------------------------------
    start_training = time.time()
    model = algo.fit(X=X_train, y=y_train.values.ravel())
    training_time = time.time() - start_training
    # print("training_time: " + str(training_time))
    train_times.append(training_time)

    # predict ----------------------------------------------------
    start_testing = time.time()
    y_pred = model.predict(X=X_test)
    test_time = time.time() - start_testing
    # print("test_time: " + str(test_time))
    test_times.append(test_time)

    # --- accuracy -------------------------------------------------------------------
    accuracy_score_result = accuracy_score(y_true=y_test, y_pred=y_pred)
    # print("accuracy: " + str(accuracy_score_result))
    accuracy_scores.append(accuracy_score_result)

    # --- jaccard -------------------------------------------------------------------
    jaccard_score_result = jaccard_score(y_true=y_test, y_pred=y_pred, average='weighted')
    # print("jaccard: " + str(jaccard_score_result))
    jaccard_scores.append(jaccard_score_result)

    # --- f1 -------------------------------------------------------------------
    f1_score_result = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
    # print("f1_score: " + str(f1_score_result))
    f1_scores.append(f1_score_result)

    # --- precision -------------------------------------------------------------------
    precision_score_result = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
    # print("precision_score: " + str(precision_score_result))
    precision_scores.append(precision_score_result)
    

print("--[KNN]----[Mean Results]---------------------------------------------")

mean_training_time = statistics.mean(train_times)
mean_testing_time = statistics.mean(test_times)

print("mean training time: " + str(mean_training_time))
print("mean testing time: " + str(mean_testing_time))

mean_accuracy_score = statistics.mean(accuracy_scores)
mean_jaccard_score = statistics.mean(jaccard_scores)
mean_f1_score = statistics.mean(f1_scores)
mean_precision_score = statistics.mean(precision_scores)

print("mean accuracy score: " + str(mean_accuracy_score))
print("mean jaccard score: " + str(mean_jaccard_score))
print("mean f1 score: " + str(mean_f1_score))
print("mean precision score: " + str(mean_precision_score))

knn_mean_training_time = mean_training_time
knn_mean_testing_time = mean_testing_time

knn_mean_accuracy_score = mean_accuracy_score
knn_mean_jaccard_score = mean_jaccard_score
knn_mean_f1_score = mean_f1_score
knn_mean_precision_score = mean_precision_score

--[KNN]----[Mean Results]---------------------------------------------
mean training time: 10.97989821434021
mean testing time: 17.78772298494975
mean accuracy score: 0.963042548530777
mean jaccard score: 0.9289282118980144
mean f1 score: 0.9629408779632805
mean precision score: 0.9630232642959531


## KNN - Analyze the results based on different input parameters 

In [15]:
eval_criteria = [train_times, test_times, accuracy_scores, jaccard_scores, f1_scores, precision_scores]
eval_criteria_name = ["train_times", "test_times", "accuracy_scores", "jaccard_scores", "f1_scores", "precision_scores"]

i = 0
for criteria in eval_criteria:
    print("\n " + eval_criteria_name[i])

    headers = ["neighbors", "3", "5", "8"]
    table_data = [[""]]

    for idx, neighbor in enumerate(neighbors):
        table_data[0].append(eval_criteria[i][idx])
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    i += 1
    


 train_times
+-------------+---------+---------+---------+
| neighbors   |       3 |       5 |       8 |
|             | 11.4612 | 10.7346 | 10.7439 |
+-------------+---------+---------+---------+

 test_times
+-------------+---------+---------+---------+
| neighbors   |       3 |       5 |       8 |
|             | 14.5399 | 17.4712 | 21.3521 |
+-------------+---------+---------+---------+

 accuracy_scores
+-------------+----------+----------+----------+
| neighbors   |        3 |        5 |        8 |
|             | 0.966281 | 0.964555 | 0.958291 |
+-------------+----------+----------+----------+

 jaccard_scores
+-------------+----------+----------+----------+
| neighbors   |        3 |        5 |        8 |
|             | 0.934965 | 0.931737 | 0.920083 |
+-------------+----------+----------+----------+

 f1_scores
+-------------+----------+----------+----------+
| neighbors   |        3 |        5 |        8 |
|             | 0.966241 | 0.964483 | 0.958099 |
+-------------+----

The test time increases when the neighbours count is increased.
All other evaluation criteria are not substantially  influenced by the varying neighbours count.

# Perceptron

The perceptron was tested with different alphas (0.0001, 0.00001, 0.001) and penalties (l2, l1, elasticnet).
I can conclude that an alpha of 0.0001 will produce the best runtime result. 

In [37]:
train_times = []
test_times = []

accuracy_scores = []
jaccard_scores = []
f1_scores = []
precision_scores = []

alphas = [0.0001, 0.00001, 0.001]
penalties = ['l2', 'l1', 'elasticnet']

for alpha in alphas:
    for penalty in penalties:

        # print("--[Perceptron]----[alpha: " + str(alpha) + "]-----[penalty: " + str(penalty) + "]----------------------------------------")

        algo = Perceptron(alpha=alpha, penalty=penalty, random_state=1524401)

        # train ----------------------------------------------------
        start_training = time.time()
        model = algo.fit(X=X_train, y=y_train.values.ravel())
        training_time = time.time() - start_training
        # print("training_time: " + str(training_time))
        train_times.append(training_time)

        # predict ----------------------------------------------------
        start_testing = time.time()
        y_pred = model.predict(X=X_test)
        test_time = time.time() - start_testing
        # print("test_time: " + str(test_time))
        test_times.append(test_time)

        # --- accuracy -------------------------------------------------------------------
        accuracy_score_result = accuracy_score(y_true=y_test, y_pred=y_pred)
        # print("accuracy: " + str(accuracy_score_result))
        accuracy_scores.append(accuracy_score_result)

        # --- jaccard -------------------------------------------------------------------
        jaccard_score_result = jaccard_score(y_true=y_test, y_pred=y_pred, average='weighted')
        # print("jaccard: " + str(jaccard_score_result))
        jaccard_scores.append(jaccard_score_result)

        # --- f1 -------------------------------------------------------------------
        f1_score_result = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
        # print("f1_score: " + str(f1_score_result))
        f1_scores.append(f1_score_result)

        # --- precision -------------------------------------------------------------------
        precision_score_result = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
        # print("precision_score: " + str(precision_score_result))
        precision_scores.append(precision_score_result)
    

print("--[Perceptron]----[Mean Results]---------------------------------------------")


print("Take only the first element of the train_times and the test_times list due to highly volatile behaviour of the train_time when it comes to alpha (0.00001, 0.001).")
print("This means when we take a alpha of 0.00001 or 0.001 then the train_time is increased substantially.")
print("I choose to take only the first instance into account that is performed with a alpha of 0.0001 to not produce a misleading training result.")
mean_training_time = train_times[0]
mean_testing_time = test_times[0]

print("mean training time: " + str(mean_training_time))
print("mean testing time: " + str(mean_testing_time))

mean_accuracy_score = statistics.mean(accuracy_scores)
mean_jaccard_score = statistics.mean(jaccard_scores)
mean_f1_score = statistics.mean(f1_scores)
mean_precision_score = statistics.mean(precision_scores)

print("mean accuracy score: " + str(mean_accuracy_score))
print("mean jaccard score: " + str(mean_jaccard_score))
print("mean f1 score: " + str(mean_f1_score))
print("mean precision score: " + str(mean_precision_score))

perceptron_mean_training_time = mean_training_time
perceptron_mean_testing_time = mean_testing_time

perceptron_mean_accuracy_score = mean_accuracy_score
perceptron_mean_jaccard_score = mean_jaccard_score
perceptron_mean_f1_score = mean_f1_score
perceptron_mean_precision_score = mean_precision_score

--[Perceptron]----[Mean Results]---------------------------------------------
Take only the first element of the train_times and the test_times list due to highly volatile behaviour of the train_time when it comes to alpha (0.00001, 0.001).
This means when we take a alpha of 0.00001 or 0.001 then the train_time is increased substantially.
I choose to take only the first instance into account that is performed with a alpha of 0.0001 to not produce a misleading training result.
mean training time: 5.926147222518921
mean testing time: 0.06399273872375488
mean accuracy score: 0.4574149603095956
mean jaccard score: 0.2683143990277549
mean f1 score: 0.3802636686106108
mean precision score: 0.5165258227665548


## Perceptron - Analyze the results based on different input parameters 

In [38]:
eval_criteria = [train_times, test_times, accuracy_scores, jaccard_scores, f1_scores, precision_scores]
eval_criteria_name = ["train_times", "test_times", "accuracy_scores", "jaccard_scores", "f1_scores", "precision_scores"]
i = 0
for criteria in eval_criteria:
    print("\n " + eval_criteria_name[i])
    headers = ["penalty\\alpha", "0.0001", "0.00001", "0.001"]
    table_data = []
    for idy, y in enumerate(penalties):
        table_data.append([penalties[idy]])
        for idx, x in enumerate(alphas):
            table_data[idy].append(eval_criteria[i][len(alphas)*idy+idx])
    
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    i += 1


 train_times
+-----------------+----------+-----------+---------+
| penalty\alpha   |   0.0001 |   0.00001 |   0.001 |
| l2              |  5.92615 |   78.3912 | 95.4666 |
+-----------------+----------+-----------+---------+
| l1              |  6.65738 |   32.1401 | 11.1099 |
+-----------------+----------+-----------+---------+
| elasticnet      |  6.19961 |   31.7278 | 70.091  |
+-----------------+----------+-----------+---------+

 test_times
+-----------------+-----------+-----------+-----------+
| penalty\alpha   |    0.0001 |   0.00001 |     0.001 |
| l2              | 0.0639927 | 0.0560403 | 0.0590091 |
+-----------------+-----------+-----------+-----------+
| l1              | 0.0620506 | 0.057003  | 0.0580032 |
+-----------------+-----------+-----------+-----------+
| elasticnet      | 0.0668502 | 0.0640521 | 0.0600488 |
+-----------------+-----------+-----------+-----------+

 accuracy_scores
+-----------------+----------+-----------+----------+
| penalty\alpha   |   0.0001 

When the penalty is set to "l2" and alpha is increased (e.g. from 0.0001 to 0.001) then the training time increases substantially (i.e. 95 seconds).

Varying the penalty does not produce significantly different results.

# Decision Tree

The decision tree was tested with different min_samples_splits (2, 50, 100, 500, 1000) and min_samples_leafs (1, 50, 100, 500, 1000).

When using a min_samples_split of 2 and a min_samples_leafs of 1000 a training time of only 3.87374 can be achieved. However when chosing this configuration the accuracy_score decreases.

In [18]:
train_times = []
test_times = []

accuracy_scores = []
jaccard_scores = []
f1_scores = []
precision_scores = []

min_samples_splits = [2, 50, 100, 500, 1000]
min_samples_leafs = [1, 50, 100, 500, 1000]


for min_samples_split in min_samples_splits:
    for min_samples_leaf in min_samples_leafs:
        # print("--[DecisionTree]----[min_samples_splits: " + str(min_samples_split) + "]-----[min_samples_leafs: " + str(min_samples_leaf) + "]----------------------------------------")

        algo = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=min_samples_split, random_state=1524401)

        # train ----------------------------------------------------
        start_training = time.time()
        model = algo.fit(X=X_train, y=y_train.values.ravel())
        training_time = time.time() - start_training
        # print("training_time: " + str(training_time))
        train_times.append(training_time)

        # predict ----------------------------------------------------
        start_testing = time.time()
        y_pred = model.predict(X=X_test)
        test_time = time.time() - start_testing
        # print("test_time: " + str(test_time))
        test_times.append(test_time)

        # --- accuracy -------------------------------------------------------------------
        accuracy_score_result = accuracy_score(y_true=y_test, y_pred=y_pred)
        # print("accuracy: " + str(accuracy_score_result))
        accuracy_scores.append(accuracy_score_result)

        # --- jaccard -------------------------------------------------------------------
        jaccard_score_result = jaccard_score(y_true=y_test, y_pred=y_pred, average='weighted')
        # print("jaccard: " + str(jaccard_score_result))
        jaccard_scores.append(jaccard_score_result)

        # --- f1 -------------------------------------------------------------------
        f1_score_result = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
        # print("f1_score: " + str(f1_score_result))
        f1_scores.append(f1_score_result)

        # --- precision -------------------------------------------------------------------
        precision_score_result = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
        # print("precision_score: " + str(precision_score_result))
        precision_scores.append(precision_score_result)


print("--[DecisionTree]----[Mean Results]---------------------------------------------")

mean_training_time = statistics.mean(train_times)
mean_testing_time = statistics.mean(test_times)

print("mean training time: " + str(mean_training_time))
print("mean testing time: " + str(mean_testing_time))

mean_accuracy_score = statistics.mean(accuracy_scores)
mean_jaccard_score = statistics.mean(jaccard_scores)
mean_f1_score = statistics.mean(f1_scores)
mean_precision_score = statistics.mean(precision_scores)

print("mean accuracy score: " + str(mean_accuracy_score))
print("mean jaccard score: " + str(mean_jaccard_score))
print("mean f1 score: " + str(mean_f1_score))
print("mean precision score: " + str(mean_precision_score))

decisionTree_mean_training_time = mean_training_time
decisionTree_mean_testing_time = mean_testing_time

decisionTree_mean_accuracy_score = mean_accuracy_score
decisionTree_mean_jaccard_score = mean_jaccard_score
decisionTree_mean_f1_score = mean_f1_score
decisionTree_mean_precision_score = mean_precision_score  

--[DecisionTree]----[Mean Results]---------------------------------------------
mean training time: 4.568159532546997
mean testing time: 0.06457931518554688
mean accuracy score: 0.8640700136647648
mean jaccard score: 0.7656439620799526
mean f1 score: 0.8630173329961914
mean precision score: 0.8631774455252602


## Decision Tree - Analyze the results based on different input parameters 

In [19]:
eval_criteria = [train_times, test_times, accuracy_scores, jaccard_scores, f1_scores, precision_scores]
eval_criteria_name = ["train_times", "test_times", "accuracy_scores", "jaccard_scores", "f1_scores", "precision_scores"]
i = 0
for criteria in eval_criteria:
    print("\n " + eval_criteria_name[i])
    headers = ["leafs\splits", "2", "50", "100", "500", "1000"]
    table_data = []
    for idy, y in enumerate(min_samples_leafs):
        table_data.append([min_samples_leafs[idy]])
        for idx, x in enumerate(min_samples_splits):
            table_data[idy].append(eval_criteria[i][len(min_samples_leafs)*idy+idx])
    
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    i += 1


 train_times
+----------------+---------+---------+---------+---------+---------+
|   leafs\splits |       2 |      50 |     100 |     500 |    1000 |
|              1 | 5.42151 | 5.25915 | 4.99193 | 4.96798 | 4.94437 |
+----------------+---------+---------+---------+---------+---------+
|             50 | 4.85013 | 4.82666 | 4.86817 | 4.84664 | 4.85455 |
+----------------+---------+---------+---------+---------+---------+
|            100 | 4.74569 | 4.73399 | 4.74066 | 4.71897 | 4.71965 |
+----------------+---------+---------+---------+---------+---------+
|            500 | 4.1875  | 4.18849 | 4.24311 | 4.22385 | 4.24498 |
+----------------+---------+---------+---------+---------+---------+
|           1000 | 3.87374 | 3.89623 | 3.90543 | 3.87415 | 4.07646 |
+----------------+---------+---------+---------+---------+---------+

 test_times
+----------------+-----------+-----------+-----------+-----------+-----------+
|   leafs\splits |         2 |        50 |       100 |       500 |

# Results (KNN, Perceptron, Decision Tree)

In [39]:
headers = ["", "Train time", "Test time", "Accuracy", "Jaccard", "f1", "Precision"]

table_data = [
    ["K-NN", str(knn_mean_training_time), str(knn_mean_testing_time), str(knn_mean_accuracy_score),  str(knn_mean_jaccard_score), str(knn_mean_f1_score), str(knn_mean_precision_score)],
    ["Perceptron",  str(perceptron_mean_training_time), str(perceptron_mean_testing_time), str(perceptron_mean_accuracy_score),  str(perceptron_mean_jaccard_score), str(perceptron_mean_f1_score), str(perceptron_mean_precision_score)],
    ["Decision Tree",str(decisionTree_mean_training_time), str(decisionTree_mean_testing_time), str(decisionTree_mean_accuracy_score),  str(decisionTree_mean_jaccard_score), str(decisionTree_mean_f1_score), str(decisionTree_mean_precision_score)],
]

print(tabulate(table_data, headers=headers, tablefmt="grid"))

+---------------+--------------+-------------+------------+-----------+----------+-------------+
|               |   Train time |   Test time |   Accuracy |   Jaccard |       f1 |   Precision |
| K-NN          |     10.9799  |  17.7877    |   0.963043 |  0.928928 | 0.962941 |    0.963023 |
+---------------+--------------+-------------+------------+-----------+----------+-------------+
| Perceptron    |      5.92615 |   0.0639927 |   0.457415 |  0.268314 | 0.380264 |    0.516526 |
+---------------+--------------+-------------+------------+-----------+----------+-------------+
| Decision Tree |      4.56816 |   0.0645793 |   0.86407  |  0.765644 | 0.863017 |    0.863177 |
+---------------+--------------+-------------+------------+-----------+----------+-------------+


The prceptron and the decision tree achieve good efficiency. The train time and test time are similar with the perceptron and the decision tree.

KNN is not really efficient compared to the other methods. However KNN does achieve a very high accuracy, Jaccard, f1 and precission.

The decission tree seems to be the best suited method for this data set. It provides very low training and testing time and produces a very high precision of 0.86.

