In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.svm import SVC
from scipy import stats
import math
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

## Urban Traffic in Sao Paulo Dataset
### Cleaning up the data:

In [2]:
df = pd.read_csv('Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', delimiter=';')

df["Slowness in traffic (%)"] = df["Slowness in traffic (%)"].map(lambda x: x.replace(',', '.')) 
time_range = df["Hour (Coded)"].max() - df["Hour (Coded)"].min()
df["Time of Day"] = df["Hour (Coded)"].map(lambda x: 1 if x <= math.ceil(time_range/2) else -1)
df.drop(columns = ["Hour (Coded)"], inplace = True)
df["Slowness in traffic (%)"] = df["Slowness in traffic (%)"].astype(float)
np_arr = df.values

X = np_arr[:, :-1]
y = np_arr[:, -1]


## Decision Tree
### Variable hyperparameters: 
    max_features: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,15
    max_depth : 1, 2, 3, 4
    

In [3]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14,15]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14,15]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14,15]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 1, 'max_features': 11}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 12}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 4}
Average Train Score: 0.7246773292951237
Average Validation Score: 0.6482510288065844
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 2}
Trial 2 best parameters:{'max_depth': 2, 'max_features': 14}
Trial 3 best parameters:{'max_depth': 2, 'max_features': 5}
Average Train Score: 0.7590592722718326
Average Validation Score: 0.6276119402985074
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 3, 'max_features': 12}
Trial 2 best parameters:{'max_depth': 1, 'max_features': 8}
Trial 3 best parameters:{'max_depth': 3, 'max_features': 7}
Average Train Score: 0.8406560154926167
Average Validation Score: 0.672633744855967
Average Test Score: 0.7222222222222223



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [4]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 0.1, 'dual': True}
Trial 2 best parameters:{'C': 0.1, 'dual': True}
Trial 3 best parameters:{'C': 1, 'dual': True}
Average Train Score: 0.7312790790602969
Average Validation Score: 0.6279461279461279
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 0.1, 'dual': True}
Trial 2 best parameters:{'C': 0.01, 'dual': True}
Trial 3 best parameters:{'C': 0.1, 'dual': True}
Average Train Score: 0.6846214671972247
Average Validation Score: 0.6026684758028042
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 0.1, 'dual': True}
Trial 2 best parameters:{'C': 100, 'dual': True}
Trial 3 best parameters:{'C': 0.1, 'dual': True}
Average Train Score: 0.6873220029090182
Average Validation Score: 0.6398709315375981
Average Test Score: 0.7037037037037037



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [5]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Average Train Score: 0.6425523125729523
Average Validation Score: 0.5365622032288698
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Average Train Score: 0.6323422819799632
Average Validation Score: 0.5769230769230769
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Trial 2 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Av

## Banknote Authentication Dataset
### Cleaning up the data:

In [6]:
banknote_df = pd.read_csv('data_banknote_authentication.txt', header = None,
            names = ["Variance", "Skewness", "Curtosis", "Entropy", "Label"])
banknote_df["Label"] = banknote_df["Label"].map(lambda x: x if x==1 else -1) 
banknote_arr = banknote_df.values

X = banknote_arr[:, :-1]
y = banknote_arr[:, -1]

## Decision Tree
### Variable hyperparameters: 
    max_features : 1, 2, 3, 4
    max_depth : 1, 2, 3, 4
    

In [7]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 2}
Average Train Score: 0.8771197548998844
Average Validation Score: 0.8616681859617138
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 4}
Average Train Score: 0.8821745368556616
Average Validation Score: 0.8636722546161323
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 4}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 3}
Average Train Score: 0.8771561566353975
Average Validation Score: 0.8423813868613138
Average Test Score: 0.9635701275045537



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [8]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 100, 'dual': True}
Trial 2 best parameters:{'C': 10, 'dual': True}
Trial 3 best parameters:{'C': 10, 'dual': True}
Average Train Score: 0.8524512827791516
Average Validation Score: 0.8477106834771069
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 10, 'dual': True}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': True}
Average Train Score: 0.8735072481477266
Average Validation Score: 0.8725373266189593
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 10, 'dual': True}
Trial 3 best parameters:{'C': 10, 'dual': True}
Average Train Score: 0.8597215440025495
Average Validation Score: 0.8585260075688517
Average Test Score: 0.9842424242424243



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [9]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}
Average Train Score: 0.8257761749249769
Average Validation Score: 0.8215655998502714
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 2 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l2'}
Trial 3 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Average Train Score: 0.832370240079527
Average Validation Score: 0.8290535994617628
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 3 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'

## Detection of Diseased Trees
### Cleaning up the data:

In [10]:
df1 = pd.read_csv('Testing.csv')
df2 = pd.read_csv('Training.csv')
frames = [df1, df2]
df = pd.concat(frames)
df.reset_index(drop=True)
# 'w' is diseased, 'n' is all other
df["class"] = df["class"].map(lambda x: 1 if x == 'w' else -1) 
np_arr = df.values

X = np_arr[:, 1:]
y = np_arr[:, 0]

## Decision Tree
### Variable hyperparameters: 
    max_features : 1, 2, 3, 4, 5
    max_depth : 1, 2, 3, 4
    

In [11]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 5}
Average Train Score: 0.962309625810333
Average Validation Score: 0.9601093601997761
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 5}
Average Train Score: 0.960014073384301
Average Validation Score: 0.9563455973542786
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 3, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 5}
Average Train Score: 0.960031110020271
Average Validation Score: 0.9503447087211306
Average Test Score: 0.9731404958677686



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [12]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': False}
Average Train Score: 0.9507385032581701
Average Validation Score: 0.9504872927830529
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': False}
Average Train Score: 0.9512288406030617
Average Validation Score: 0.9510065516679821
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': False}
Average Train Score: 0.9464922305418147
Average Validation Score: 0.9463219119638651
Average Test Score: 0.9762396694214877



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [13]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Average Train Score: 0.9337954235846923
Average Validation Score: 0.9319266565905658
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 1000.0, 'class_weight': None, 'penalty': 'l2'}
Trial 3 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Average Train Score: 0.9292861642023131
Average Validation Score: 0.9285040438409599
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 10000.0, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1000.0, 'class_weight': None, 'penalty': 'l1'}


## Detecting Room Occupancy
### Cleaning up the data:

In [14]:
df1 = pd.read_csv('datatest.txt')
df2 = pd.read_csv('datatest2.txt')
df3 = pd.read_csv('datatraining.txt')
frames = [df1, df2, df3]
df = pd.concat(frames)
df = df.reset_index(drop=True)
df["Occupancy"] = df["Occupancy"].map(lambda x: 1 if x == 1 else -1) 
df = df.drop(columns = ["date"])

np_arr = df.values
np.random.shuffle(np_arr)
X = np_arr[:2000, :-1]
y = np_arr[:2000, -1]


## Decision Tree
### Variable hyperparameters: 
    max_features = 1, 2, 3, 4, 5
    max_depth = 1, 2, 3, 4
    

In [15]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 1, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 1, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 1, 'max_features': 4}
Average Train Score: 0.9623686674290144
Average Validation Score: 0.9595104166666667
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 1, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 1, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 2, 'max_features': 3}
Average Train Score: 0.9528653288575721
Average Validation Score: 0.9479333333333332
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 1, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 1, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 1, 'max_features': 4}
Average Train Score: 0.959794095754227
Average Validation Score: 0.9454166666666667
Average Test Score: 0.98875



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [16]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 1, 'dual': True}
Trial 2 best parameters:{'C': 0.1, 'dual': True}
Trial 3 best parameters:{'C': 1, 'dual': True}
Average Train Score: 0.9022631997721806
Average Validation Score: 0.901969696969697
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1, 'dual': True}
Trial 2 best parameters:{'C': 1, 'dual': True}
Trial 3 best parameters:{'C': 1, 'dual': True}
Average Train Score: 0.9033029155071427
Average Validation Score: 0.9029090909090908
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 0.1, 'dual': True}
Trial 2 best parameters:{'C': 1, 'dual': True}
Trial 3 best parameters:{'C': 0.1, 'dual': True}
Average Train Score: 0.9044765841833531
Average Validation Score: 0.9038731060606062
Average Test Score: 0.985



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [17]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 2 best parameters:{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
Trial 3 best parameters:{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
Average Train Score: 0.9147828739565503
Average Validation Score: 0.9143589743589743
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1'}
Trial 3 best parameters:{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
Average Train Score: 0.9240995983874544
Average Validation Score: 0.9238205128205129
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 0.01, 'class_weight': None, 'penalty': 'l2'}
Trial 3 best parameters:{'C': 0.01, 'class_weight': None, 'penalty'