In [89]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.svm import SVC
from scipy import stats
import math
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

## Urban Traffic in Sao Paulo Dataset
### Cleaning up the data:

In [90]:
df = pd.read_csv('Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', delimiter=';')

df["Slowness in traffic (%)"] = df["Slowness in traffic (%)"].map(lambda x: x.replace(',', '.')) 
time_range = df["Hour (Coded)"].max() - df["Hour (Coded)"].min()
df["Time of Day"] = df["Hour (Coded)"].map(lambda x: 1 if x <= math.ceil(time_range/2) else -1)
df.drop(columns = ["Hour (Coded)"], inplace = True)
df["Slowness in traffic (%)"] = df["Slowness in traffic (%)"].astype(float)
np_arr = df.values

X = np_arr[:, :-1]
y = np_arr[:, -1]


## Decision Tree
### Variable hyperparameters: 
    max_features: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,15
    max_depth : 1, 2, 3, 4
    

In [91]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14,15]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14,15]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14,15]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 3, 'max_features': 12}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 10}
Trial 3 best parameters:{'max_depth': 2, 'max_features': 11}
Average Train Score: 0.7373495050295006
Average Validation Score: 0.6445987654320987
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 3, 'max_features': 15}
Trial 2 best parameters:{'max_depth': 2, 'max_features': 13}
Trial 3 best parameters:{'max_depth': 1, 'max_features': 12}
Average Train Score: 0.7603077066965955
Average Validation Score: 0.6621890547263681
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 3, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 2}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 2}
Average Train Score: 0.8338865955739021
Average Validation Score: 0.6434156378600823
Average Test Score: 0.5802469135802469



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [92]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 0.1, 'dual': True}
Trial 2 best parameters:{'C': 0.1, 'dual': True}
Trial 3 best parameters:{'C': 0.1, 'dual': True}
Average Train Score: 0.7422904657166783
Average Validation Score: 0.5920314253647587
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 0.01, 'dual': True}
Trial 2 best parameters:{'C': 0.01, 'dual': True}
Trial 3 best parameters:{'C': 1, 'dual': True}
Average Train Score: 0.6976801391162393
Average Validation Score: 0.6492537313432837
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 0.01, 'dual': True}
Trial 2 best parameters:{'C': 0.1, 'dual': True}
Trial 3 best parameters:{'C': 0.01, 'dual': True}
Average Train Score: 0.6788958328325813
Average Validation Score: 0.6306116722783389
Average Test Score: 0.7407407407407408



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [93]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l1'}
Average Train Score: 0.6704653371320037
Average Validation Score: 0.5925925925925927
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 3 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Average Train Score: 0.6300332383665717
Average Validation Score: 0.5845771144278606
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1, 'class_weight': 'balanced',

## Banknote Authentication Dataset
### Cleaning up the data:

In [94]:
banknote_df = pd.read_csv('data_banknote_authentication.txt', header = None,
            names = ["Variance", "Skewness", "Curtosis", "Entropy", "Label"])
banknote_df["Label"] = banknote_df["Label"].map(lambda x: x if x==1 else -1) 
banknote_arr = banknote_df.values

X = banknote_arr[:, :-1]
y = banknote_arr[:, -1]

## Decision Tree
### Variable hyperparameters: 
    max_features : 1, 2, 3, 4
    max_depth : 1, 2, 3, 4
    

In [95]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4]
    max_depth = [1, 2, 3, 4]
    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 3}
Average Train Score: 0.8664182590143591
Average Validation Score: 0.8551542084472805
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 3}
Average Train Score: 0.8662693682984345
Average Validation Score: 0.8507045675413023
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 3, 'max_features': 3}
Trial 2 best parameters:{'max_depth': 3, 'max_features': 2}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 4}
Average Train Score: 0.8891973959407583
Average Validation Score: 0.8576642335766423
Average Test Score: 0.9350333940497876



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [96]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 100, 'dual': True}
Trial 2 best parameters:{'C': 1000.0, 'dual': True}
Trial 3 best parameters:{'C': 100, 'dual': True}
Average Train Score: 0.839887983508796
Average Validation Score: 0.8380889183808893
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 10, 'dual': True}
Average Train Score: 0.8577411188279461
Average Validation Score: 0.8552433960597224
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 10, 'dual': True}
Trial 2 best parameters:{'C': 1000.0, 'dual': True}
Trial 3 best parameters:{'C': 100, 'dual': True}
Average Train Score: 0.86191051107458
Average Validation Score: 0.8617027153946024
Average Test Score: 0.9818181818181818



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [97]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Average Train Score: 0.8247503337352012
Average Validation Score: 0.8177989893318363
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 3 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l2'}
Average Train Score: 0.8251885802940648
Average Validation Score: 0.8240636914106304
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 2 best parameters:{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 3 best parameters:{'C': 10, 'class_weight': None, 'penalty':

## Detection of Diseased Trees
### Cleaning up the data:

In [98]:
df1 = pd.read_csv('Testing.csv')
df2 = pd.read_csv('Training.csv')
frames = [df1, df2]
df = pd.concat(frames)
df.reset_index(drop=True)
# 'w' is diseased, 'n' is all other
df["class"] = df["class"].map(lambda x: 1 if x == 'w' else -1) 
np_arr = df.values

X = np_arr[:, 1:]
y = np_arr[:, 0]

## Decision Tree
### Variable hyperparameters: 
    max_features : 1, 2, 3, 4, 5
    max_depth : 1, 2, 3, 4
    

In [99]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 5}
Average Train Score: 0.9600814555849561
Average Validation Score: 0.9571945233789718
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 2 best parameters:{'max_depth': 4, 'max_features': 5}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 4}
Average Train Score: 0.9620404953786633
Average Validation Score: 0.9579509439162188
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 2, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 2, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 5}
Average Train Score: 0.964779393359056
Average Validation Score: 0.9571699413995173
Average Test Score: 0.9704717630853995



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [100]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': False}
Average Train Score: 0.9484031478773957
Average Validation Score: 0.9478863087963401
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': False}
Average Train Score: 0.9499171191919629
Average Validation Score: 0.9496536259661518
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 1000.0, 'dual': False}
Trial 2 best parameters:{'C': 1000.0, 'dual': False}
Trial 3 best parameters:{'C': 1000.0, 'dual': False}
Average Train Score: 0.9492829577643036
Average Validation Score: 0.9491009292094283
Average Test Score: 0.9659090909090909



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [101]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 1000.0, 'class_weight': None, 'penalty': 'l1'}
Average Train Score: 0.9301254557474291
Average Validation Score: 0.9290032614748233
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l2'}
Average Train Score: 0.932493975361295
Average Validation Score: 0.931895994318483
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Average Tra

## Detecting Room Occupancy
### Cleaning up the data:

In [102]:
df1 = pd.read_csv('datatest.txt')
df2 = pd.read_csv('datatest2.txt')
df3 = pd.read_csv('datatraining.txt')
frames = [df1, df2, df3]
df = pd.concat(frames)
df = df.reset_index(drop=True)
df["Occupancy"] = df["Occupancy"].map(lambda x: 1 if x == 1 else -1) 
df = df.drop(columns = ["date"])

np_arr = df.values
np.random.shuffle(np_arr)
X = np_arr[:2000, :-1]
y = np_arr[:2000, -1]


## Decision Tree
### Variable hyperparameters: 
    max_features = 1, 2, 3, 4, 5
    max_depth = 1, 2, 3, 4
    

In [103]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Decision Tree using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    set_size = [1, 2, 3, 4, 5]
    max_depth = [1, 2, 3, 4]

    parameters = {'max_features': set_size, 'max_depth' : max_depth}

    dtc = DecisionTreeClassifier()
    search = GridSearchCV(dtc, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist
for x in range(1,4):
    dtc = DecisionTreeClassifier(max_features = bestparamsav['max_features'], max_depth = bestparamsav['max_depth'])
    dtc.fit(X_train, y_train)
    actual_test_scores.append(dtc.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Decision Tree using a 80/20 train/test split:
Trial 1 best parameters:{'max_depth': 2, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 1, 'max_features': 4}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 3}
Average Train Score: 0.9662078211472837
Average Validation Score: 0.9643229166666667
Decision Tree using a 50/50 train/test split:
Trial 1 best parameters:{'max_depth': 1, 'max_features': 3}
Trial 2 best parameters:{'max_depth': 3, 'max_features': 4}
Trial 3 best parameters:{'max_depth': 3, 'max_features': 4}
Average Train Score: 0.9659564706072514
Average Validation Score: 0.9601833333333333
Decision Tree using a 20/80 train/test split:
Trial 1 best parameters:{'max_depth': 1, 'max_features': 4}
Trial 2 best parameters:{'max_depth': 3, 'max_features': 3}
Trial 3 best parameters:{'max_depth': 4, 'max_features': 3}
Average Train Score: 0.9582597254684778
Average Validation Score: 0.946875
Average Test Score: 0.9874999999999999



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3


In [104]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 80/20 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.80)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 50/50 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.50)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('SVM using a 20/80 train/test split:')
for x in range(1,4):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    scaled_X = scaler.transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.20)

    
    c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1,
         10, 100, 1e3]
    dual = [True, False]
    parameters = {'dual': dual, 'C' : c}
    svm = LinearSVC()
    search = GridSearchCV(svm, parameters, n_jobs = -1, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    svm = LinearSVC(dual = bestparamsav['dual'], C = bestparamsav['C'])
    svm.fit(X_train, y_train)
    actual_test_scores.append(svm.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



SVM using a 80/20 train/test split:
Trial 1 best parameters:{'C': 1, 'dual': True}
Trial 2 best parameters:{'C': 100, 'dual': True}
Trial 3 best parameters:{'C': 0.1, 'dual': True}
Average Train Score: 0.8941876262657042
Average Validation Score: 0.8933333333333333
SVM using a 50/50 train/test split:
Trial 1 best parameters:{'C': 1, 'dual': True}
Trial 2 best parameters:{'C': 10, 'dual': True}
Trial 3 best parameters:{'C': 10, 'dual': True}
Average Train Score: 0.8994004041210976
Average Validation Score: 0.899
SVM using a 20/80 train/test split:
Trial 1 best parameters:{'C': 0.1, 'dual': True}
Trial 2 best parameters:{'C': 100, 'dual': True}
Trial 3 best parameters:{'C': 10, 'dual': True}
Average Train Score: 0.901496525624941
Average Validation Score: 0.9013636363636363
Average Test Score: 0.9825



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1
    class weight = balanced, none


In [105]:
bestparamsav = None
bestparampersist = None
bestval = 0
## 80/20 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 80/20 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 50/50 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 50/50 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

## 20/80 Train/Test Split

train_scores = []
test_scores = []
actual_test_scores = []

print('Logistic Regression using a 20/80 train/test split:')
for x in range(1,4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
    penalty = ['l2', 'l1']
    class_weight = ['balanced', None]
    parameters = {'C' : c, 'penalty' : penalty, 'class_weight' : class_weight}
    logr = LogisticRegression()
    search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
    search.fit(X_train, y_train)
    best_params = search.best_params_
    bestparampersist = best_params
    train_scores.append(search.cv_results_['mean_train_score'])
    test_scores.append(search.cv_results_['mean_test_score'])
    print('Trial ' + str(x) + ' best parameters:' + str(best_params))

avg_train_scores = np.mean(train_scores)
avg_test_scores = np.mean(test_scores)

print("Average Train Score: " + str(avg_train_scores))
print("Average Validation Score: " + str(avg_test_scores))
if bestval < avg_test_scores:
    bestval = avg_test_scores
    bestparamsav = bestparampersist

for x in range(1,4):
    logr = LogisticRegression(C = bestparamsav['C'], penalty = bestparamsav['penalty'], class_weight = bestparamsav['class_weight'])
    logr.fit(X_train, y_train)
    actual_test_scores.append(logr.score(X_test, y_test))
print("Average Test Score: " + str(np.mean(actual_test_scores)) + '\n')



Logistic Regression using a 80/20 train/test split:
Trial 1 best parameters:{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 2 best parameters:{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
Trial 3 best parameters:{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
Average Train Score: 0.9207307811088139
Average Validation Score: 0.9196474358974359
Logistic Regression using a 50/50 train/test split:
Trial 1 best parameters:{'C': 10, 'class_weight': None, 'penalty': 'l1'}
Trial 2 best parameters:{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
Trial 3 best parameters:{'C': 10000.0, 'class_weight': None, 'penalty': 'l1'}
Average Train Score: 0.9264725320967409
Average Validation Score: 0.9262115384615385
Logistic Regression using a 20/80 train/test split:
Trial 1 best parameters:{'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
Trial 2 best parameters:{'C': 100, 'class_weight': None, 'penalty': 'l1'}
Trial 3 best parameters:{'C': 0.1, 'class_weight': None, 'penalty':