In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.svm import SVC
from scipy import stats
import math
import matplotlib.pyplot as plt

## Urban Traffic in Sao Paulo Dataset
### Cleaning up the data:

In [None]:
# Load the semicolon-delimited Sao Paulo traffic table.
df = pd.read_csv('Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', delimiter=';')

# The slowness column is read as text with a decimal comma; swap in a decimal
# point and parse it as float.
df["Slowness in traffic (%)"] = (
    df["Slowness in traffic (%)"]
    .map(lambda raw: raw.replace(',', '.'))
    .astype(float)
)

# Binary target: +1 when the hour code is at most ceil((max - min) / 2), else -1.
time_range = df["Hour (Coded)"].max() - df["Hour (Coded)"].min()
hour_cutoff = math.ceil(time_range / 2)
df["Time of Day"] = df["Hour (Coded)"].map(lambda hour: 1 if hour <= hour_cutoff else -1)

# The raw hour code would leak the target, so it is removed from the features.
df = df.drop(columns=["Hour (Coded)"])
np_arr = df.values

# "Time of Day" was appended last, so the final column is the target.
y = np_arr[:, -1]
X = np_arr[:, :-1]
print(X.shape)
print(y[:30])


## Decision Tree
### Variable hyperparameters: 
    max_features: 1, 2, 3, 6, 8, 10, 15
    max_depth : 1, 2, 3
    

In [None]:
## Decision tree on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search the tree with
## 3-fold CV on the training portion only, then score the refit best model on
## the training portion and on the held-out portion.

# max_features cannot usefully exceed the number of columns in X, so larger
# candidates are clamped to that count (duplicates collapse via the set).
n_features = X.shape[1]
set_size = sorted({min(k, n_features) for k in [1, 2, 3, 6, 8, 10, 15]})
max_depth = [1, 2, 3]
parameters = {'max_features': set_size, 'max_depth': max_depth}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Decision Tree using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        dtc = DecisionTreeClassifier()
        search = GridSearchCV(dtc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on
        # all of X_train, so these are the selected model's accuracies rather
        # than a mean over every grid candidate.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Support Vector Machine
### Variable hyperparameters: 
    model = LinearSVC (linear kernel only)
    C = 1e-7, 1e-5, 1e-3, 0.1, 10, 1e3
    dual = True, False


In [None]:
## Linear SVM on the current X / y.
## For each train/test ratio: run 3 random splits, scale features to [-1, 1],
## grid-search LinearSVC with 3-fold CV on the training portion, then score the
## refit best model on the training portion and on the held-out portion.

c = [1e-7, 1e-5, 1e-3, 0.1, 10, 1e3]
dual = [True, False]
parameters = {'dual': dual, 'C': c}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('SVM using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # Fit the scaler on the training rows only, then apply it to both
        # portions, so every split trains on scaled data and the held-out rows
        # do not influence the scaling.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Named linear_svc (not svm) so the imported sklearn.svm module is not shadowed.
        linear_svc = LinearSVC()
        search = GridSearchCV(linear_svc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1

In [None]:
## Logistic regression on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search C and the penalty
## with 3-fold CV on the training portion, then score the refit best model on
## the training portion and on the held-out portion.

c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C': c, 'penalty': penalty}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Logistic Regression using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # liblinear supports both 'l1' and 'l2'. Where the default solver is
        # lbfgs, every 'l1' candidate fails to fit and contributes a NaN score.
        logr = LogisticRegression(solver='liblinear')
        search = GridSearchCV(logr, parameters, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')


## Banknote Authentication Dataset
### Cleaning up the data:

In [None]:
# The banknote file has no header row, so column names are supplied here.
banknote_columns = ["Variance", "Skewness", "Curtosis", "Entropy", "Label"]
banknote_df = pd.read_csv('data_banknote_authentication.txt', header=None, names=banknote_columns)

# Recode the label to {+1, -1}: a value of 1 is kept, anything else becomes -1.
banknote_df["Label"] = banknote_df["Label"].map(lambda label: label if label == 1 else -1)
banknote_arr = banknote_df.values

# The last column is the target; every column before it is a feature.
y = banknote_arr[:, -1]
X = banknote_arr[:, :-1]

## Decision Tree
### Variable hyperparameters: 
    max_features : 1, 2, 3, 6, 8, 10, 15
    max_depth : 1, 2, 3
    

In [None]:
## Decision tree on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search the tree with
## 3-fold CV on the training portion only, then score the refit best model on
## the training portion and on the held-out portion.

# The candidate list goes up to 15, which can exceed the number of columns in
# X; such values are clamped to the column count (duplicates collapse via the
# set) so no candidate asks for more features than exist.
n_features = X.shape[1]
set_size = sorted({min(k, n_features) for k in [1, 2, 3, 6, 8, 10, 15]})
max_depth = [1, 2, 3]
parameters = {'max_features': set_size, 'max_depth': max_depth}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Decision Tree using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        dtc = DecisionTreeClassifier()
        search = GridSearchCV(dtc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on
        # all of X_train, so these are the selected model's accuracies rather
        # than a mean over every grid candidate.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Support Vector Machine
### Variable hyperparameters: 
    model = LinearSVC (linear kernel only)
    C = 1e-7, 1e-5, 1e-3, 0.1, 10, 1e3
    dual = True, False


In [None]:
## Linear SVM on the current X / y.
## For each train/test ratio: run 3 random splits, scale features to [-1, 1],
## grid-search LinearSVC with 3-fold CV on the training portion, then score the
## refit best model on the training portion and on the held-out portion.

c = [1e-7, 1e-5, 1e-3, 0.1, 10, 1e3]
dual = [True, False]
parameters = {'dual': dual, 'C': c}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('SVM using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # Fit the scaler on the training rows only, then apply it to both
        # portions, so every split trains on scaled data and the held-out rows
        # do not influence the scaling.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Named linear_svc (not svm) so the imported sklearn.svm module is not shadowed.
        linear_svc = LinearSVC()
        search = GridSearchCV(linear_svc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1

In [None]:
## Logistic regression on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search C and the penalty
## with 3-fold CV on the training portion, then score the refit best model on
## the training portion and on the held-out portion.

c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C': c, 'penalty': penalty}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Logistic Regression using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # liblinear supports both 'l1' and 'l2'. Where the default solver is
        # lbfgs, every 'l1' candidate fails to fit and contributes a NaN score.
        logr = LogisticRegression(solver='liblinear')
        search = GridSearchCV(logr, parameters, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')


## Detection of Diseased Trees
### Cleaning up the data:

In [None]:
# Load both diseased-tree files and stack them into one frame.
df1 = pd.read_csv('Testing.csv')
df2 = pd.read_csv('Training.csv')
frames = [df1, df2]
# ignore_index=True renumbers the stacked rows 0..n-1. A bare
# df.reset_index(drop=True) returns a new frame, so calling it without
# assigning the result leaves the duplicated row labels in place.
df = pd.concat(frames, ignore_index=True)
# 'w' is diseased, 'n' is all other
df["class"] = df["class"].map(lambda x: 1 if x == 'w' else -1)
np_arr = df.values

# Column 0 is the recoded class label; the remaining columns are features.
X = np_arr[:, 1:]
y = np_arr[:, 0]

## Decision Tree
### Variable hyperparameters: 
    max_features : 1, 2, 3, 4
    max_depth : 1, 2, 3
    

In [None]:
## Decision tree on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search the tree with
## 3-fold CV on the training portion only, then score the refit best model on
## the training portion and on the held-out portion.

# max_features cannot usefully exceed the number of columns in X, so larger
# candidates are clamped to that count (duplicates collapse via the set).
n_features = X.shape[1]
set_size = sorted({min(k, n_features) for k in [1, 2, 3, 4]})
max_depth = [1, 2, 3]
parameters = {'max_features': set_size, 'max_depth': max_depth}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Decision Tree using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        dtc = DecisionTreeClassifier()
        search = GridSearchCV(dtc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on
        # all of X_train, so these are the selected model's accuracies rather
        # than a mean over every grid candidate.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Support Vector Machine
### Variable hyperparameters: 
    model = LinearSVC (linear kernel only)
    C = 1e-7, 1e-5, 1e-3, 0.1, 10, 1e3
    dual = True, False


In [None]:
## Linear SVM on the current X / y.
## For each train/test ratio: run 3 random splits, scale features to [-1, 1],
## grid-search LinearSVC with 3-fold CV on the training portion, then score the
## refit best model on the training portion and on the held-out portion.

c = [1e-7, 1e-5, 1e-3, 0.1, 10, 1e3]
dual = [True, False]
parameters = {'dual': dual, 'C': c}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('SVM using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # Fit the scaler on the training rows only, then apply it to both
        # portions, so every split trains on scaled data and the held-out rows
        # do not influence the scaling.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Named linear_svc (not svm) so the imported sklearn.svm module is not shadowed.
        linear_svc = LinearSVC()
        search = GridSearchCV(linear_svc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1

In [None]:
## Logistic regression on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search C and the penalty
## with 3-fold CV on the training portion, then score the refit best model on
## the training portion and on the held-out portion.

c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C': c, 'penalty': penalty}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Logistic Regression using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # liblinear supports both 'l1' and 'l2'. Where the default solver is
        # lbfgs, every 'l1' candidate fails to fit and contributes a NaN score.
        logr = LogisticRegression(solver='liblinear')
        search = GridSearchCV(logr, parameters, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')


## Detecting Room Occupancy
### Cleaning up the data:

In [None]:
# Read the three occupancy files and stack them under a fresh 0..n-1 index.
df1 = pd.read_csv('datatest.txt')
df2 = pd.read_csv('datatest2.txt')
df3 = pd.read_csv('datatraining.txt')
frames = [df1, df2, df3]
df = pd.concat(frames).reset_index(drop=True)

# Recode the target to {+1, -1}, then discard the "date" column.
df["Occupancy"] = df["Occupancy"].map(lambda occupied: 1 if occupied == 1 else -1)
df = df.drop(columns=["date"])

# Shuffle the rows in place (unseeded, so the sample differs between runs)
# and keep only the first 2000 rows as the working set.
np_arr = df.values
np.random.shuffle(np_arr)
occupancy_sample = np_arr[:2000]

# The last column is the recoded Occupancy target; the rest are features.
y = occupancy_sample[:, -1]
X = occupancy_sample[:, :-1]


## Decision Tree
### Variable hyperparameters: 
    max_features = 1, 2, 3, 4, 5
    max_depth = 1, 2, 3
    

In [None]:
## Decision tree on the current X / y.
## For each train/test ratio: run 3 random splits, grid-search the tree with
## 3-fold CV on the training portion only, then score the refit best model on
## the training portion and on the held-out portion.

# max_features cannot usefully exceed the number of columns in X, so larger
# candidates are clamped to that count (duplicates collapse via the set).
n_features = X.shape[1]
set_size = sorted({min(k, n_features) for k in [1, 2, 3, 4, 5]})
max_depth = [1, 2, 3]
parameters = {'max_features': set_size, 'max_depth': max_depth}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('Decision Tree using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        dtc = DecisionTreeClassifier()
        search = GridSearchCV(dtc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on
        # all of X_train, so these are the selected model's accuracies rather
        # than a mean over every grid candidate.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Support Vector Machine
### Variable hyperparameters: 
    model = LinearSVC (linear kernel only)
    C = 1e-7, 1e-5, 1e-3, 0.1, 10, 1e3
    dual = True, False


In [None]:
## Linear SVM on the current X / y.
## For each train/test ratio: run 3 random splits, scale features to [-1, 1],
## grid-search LinearSVC with 3-fold CV on the training portion, then score the
## refit best model on the training portion and on the held-out portion.

c = [1e-7, 1e-5, 1e-3, 0.1, 10, 1e3]
dual = [True, False]
parameters = {'dual': dual, 'C': c}

# (banner label, fraction of rows held out for testing)
splits = [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]

for split_label, test_size in splits:
    train_scores = []
    test_scores = []

    print('SVM using a ' + split_label + ' train/test split:')
    for trial in range(1, 4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # Fit the scaler on the training rows only, then apply it to both
        # portions, so every split trains on scaled data and the held-out rows
        # do not influence the scaling.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Named linear_svc (not svm) so the imported sklearn.svm module is not shadowed.
        linear_svc = LinearSVC()
        search = GridSearchCV(linear_svc, parameters, n_jobs=-1, cv=3)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        # With the default refit=True, score() uses the best candidate refit on all of X_train.
        train_scores.append(search.score(X_train, y_train))
        test_scores.append(search.score(X_test, y_test))
        print('Trial ' + str(trial) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')



## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1

In [None]:
## Logistic Regression on 80/20, 50/50 and 20/80 Train/Test Splits
#
# For every split ratio, three random trials are run. Each trial grid-searches
# LogisticRegression over `parameters` with 3-fold cross-validation on the
# training portion. The reported "Train"/"Test" scores are the
# cross-validation mean_train_score / mean_test_score arrays from
# GridSearchCV, averaged over every grid point and every trial;
# X_test / y_test are not scored here.

# Hyperparameter grid shared by all splits and trials.
c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C' : c, 'penalty' : penalty}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('Logistic Regression using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # The grid contains both 'l1' and 'l2'. The solver is pinned to
        # liblinear because it supports both penalties; with the newer default
        # solver (lbfgs) every 'l1' fit fails, its scores become NaN, and the
        # averages printed below turn into nan.
        logr = LogisticRegression(solver='liblinear')
        search = GridSearchCV(logr, parameters, cv=3, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print("Average Train Score: " + str(avg_train_scores))
    print("Average Test Score: " + str(avg_test_scores) + '\n')


## Best Classifiers, Hyper-parameters, and Parameters for each Dataset:

### Behavior of the Urban Traffic of the City of Sao Paulo in Brazil Data Set:


### Banknote Authentication Data Set:

### Wilt Data Set:

### Occupancy Detection Data Set:
