In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from scipy import stats
import math
import matplotlib.pyplot as plt

## Urban Traffic in Sao Paulo Dataset
### Cleaning up the data:

In [None]:
# Load the semicolon-delimited Sao Paulo traffic CSV.
df = pd.read_csv('Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', delimiter=';')

slowness_col = "Slowness in traffic (%)"
hour_col = "Hour (Coded)"

# The slowness column is read as text with a decimal comma; switch the
# separator to a dot and convert the column to float.
df[slowness_col] = [value.replace(',', '.') for value in df[slowness_col]]
df[slowness_col] = df[slowness_col].astype(float)

# Binary target: +1 when the coded hour is at or below ceil(range / 2), else -1.
# Note that the cutoff is derived from the range alone (max - min); the minimum
# coded hour is not added back onto it.
time_range = df[hour_col].max() - df[hour_col].min()
hour_cutoff = math.ceil(time_range / 2)
df["Time of Day"] = np.where(df[hour_col] <= hour_cutoff, 1, -1)

# The raw hour must not remain as a feature because the target is derived from it.
df = df.drop(columns=[hour_col])

# "Time of Day" was appended last, so the final column is the label and
# everything before it is the feature matrix.
np_arr = df.to_numpy()
X, y = np_arr[:, :-1], np_arr[:, -1]
print(X.shape)


## Random Forest
### Variable hyperparameters: 
    max_features: 1, 2, 3, 4, 6, 8, 10, 12, 16
    n_estimators = 10, 30, 70
    

In [None]:
## Random Forest on the traffic data: 80/20, 50/50 and 20/80 train/test splits
#
# For every split, three trials are run. Each trial draws a fresh random
# train/test partition of X, y and grid-searches max_features x n_estimators
# with 3-fold cross-validation on the training portion.
#
# NOTE: "test_scores" collects GridSearchCV's cross-validation scores
# ('mean_test_score'); X_test / y_test are produced by the split but are not
# scored anywhere in this cell.

# Hyperparameter grid shared by all three splits.
set_size = [1, 2, 3, 4, 6, 8, 10, 12, 16]
n_estimators = [10, 30, 70]
parameters = {'max_features': set_size, 'n_estimators' : n_estimators}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('Random Forest using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # n_estimators comes from the grid, so the base estimator keeps its
        # defaults (no reference to a variable defined in a later cell).
        rfc = RandomForestClassifier()
        search = GridSearchCV(rfc, parameters, n_jobs = -1, cv=3, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    # Averages run over every grid point of every trial, not only the best
    # configuration of each trial.
    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print(avg_train_scores)
    print(str(avg_test_scores) + '\n')



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "rbf"
    c = 1e-7, 1e-5, 1e-3, 0.1, 10, 1e3
    gamma = 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2


In [None]:
## SVM on the traffic data: 80/20, 50/50 and 20/80 train/test splits
#
# For every split, three trials are run. Each trial draws a fresh random
# train/test partition of X, y and grid-searches kernel x C x gamma with
# 5-fold cross-validation on the training portion.
#
# NOTE: "test_scores" collects GridSearchCV's cross-validation scores
# ('mean_test_score'); X_test / y_test are produced by the split but are not
# scored anywhere in this cell.

# Hyperparameter grid shared by all three splits.
kernels = ["linear", "rbf"]
c = [1e-7, 1e-5, 1e-3, 0.1, 10, 1e3]
gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
parameters = {'kernel': kernels, 'C' : c, 'gamma' : gamma}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('SVM using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # Bound to `svc` rather than `svm` so the `svm` module imported in the
        # first cell is not overwritten by an estimator instance.
        svc = SVC()
        search = GridSearchCV(svc, parameters, n_jobs = -1, cv=5, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    # Averages run over every grid point of every trial, not only the best
    # configuration of each trial.
    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print(avg_train_scores)
    print(str(avg_test_scores) + '\n')

# Grid-wide mean scores of the most recent search only, i.e. the last trial
# of the 20/80 split.
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1

In [None]:
## Logistic Regression on the traffic data: 80/20, 50/50 and 20/80 train/test splits
#
# For every split, three trials are run. Each trial draws a fresh random
# train/test partition of X, y and grid-searches C x penalty with 5-fold
# cross-validation on the training portion.
#
# NOTE: "test_scores" collects GridSearchCV's cross-validation scores
# ('mean_test_score'); X_test / y_test are produced by the split but are not
# scored anywhere in this cell.

# Hyperparameter grid shared by all three splits.
c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C' : c, 'penalty' : penalty}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('Logistic Regression using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # The grid contains both 'l1' and 'l2'. The 'liblinear' solver handles
        # both; the default 'lbfgs' solver of current scikit-learn rejects
        # 'l1', which makes those fits fail and turns the averaged scores
        # below into NaN.
        logr = LogisticRegression(solver='liblinear')
        search = GridSearchCV(logr, parameters, cv=5, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    # Averages run over every grid point of every trial, not only the best
    # configuration of each trial.
    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print(avg_train_scores)
    print(str(avg_test_scores) + '\n')


## Banknote Authentication Dataset
### Cleaning up the data:

In [None]:
# Load the headerless banknote-authentication file and attach column names.
banknote_columns = ["Variance", "Skewness", "Curtosis", "Entropy", "Label"]
banknote_df = pd.read_csv('data_banknote_authentication.txt', header = None,
                          names = banknote_columns)

# Recode the label to +1 / -1: values equal to 1 are kept, everything else
# becomes -1.
label_is_one = banknote_df["Label"] == 1
banknote_df["Label"] = banknote_df["Label"].where(label_is_one, -1)

# The label is the last column; all preceding columns are features.
# X and y are rebound here, replacing the arrays built from the traffic data.
banknote_arr = banknote_df.to_numpy()
X, y = banknote_arr[:, :-1], banknote_arr[:, -1]

## Random Forest
### Variable hyperparameters: 
    max_features: 1, 2, 3
    n_estimators = 10, 30, 50
    

In [None]:
## Random Forest on the banknote data: 80/20, 50/50 and 20/80 train/test splits
#
# For every split, three trials are run. Each trial draws a fresh random
# train/test partition of X, y and grid-searches max_features x n_estimators
# with 3-fold cross-validation on the training portion.
#
# NOTE: "test_scores" collects GridSearchCV's cross-validation scores
# ('mean_test_score'); X_test / y_test are produced by the split but are not
# scored anywhere in this cell.

# Hyperparameter grid shared by all three splits. The grid key must be the
# estimator's real parameter name, 'n_estimators'; GridSearchCV rejects
# names that are not parameters of the estimator.
set_size = [1, 2, 3]
num_trees = [10, 30, 50]
parameters = {'max_features': set_size, 'n_estimators' : num_trees}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('Random Forest using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # n_estimators comes from the grid; the candidate list itself is not a
        # valid constructor value, so the base estimator keeps its defaults.
        rfc = RandomForestClassifier()
        search = GridSearchCV(rfc, parameters, n_jobs = -1, cv=3, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    # Averages run over every grid point of every trial, not only the best
    # configuration of each trial.
    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print(avg_train_scores)
    print(str(avg_test_scores) + '\n')



## Support Vector Machine
### Variable hyperparameters: 
    kernels = "linear", "poly", "rbf"
    c = 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10, 100, 1e3
    gamma = 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2


In [None]:
## SVM on the banknote data: 80/20, 50/50 and 20/80 train/test splits
#
# For every split, three trials are run. Each trial draws a fresh random
# train/test partition of X, y and grid-searches kernel x C x gamma x degree
# with 5-fold cross-validation on the training portion.
#
# NOTE: "test_scores" collects GridSearchCV's cross-validation scores
# ('mean_test_score'); X_test / y_test are produced by the split but are not
# scored anywhere in this cell.

# Hyperparameter grid shared by all three splits.
kernels = ["linear", "poly", "rbf"]
# One value per decade from 1e-7 to 1e3. The list previously held 1e-3 twice
# (as 1e-3 and 0.001) and skipped 0.01, so the same grid point was searched
# twice.
c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10, 100, 1e3]
gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
degree = [3]
parameters = {'kernel': kernels, 'C' : c, 'gamma' : gamma, 'degree' : degree}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('SVM using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # Bound to `svc` rather than `svm` so the `svm` module imported in the
        # first cell is not overwritten by an estimator instance.
        svc = SVC()
        search = GridSearchCV(svc, parameters, n_jobs = -1, cv=5, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    # Averages run over every grid point of every trial, not only the best
    # configuration of each trial.
    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print(avg_train_scores)
    print(str(avg_test_scores) + '\n')

# Grid-wide mean scores of the most recent search only, i.e. the last trial
# of the 20/80 split.
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


## Logistic Regression
### Variable hyperparameters: 
    c = 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4
    penalty = l2, l1

In [None]:
## Logistic Regression on the banknote data: 80/20, 50/50 and 20/80 train/test splits
#
# For every split, three trials are run. Each trial draws a fresh random
# train/test partition of X, y and grid-searches C x penalty with 5-fold
# cross-validation on the training portion.
#
# NOTE: "test_scores" collects GridSearchCV's cross-validation scores
# ('mean_test_score'); X_test / y_test are produced by the split but are not
# scored anywhere in this cell.

# Hyperparameter grid shared by all three splits.
c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C' : c, 'penalty' : penalty}

for split_name, test_fraction in [('80/20', 0.20), ('50/50', 0.50), ('20/80', 0.80)]:
    train_scores = []
    test_scores = []

    print('Logistic Regression using a ' + split_name + ' train/test split:')
    for x in range(1,4):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

        # The grid contains both 'l1' and 'l2'. The 'liblinear' solver handles
        # both; the default 'lbfgs' solver of current scikit-learn rejects
        # 'l1', which makes those fits fail and turns the averaged scores
        # below into NaN.
        logr = LogisticRegression(solver='liblinear')
        search = GridSearchCV(logr, parameters, cv=5, return_train_score=True)
        search.fit(X_train, y_train)
        best_params = search.best_params_
        train_scores.append(search.cv_results_['mean_train_score'])
        test_scores.append(search.cv_results_['mean_test_score'])
        print('Trial ' + str(x) + ' best parameters:' + str(best_params))

    # Averages run over every grid point of every trial, not only the best
    # configuration of each trial.
    avg_train_scores = np.mean(train_scores)
    avg_test_scores = np.mean(test_scores)

    print(avg_train_scores)
    print(str(avg_test_scores) + '\n')


# TODO
### Report the 'average training' accuracy and the 'testing' accuracy for each classifier and split