In [144]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from scipy import stats
import math
import matplotlib.pyplot as plt

## Urban Traffic in Sao Paulo Dataset
### Cleaning up the data:

In [145]:
# Load the semicolon-delimited Sao Paulo urban-traffic CSV.
df = pd.read_csv('Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', delimiter=';')

# The slowness column is written with a decimal comma, so it is read as text;
# swap the comma for a dot and convert the column to float.
df["Slowness in traffic (%)"] = (
    df["Slowness in traffic (%)"].str.replace(',', '.', regex=False).astype(float)
)

# Binarise the coded hour into a +1 / -1 "Time of Day" label. The split point
# is the midpoint of the observed codes, i.e. min + ceil(range / 2). Comparing
# against ceil(range / 2) alone (without the minimum) is only a midpoint when
# the codes start at 0.
hour_min = df["Hour (Coded)"].min()
time_range = df["Hour (Coded)"].max() - hour_min
hour_midpoint = hour_min + math.ceil(time_range / 2)
df["Time of Day"] = np.where(df["Hour (Coded)"] <= hour_midpoint, 1, -1)

# The raw hour code is now encoded in the label, so drop it from the features.
df = df.drop(columns=["Hour (Coded)"])
np_arr = df.values

# "Time of Day" was appended last, so the final column is the label and every
# column before it is a feature.
X = np_arr[:, :-1]
y = np_arr[:, -1]


[[ 0.   0.   0.  ...  0.   0.   4.1]
 [ 0.   0.   0.  ...  0.   0.   6.6]
 [ 0.   0.   0.  ...  0.   0.   8.7]
 ...
 [ 1.   0.   0.  ...  0.   0.  17.7]
 [ 0.   4.   0.  ...  1.   0.  17.4]
 [ 0.   0.   0.  ...  0.   0.  12.1]]


## Banknote Authentication Dataset
### Cleaning up the data:

In [147]:
# Column names for the headerless banknote file; the last one is the class label.
banknote_columns = ["Variance", "Skewness", "Curtosis", "Entropy", "Label"]


def _to_signed_label(value):
    """Keep a label of 1 as-is and map every other value to -1."""
    if value == 1:
        return value
    return -1


banknote_df = pd.read_csv(
    'data_banknote_authentication.txt',
    header=None,
    names=banknote_columns,
)
banknote_df["Label"] = banknote_df["Label"].map(_to_signed_label)
banknote_arr = banknote_df.values

# Split the array column-wise: all but the last column are features,
# the last column is the label.
X, y = banknote_arr[:, :-1], banknote_arr[:, -1]

### 80/20 Training/Testing split
### Variable hyperparameters: 
    n_estimators: 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120
    max_depth: 1, 2, 3, 4, 5

In [149]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# NOTE(review): no random_state is passed to the split or the forest, so the
# selected hyperparameters can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Hyperparameter grid: 12 forest sizes (10..120 in steps of 10) x 5 depths (1..5).
n_list = list(range(10, 121, 10))
depth_list = list(range(1, 6))
parameters = {'n_estimators': n_list, 'max_depth': depth_list}

# Exhaustive search over the grid, scored with 5-fold cross-validation on the
# training portion only.
rfc = RandomForestClassifier()
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

# best_params_ is a dict keyed by parameter name; read the values by key.
# (Tuple-unpacking the dict would yield its keys, not the chosen values.)
best_params = search.best_params_
best_n = best_params['n_estimators']
best_depth = best_params['max_depth']


### 50/50 Training/Testing split
### Variable hyperparameters: 
    n_estimators: 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120
    max_depth: 1, 2, 3, 4, 5

In [150]:
# Split the samples evenly: 50% for training, 50% for testing.
# NOTE(review): no random_state is passed to the split or the forest, so the
# selected hyperparameters can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

# Hyperparameter grid: 12 forest sizes (10..120 in steps of 10) x 5 depths (1..5).
n_list = list(range(10, 121, 10))
depth_list = list(range(1, 6))
parameters = {'n_estimators': n_list, 'max_depth': depth_list}

# Exhaustive search over the grid, scored with 5-fold cross-validation on the
# training portion only.
rfc = RandomForestClassifier()
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

# best_params_ is a dict keyed by parameter name; read the values by key.
# (Tuple-unpacking the dict would yield its keys, not the chosen values.)
best_params = search.best_params_
best_n = best_params['n_estimators']
best_depth = best_params['max_depth']

### 20/80 Training/Testing split
### Variable hyperparameters: 
    n_estimators: 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120
    max_depth: 1, 2, 3, 4, 5

In [151]:
# Train on only 20% of the samples and hold out the remaining 80% for testing.
# NOTE(review): no random_state is passed to the split or the forest, so the
# selected hyperparameters can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

# Hyperparameter grid: 12 forest sizes (10..120 in steps of 10) x 5 depths (1..5).
n_list = list(range(10, 121, 10))
depth_list = list(range(1, 6))
parameters = {'n_estimators': n_list, 'max_depth': depth_list}

# Exhaustive search over the grid, scored with 5-fold cross-validation on the
# training portion only.
rfc = RandomForestClassifier()
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

# best_params_ is a dict keyed by parameter name; read the values by key.
# (Tuple-unpacking the dict would yield its keys, not the chosen values.)
best_params = search.best_params_
best_n = best_params['n_estimators']
best_depth = best_params['max_depth']

## TODO: Logistic Regression + KNN

In [None]:

# Fit three baseline classifiers on the current train split and report the
# held-out accuracy of each. A notebook cell only displays its final
# expression, so the scores are gathered into one dict and shown together
# rather than computed mid-cell and discarded.
linear_svc = svm.LinearSVC()
linear_svc.fit(X_train, y_train)

# `clf` ends the cell bound to the fitted logistic-regression model.
clf = LogisticRegression().fit(X_train, y_train)

neigh = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

baseline_scores = {
    'LinearSVC': linear_svc.score(X_test, y_test),
    'LogisticRegression': clf.score(X_test, y_test),
    'KNeighborsClassifier (n_neighbors=10)': neigh.score(X_test, y_test),
}
baseline_scores


## Unnecessary (disabled): feature histograms and normality test

In [None]:

# Disabled exploratory code, kept inside a triple-quoted string so none of it
# executes. If re-enabled it would draw a 2x2 grid of histograms for the first
# four columns of X and then print the p-values from stats.normaltest on
# X_train.
'''
fig, axs = plt.subplots(2, 2, figsize=(8,8))

axs[0, 0].hist(X[:, 0], color = 'mediumvioletred')
axs[0, 0].set_title('Distribution of Feature 1')
axs[0, 1].hist(X[:, 1], color = 'darkturquoise')
axs[0, 1].set_title('Distribution of Feature 2')
axs[1, 0].hist(X[:, 2], color = 'darkorchid')
axs[1, 0].set_title('Distribution of Feature 3')
axs[1, 1].hist(X[:, 3], color = 'chartreuse')
axs[1, 1].set_title('Distribution of Feature 4')

plt.show()
k2, p = stats.normaltest(X_train)
print(p)'''