In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from scipy import stats
import math
import matplotlib.pyplot as plt

## Urban Traffic in Sao Paulo Dataset
### Cleaning up the data:

In [154]:
df = pd.read_csv('Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', delimiter=';')

df["Slowness in traffic (%)"] = df["Slowness in traffic (%)"].map(lambda x: x.replace(',', '.')) 
time_range = df["Hour (Coded)"].max() - df["Hour (Coded)"].min()
df["Time of Day"] = df["Hour (Coded)"].map(lambda x: 1 if x <= math.ceil(time_range/2) else -1)
df.drop(columns = ["Hour (Coded)"], inplace = True)
df["Slowness in traffic (%)"] = df["Slowness in traffic (%)"].astype(float)
np_arr = df.values

X = np_arr[:, :-1]
y = np_arr[:, -1]


## Banknote Authentication Dataset
### Cleaning up the data:

In [2]:
banknote_df = pd.read_csv('data_banknote_authentication.txt', header = None,
            names = ["Variance", "Skewness", "Curtosis", "Entropy", "Label"])
banknote_df["Label"] = banknote_df["Label"].map(lambda x: x if x==1 else -1) 
banknote_arr = banknote_df.values

X = banknote_arr[:, :-1]
y = banknote_arr[:, -1]

### 80/20 Training/Testing split
### Variable hyperparameters: 
    n_estimators: 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120
    max_depth: 1, 2, 3, 4, 5

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

n_list = list(range(10,121,10))
depth_list = list(range(1,6))
parameters = {'n_estimators': n_list, 'max_depth': depth_list}

rfc = RandomForestClassifier()
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

best_depth, best_n = search.best_params_
best_params = search.best_params_
best_n = best_params['n_estimators']
best_depth = best_params['max_depth']


### 50/50 Training/Testing split
### Variable hyperparameters: 
    n_estimators: 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120
    max_depth: 1, 2, 3, 4, 5

### 20/80 Training/Testing split
### Variable hyperparameters: 
    max_features: 1, 2, 3
    

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

set_size = [1, 2, 3]
num_trees = 1024
parameters = {'max_features': set_size}

rfc = RandomForestClassifier(n_estimators = num_trees)
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

best_params = search.best_params_
print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')




X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

rfc = RandomForestClassifier(n_estimators = num_trees)
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

best_params = search.best_params_
print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

rfc = RandomForestClassifier(n_estimators = num_trees)
search = GridSearchCV(rfc, parameters, cv=5)
search.fit(X_train, y_train)

best_params = search.best_params_
print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


## TODO: Logistic Regression + KNN

In [3]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

kernels = ["linear", "poly", "rbf"]
c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3]
gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
degree = [3]
parameters = {'kernel': kernels, 'C' : c, 'gamma' : gamma, 'degree' : degree}
svm = SVC()
search = GridSearchCV(svm, parameters, cv=5, return_train_score=True)
search.fit(X_train, y_train)
best_params = search.best_params_


print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

kernels = ["linear", "poly", "rbf"]
c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3]
gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
degree = [3]
parameters = {'kernel': kernels, 'C' : c, 'gamma' : gamma, 'degree' : degree}
svm = SVC()
search = GridSearchCV(svm, parameters, cv=5, return_train_score=True)
search.fit(X_train, y_train)
best_params = search.best_params_


print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80)

kernels = ["linear", "poly", "rbf"]
c = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 0.001, 0.1, 1, 10, 100, 1e3]
gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
degree = [3]
parameters = {'kernel': kernels, 'C' : c, 'gamma' : gamma, 'degree' : degree}
svm = SVC()
search = GridSearchCV(svm, parameters, cv=5, return_train_score=True)
search.fit(X_train, y_train)
best_params = search.best_params_


print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')
















#clf.fit(X_train, y_train)
#clf.score(X_test, y_test)


#clf = LogisticRegression().fit(X_train, y_train)


#neigh = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
#neigh.score(X_test, y_test)


{'C': 1, 'degree': 3, 'gamma': 0.05, 'kernel': 'rbf'}
0.8159509033134099
0.8147668572691362

{'C': 1, 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf'}
0.8079343428483127
0.8057359307359307

{'C': 10, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'}
0.7953714901306643
0.786012497235125



In [175]:
#neigh = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
#neigh.score(X_test, y_test)

{'C': 100, 'degree': 3, 'gamma': 0.005, 'kernel': 'rbf'}
0.8133310062014919
0.8030994249059942


In [None]:

c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]

## Unnecessary

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)


c = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3, 1e4]
penalty = ['l2', 'l1']
parameters = {'C' : c, 'penalty' : penalty}
logr = LogisticRegression()
search = GridSearchCV(logr, parameters, cv=5, return_train_score=True)
search.fit(X_train, y_train)
best_params = search.best_params_


print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)

logr = LogisticRegression()
search = GridSearchCV(logr, parameters, cv=5, return_train_score=True)
search.fit(X_train, y_train)
best_params = search.best_params_


print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

logr = LogisticRegression()
search = GridSearchCV(logr, parameters, cv=5, return_train_score=True)
search.fit(X_train, y_train)
best_params = search.best_params_

print(best_params)
print(np.mean(search.cv_results_['mean_train_score']))
print(np.mean(search.cv_results_['mean_test_score']))
print('')


{'C': 1, 'penalty': 'l1'}
0.8385389087982152
0.8376691676600522

{'C': 10, 'penalty': 'l1'}
0.8268676943494462
0.8250168199147793

{'C': 1, 'penalty': 'l2'}
0.8067841747293804
0.8003930376193151

