In [54]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import preprocessing

In [55]:
data = pd.read_csv('data.csv', header =None)

In [56]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,A0,77516,B0,13,C0,D0,E0,F0,G0,2174,0,40,H0,SMALL
1,50,A1,83311,B0,13,C1,D1,E1,F0,G0,0,0,13,H0,SMALL
2,38,A2,215646,B1,9,C2,D2,E0,F0,G0,0,0,40,H0,SMALL
3,53,A2,234721,B2,7,C1,D2,E1,F1,G0,0,0,40,H0,SMALL
4,28,A2,338409,B0,13,C1,D3,E2,F1,G1,0,0,40,H1,SMALL


# Normalize data

Split data into categorical and numerical so that we can normalize the numerical data.  

In [57]:
x = data.iloc[:,[0,2,4,10,11,12]].values.astype(float)
y = data.iloc[:,[1,3,5,6,7,8,9,13,14]]

In [58]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_numeric = pd.DataFrame(x_scaled)

In [59]:
cats = [1,3,5,6,7,8,9,13]

In [60]:
df_cats = pd.get_dummies(y, columns = cats, drop_first = True)

In [61]:
df_numeric.head()

Unnamed: 0,0,1,2,3,4,5
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


In [62]:
df_cats.head()

Unnamed: 0,14,1_A0,1_A1,1_A2,1_A3,1_A4,1_A6,1_A7,1_A8,3_B1,...,13_H37,13_H38,13_H39,13_H40,13_H41,13_H5,13_H6,13_H7,13_H8,13_H9
0,SMALL,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,SMALL,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,SMALL,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,SMALL,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SMALL,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
data_norm = df_numeric.merge(df_cats, left_index= True, right_index = True)

In [64]:
data_norm.head()

Unnamed: 0,0,1,2,3,4,5,14,1_A0,1_A1,1_A2,...,13_H37,13_H38,13_H39,13_H40,13_H41,13_H5,13_H6,13_H7,13_H8,13_H9
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959,SMALL,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.452055,0.048238,0.8,0.0,0.0,0.122449,SMALL,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959,SMALL,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0.493151,0.151068,0.4,0.0,0.0,0.397959,SMALL,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0.150685,0.221488,0.8,0.0,0.0,0.397959,SMALL,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Get dummy variables of all categorical features.  It may also be of interest to convert these to numeric, but for now I'm just going to make them binary features.

In [65]:
from sklearn.model_selection import train_test_split

In [66]:
X = data_norm.drop(14, axis = 1)
Y = data_norm[14]

In [67]:
X.head()

Unnamed: 0,0,1,2,3,4,5,1_A0,1_A1,1_A2,1_A3,...,13_H37,13_H38,13_H39,13_H40,13_H41,13_H5,13_H6,13_H7,13_H8,13_H9
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.452055,0.048238,0.8,0.0,0.0,0.122449,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0.493151,0.151068,0.4,0.0,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.150685,0.221488,0.8,0.0,0.0,0.397959,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
X_train, X_test , Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 101)

Get base estimate if we just guess the more common size, small vs large

In [69]:
count_small = Y_test[Y_test == "SMALL"].count()
count_large = Y_test[Y_test == "LARGE"].count()
count = Y_test.count()

If we always guessed small with the test data, our accuracy would be equal to 0.745165

In [70]:
print (count_small/count)

0.7611833350394104


##### Update data.csv to add missing categories

In [71]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,A0,77516,B0,13,C0,D0,E0,F0,G0,2174,0,40,H0,SMALL
1,50,A1,83311,B0,13,C1,D1,E1,F0,G0,0,0,13,H0,SMALL
2,38,A2,215646,B1,9,C2,D2,E0,F0,G0,0,0,40,H0,SMALL
3,53,A2,234721,B2,7,C1,D2,E1,F1,G0,0,0,40,H0,SMALL
4,28,A2,338409,B0,13,C1,D3,E2,F1,G1,0,0,40,H1,SMALL


In [72]:
df = pd.get_dummies(data, columns = [1,3,5,6,7,8,9,13], drop_first = True)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Columns: 101 entries, 0 to 13_H9
dtypes: int64(6), object(1), uint8(94)
memory usage: 4.7+ MB


In [74]:
df.insert(11,'1_A5', 0)
df.insert(39,'6_D11', 0)
df.insert(96,'13_H4', 0)

In [75]:
X_final = df.drop(14, axis = 1)
Y_final = df[14]

In [76]:
X_train_final, X_test_final , Y_train_final, Y_test_final = train_test_split(X_final,Y_final, test_size = 0.3, random_state = 101)

# Random Forest

### Hyperparameter Tuning

WARNING: This chunk took 30 minutes to run with a high end computer. If it's run with a computer using a cpu worse than an Intel i7, the run time might end up being 2 hours

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


rfc = RandomForestClassifier(random_state = 42)

n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(rfc, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train_final, Y_train_final)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 27.5min finished


In [25]:
bestF.get_params()

{'cv': 3,
 'error_score': 'raise-deprecating',
 'estimator__bootstrap': True,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 'warn',
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': 42,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, 

In [77]:
# rfc_final =RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                         max_depth=None, max_features='auto', max_leaf_nodes=None,
#                         min_impurity_decrease=0.0, min_impurity_split=None,
#                         min_samples_leaf=1, min_samples_split=2,
#                         min_weight_fraction_leaf=0.0, n_estimators='warn',
#                         n_jobs=None, oob_score=False, random_state=42, verbose=0,
#                         warm_start=False)
# rfc_final.fit(X_train_nn, Y_train_nn)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [78]:
# rfc_pred = rfc_final.predict(X_test_nn)

In [None]:
rfc_pred = bestF.predict(X_test_final)

In [79]:
from sklearn.metrics import classification_report, confusion_matrix

In [28]:
print(confusion_matrix(Y_test_final,rfc_pred))

[[1461  872]
 [ 420 7016]]


In [80]:
print(classification_report(Y_test_final,rfc_pred))

              precision    recall  f1-score   support

       LARGE       0.69      0.68      0.69      2333
       SMALL       0.90      0.90      0.90      7436

    accuracy                           0.85      9769
   macro avg       0.79      0.79      0.79      9769
weighted avg       0.85      0.85      0.85      9769



In [81]:
#error rate
print ((confusion_matrix(Y_test_final,rfc_pred)[0][1]+confusion_matrix(Y_test_final,rfc_pred)[1][0])/count)

0.14975944313645204


# Classify Futures

In [31]:
futures = pd.read_csv('futures.csv',header=None)


In [32]:
futures_final = pd.get_dummies(futures, columns = [1,3,5,6,7,8,9,13], drop_first = True)

In [33]:
futures_final.insert(8,'1_A2',0)
futures_final.insert(44,'6_D3',0)
futures_final.insert(62,'13_H1',0)
futures_final.insert(97,'13_H41',0)

In [34]:
futures_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7684 entries, 0 to 7683
Columns: 103 entries, 0 to 13_H9
dtypes: int64(10), uint8(93)
memory usage: 1.3 MB


In [36]:
# Train using the final model on normalized data
futures_predictions = bestF.predict(futures_final)

In [37]:
futures_predictions_df = pd.DataFrame(futures_predictions)

# Futures with predicted labels

In [38]:
futures_with_pred = futures.merge(futures_predictions_df,left_index = True, right_index = True)

In [39]:
futures_with_pred.head()

Unnamed: 0,0_x,1,2,3,4,5,6,7,8,9,10,11,12,13,0_y
0,38,A0,89814,B1,9,C1,D1,E1,F1,G0,0,0,50,H0,SMALL
1,28,A1,336951,B2,12,C1,D2,E1,F1,G0,0,0,40,H0,SMALL
2,44,A0,160323,B3,10,C1,D0,E1,F0,G0,7688,0,40,H0,LARGE
3,18,?,103497,B3,10,C0,?,E0,F1,G1,0,0,30,H0,SMALL
4,63,A3,104626,B5,15,C1,D5,E1,F1,G0,3103,0,32,H0,LARGE


In [40]:
from pandas import DataFrame

In [41]:
futures_with_pred.to_csv(r'M:\ISE364\ISEe364FinalProject\futuresFinal.csv', index = None, header = True )

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'