In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the prepared dataset (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='../data/data_for_predictions')

In [3]:
# Preview the first five rows to check that the file loaded as expected.
data.head(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun
0,28.0,5.0,26,21.934911,12,E10,0,0,0
1,12.0,4.2,30,21.934911,13,E10,0,0,0
2,11.2,5.5,38,21.934911,15,E10,0,0,0
3,12.9,3.9,36,21.934911,14,E10,0,0,0
4,18.5,4.5,46,21.934911,15,E10,0,0,0


In [4]:
# NOTE(review): this shows the same five-row preview as the previous cell;
# `data` has not been modified in between, so this cell is a candidate for removal.
data.head(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun
0,28.0,5.0,26,21.934911,12,E10,0,0,0
1,12.0,4.2,30,21.934911,13,E10,0,0,0
2,11.2,5.5,38,21.934911,15,E10,0,0,0
3,12.9,3.9,36,21.934911,14,E10,0,0,0
4,18.5,4.5,46,21.934911,15,E10,0,0,0


## Train Test Split and scaling

In [5]:
# Feature frame: everything except the `gas_type` column, which is used as the
# prediction target further down.
data_num = data.drop(columns=['gas_type'])

In [6]:
# Fit the scaler on the training rows only. Fitting it on the whole frame would
# let the mean/variance of rows that later end up in the test set influence the
# preprocessing (data leakage).
# train_test_split's shuffle depends only on the number of rows and on
# random_state, so calling it here with the same test_size/random_state as the
# split cell below selects exactly the rows that become X_train there.
# Keep the two calls in sync if either parameter is changed.
train_rows, test_rows = train_test_split(data_num, test_size=0.3, random_state=12)
transformer = StandardScaler().fit(train_rows)

# Apply the train-fitted scaling to every row; the split cell below then carves
# X_train / X_test out of this frame.
data_scaled = transformer.transform(data_num)

# transform() hands back a plain array by default, so restore the column names.
X = pd.DataFrame(data_scaled, columns=data_num.columns)

In [7]:
# Target vector: the gas type recorded for each row.
y = data.loc[:, 'gas_type']

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=12)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each part, in the order: X_train, X_test, y_train, y_test.
for part in split_parts:
    print(part.shape)

(271, 8)
(117, 8)
(271,)
(117,)


## Using a Random Forest Classifier (RFC) to predict the gas type

In [12]:
# Baseline model: a shallow random forest (depth capped at 5, at least 4 samples
# per leaf) with a fixed seed for reproducibility.
rfc = RandomForestClassifier(
    random_state=12,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=4,
)
rfc.fit(X_train, y_train)

# Mean accuracy on the training set first, then on the held-out test set.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(rfc.score(features, labels))

# Predicted gas type for every test row.
y_pred = rfc.predict(X_test)


0.7749077490774908
0.6410256410256411


## Using Grid Search to try to get a better prediction

In [14]:
# Hyper-parameter values to try; the grid search evaluates every combination
# (3 x 3 x 3 = 27 candidates) with cross-validation on the training data.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# Give the search its own unfitted estimator instead of rebinding `rfc`, so the
# fitted baseline model from the previous section stays available for comparison.
rfc_grid = RandomForestClassifier(random_state=12)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search = GridSearchCV(estimator=rfc_grid, param_grid=param_grid, n_jobs=-1, verbose=20)

grid_search.fit(X_train, y_train)

# Best-scoring parameter combination found by the search.
best_params = grid_search.best_params_
print(best_params)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
{'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 200}


In [15]:
# Rebuild a forest with the parameters selected by the grid search (same seed as
# before) and train it on the full training set.
rfc_best = RandomForestClassifier(**best_params, random_state=12)
rfc_best.fit(X_train, y_train)

# Predicted gas type for every test row, now from the tuned model.
y_pred = rfc_best.predict(X_test)

# Mean accuracy on the training set first, then on the held-out test set.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(rfc_best.score(features, labels))

0.985239852398524
0.6068376068376068


## The train score rose sharply (0.77 → 0.99) while the test score did not improve (0.64 → 0.61), so the tuned model is overfitting. Therefore the best option is to keep the first model and try to improve it