# 1. Load dataset
#### In this assignment, you are expected to build an ensemble of different models and train it on the Cover Type dataset.
#### You will need to read the data from the file (cover.csv). It contains 581012 samples and 54 attributes for each sample.

In [None]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier

from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Read the cover type data from cover.csv into a DataFrame.
df = pd.read_csv('cover.csv')

In [None]:
# Separate the target column from the feature columns.
y = df['Cover_Type']
X = df.drop(columns='Cover_Type')

# 2. Prepare dataset
#### Split the data into train, validation, and test sets.

In [None]:
# Stratified shuffle splits are used so the classes are split evenly.
# Two splitters are needed: one to carve off the test set, and one to carve
# the validation set out of the remaining data.
sss_test = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=9,
)
sss_val = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=9,
)

In [None]:
# Split the full data into (train + validation) and test row positions.
# The splitter yields a single split, so take the first index pair.
train_index, test_index = next(sss_test.split(X, y))

In [None]:
# Select the (train + validation) and test partitions by row position.
X_train_val = X.iloc[train_index]
y_train_val = y.iloc[train_index]
X_test = X.iloc[test_index]
y_test = y.iloc[test_index]

In [None]:
# Split the (train + validation) partition into train and validation
# row positions. The splitter yields a single split, so take the first pair.
tr_index, val_index = next(sss_val.split(X_train_val, y_train_val))

In [None]:
# Select the final train and validation partitions by row position.
X_train = X_train_val.iloc[tr_index]
y_train = y_train_val.iloc[tr_index]
X_val = X_train_val.iloc[val_index]
y_val = y_train_val.iloc[val_index]

In [None]:
# Print the shape of the train, validation and test feature sets.
print(f"X_train: {X_train.shape}\nX_val: {X_val.shape}\nX_test: {X_test.shape}")

# 3. Modeling
#### Train different classifiers on the data. You can train RandomForestClassifier, ExtraTreesClassifier, LinearSVC, SGDClassifier, MLPClassifier, etc. Evaluate their performance using validation set.

***Random Forest classifier***

In [None]:
# Random forest with 200 trees. The seed is pinned (same value as the data
# splits) so the fitted model and its scores are reproducible across runs.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=9)

In [None]:
# Fit the random forest on the training split.
rf_clf.fit(X_train, y_train)

In [None]:
# Score the random forest on the validation split.
rf_clf.score(X_val, y_val)

In [None]:
# Keep the validation predictions; they are reused later as a stacking feature.
rf_preds = rf_clf.predict(X_val)

***Extra Trees classifier***

In [None]:
# Extra-trees ensemble with 200 trees. The seed is pinned (same value as the
# data splits) so the fitted model and its scores are reproducible across runs.
et_clf = ExtraTreesClassifier(n_estimators=200, random_state=9)

In [None]:
# Fit the extra-trees model on the training split.
et_clf.fit(X_train, y_train)

In [None]:
# Score the extra-trees model on the validation split.
et_clf.score(X_val, y_val)

In [None]:
# Keep the validation predictions; they are reused later as a stacking feature.
et_preds = et_clf.predict(X_val)

***LinearSVC classifier***

In [None]:
# Linear SVM with default hyper-parameters. The seed is pinned (same value as
# the data splits) so repeated runs give the same model.
# NOTE(review): the features are passed in unscaled; linear models usually
# benefit from standardisation - consider adding a scaler in front.
svc_clf = LinearSVC(random_state=9)

In [None]:
# Fit the linear SVM on the training split.
svc_clf.fit(X_train, y_train)

In [None]:
# Score the linear SVM on the validation split.
svc_clf.score(X_val, y_val)

In [None]:
# Keep the validation predictions; they are reused later as a stacking feature.
svc_preds = svc_clf.predict(X_val)

***SGD classifier***

In [None]:
# SGD-trained linear classifier with default hyper-parameters. The seed is
# pinned (same value as the data splits) so repeated runs give the same model.
# NOTE(review): the features are passed in unscaled; SGD is sensitive to
# feature scale - consider adding a scaler in front.
sgd_clf = SGDClassifier(random_state=9)

In [None]:
# Fit the SGD classifier on the training split.
sgd_clf.fit(X_train, y_train)

In [None]:
# Score the SGD classifier on the validation split.
sgd_clf.score(X_val, y_val)

In [None]:
# Keep the validation predictions; they are reused later as a stacking feature.
sgd_preds = sgd_clf.predict(X_val)

***MLP classifier***

In [None]:
# Multi-layer perceptron with default hyper-parameters. The seed is pinned
# (same value as the data splits) so weight initialisation, and therefore the
# reported scores, are reproducible across runs.
# NOTE(review): the features are passed in unscaled; neural networks are
# sensitive to feature scale - consider adding a scaler in front.
mlp_clf = MLPClassifier(random_state=9)

In [None]:
# Fit the MLP on the training split.
mlp_clf.fit(X_train, y_train)

In [None]:
# Score the MLP on the validation split.
mlp_clf.score(X_val, y_val)

In [None]:
# Keep the validation predictions; they are reused later as a stacking feature.
mlp_preds = mlp_clf.predict(X_val)

# 4. Ensembling
#### Creating a hard and a soft voting classifier using the models we have trained. We will use VotingClassifier and check its performance on the validation set.

***Hard Ensembling***

In [None]:
# Hard-voting ensemble over the random forest, extra trees, SGD and MLP
# models (the LinearSVC model is not included).
eh_clf = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('et', et_clf),
        ('sgd', sgd_clf),
        ('mlp', mlp_clf),
    ],
    voting='hard',
)

In [None]:
# Fit the hard-voting ensemble on the training split.
eh_clf.fit(X_train, y_train)

In [None]:
# Score the hard-voting ensemble on the validation split.
eh_clf.score(X_val, y_val)

In [None]:
# Keep the validation predictions; they are reused later as a stacking feature.
eh_preds = eh_clf.predict(X_val)

***Soft Ensembling***

In [None]:
# Soft-voting ensemble over the random forest, extra trees and MLP models.
# The SGD and LinearSVC models are left out - presumably because soft voting
# needs per-class probabilities; confirm before adding them.
es_clf = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('et', et_clf),
        ('mlp', mlp_clf),
    ],
    voting='soft',
)

In [None]:
# Fit the soft-voting ensemble on the training split.
es_clf.fit(X_train, y_train)

In [None]:
# Score the soft-voting ensemble on the validation split.
es_clf.score(X_val, y_val)

In [None]:
# Validation predictions of the soft-voting ensemble. Call predict() as the
# other *_preds cells do, rather than aliasing the classifier object.
es_preds = es_clf.predict(X_val)

#### Making predictions on the validation set using trained models and creating a new training set out of those predictions: each training example will now have predictions of all classifiers as features. Train a new classifier on this new training set. Compare the performances.

In [None]:
# Build the meta-feature matrix: one row per validation sample and one column
# per model. column_stack is required here - np.hstack on 1-D prediction
# vectors would join them end to end into a single long vector instead.
all_preds = np.column_stack([rf_preds, et_preds, svc_preds, sgd_preds, mlp_preds, eh_preds])

In [None]:
# Wrap the stacked validation predictions in a DataFrame with one labelled
# column per source model, in the same order in which they were stacked.
pred_df = pd.DataFrame(
    all_preds,
    columns=['rf_preds', 'et_preds', 'svc_preds', 'sgd_preds', 'mlp_preds', 'eh_preds']
)

In [None]:
# Preview the first rows of the meta-feature table.
pred_df.head()

In [None]:
# Check the dimensions of the meta-feature table.
pred_df.shape

In [None]:
# Meta-classifier trained on the base models' predictions. The seed is pinned
# (same value as the data splits) so the final score is reproducible.
final_clf = RandomForestClassifier(n_estimators=200, random_state=9)

In [None]:
# Fit the meta-classifier on the validation-set predictions and labels.
final_clf.fit(pred_df, y_val)

In [None]:
# Check the dimensions of the test feature set.
X_test.shape

In [None]:
# Predict the test set with every base model and the hard-voting ensemble,
# in the same order as the columns of the meta-feature table.
rf_t, et_t, svc_t, sgd_t, mlp_t, eh_t = (
    model.predict(X_test)
    for model in (rf_clf, et_clf, svc_clf, sgd_clf, mlp_clf, eh_clf)
)

In [None]:
# Assemble the test-set meta-features: each 1-D prediction vector becomes
# one column of a 2-D array.
test_preds = np.column_stack([rf_t, et_t, svc_t, sgd_t, mlp_t, eh_t])

In [None]:
# Score the meta-classifier on the test-set predictions. final_clf was fitted
# on a DataFrame, so the array is wrapped with the same column labels to keep
# the feature names consistent between fit and score.
final_clf.score(pd.DataFrame(test_preds, columns=pred_df.columns), y_test)