## 1.1. Load dataset
#### You will need to read the data from the file (cover.csv). It contains 581012 samples and 54 attributes for each sample. The target column is Cover_Type.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# Read the cover-type table from disk into a DataFrame.
COVER_CSV_PATH = 'cover.csv'
data = pd.read_csv(COVER_CSV_PATH)


# 1. Voting Classifier
#### In this assignment, you are expected to build an ensemble of different models and train it on cover type dataset.

## 1.2. Prepare dataset
#### Split the data into train, validation, and test sets using train_test_split twice with 0.2 test_size. Your final distribution will be 371847-92962-116203.

In [None]:
# Separate the Cover_Type target from the feature columns.
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

# Two stratified 80/20 splits: first hold out the test set, then split the
# remainder into train and validation.
split_options = dict(test_size=0.2, random_state=42)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, stratify=y, **split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, stratify=y_train_temp, **split_options
)

# Standardise the features; the scaler statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Copies of the labels shifted down by one (presumably Cover_Type is 1-based and
# some models expect labels starting at 0 -- confirm against the data).
y_train_adjusted, y_val_adjusted, y_test_adjusted = (
    labels - 1 for labels in (y_train, y_val, y_test)
)

## 1.3. Modeling
#### Train 4-5 different classifiers on the data. You can train RandomForestClassifier, ExtraTreesClassifier, LinearSVC, SGDClassifier, MLPClassifier, etc. Evaluate their performances using validation set. Note that training may take quite a while (up to 30 minutes) depending on the hardware.

In [None]:
# Random Forest baseline: fitted on the unscaled features and the original labels.
rf_settings = dict(n_estimators=200, random_state=42, n_jobs=-1)
clf1 = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

# Score on the validation split.
rf_preds = clf1.predict(X_val)
rf_acc = accuracy_score(y_val, rf_preds)
print(f'Random Forest Accuracy: {rf_acc}')

Random Forest Accuracy: 0.9499580473741959


In [None]:
# Linear support vector classifier, fitted on the standardised features.
svc_settings = dict(C=1, max_iter=20000, random_state=42)
clf3 = LinearSVC(**svc_settings).fit(X_train_scaled, y_train)

# Score on the (standardised) validation split.
svc_preds = clf3.predict(X_val_scaled)
svc_acc = accuracy_score(y_val, svc_preds)
print(f'SVC Accuracy: {svc_acc}')

SVC Accuracy: 0.712269529485166


In [None]:
# Linear model trained with SGD on the standardised features; loss is set to
# 'log_loss' (logistic regression objective).
sgd_settings = dict(loss='log_loss', alpha=0.0001, max_iter=2000, random_state=42)
clf4 = SGDClassifier(**sgd_settings).fit(X_train_scaled, y_train)

# Score on the (standardised) validation split.
sgd_preds = clf4.predict(X_val_scaled)
sgd_acc = accuracy_score(y_val, sgd_preds)
print(f'SGD Classifier Accuracy: {sgd_acc}')

SGD Classifier Accuracy: 0.7150233428712808


In [None]:
# Multi-layer perceptron with two hidden layers (128 and 64 units), fitted on
# the standardised features.
mlp_settings = dict(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=42)
clf5 = MLPClassifier(**mlp_settings).fit(X_train_scaled, y_train)

# Score on the (standardised) validation split.
mlp_preds = clf5.predict(X_val_scaled)
mlp_acc = accuracy_score(y_val, mlp_preds)
print(f'MLP Classifier Accuracy: {mlp_acc}')

MLP Classifier Accuracy: 0.924054990211054


In [None]:
# XGBoost
# Fitted on the standardised features with the labels shifted down by one
# (y_train_adjusted). `use_label_encoder` is no longer passed: the installed
# xgboost ignores it and only emits a "Parameters ... are not used" warning.
clf6 = XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=6,
    random_state=42, eval_metric="mlogloss"
)
clf6.fit(X_train_scaled, y_train_adjusted)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# XGBoost accuracy on the validation split (labels in the shifted, zero-based form
# the model was fitted with).
xgb_val_preds = clf6.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val_adjusted, xgb_val_preds)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.8394612852563413


In [None]:
import joblib

# RAM was too tight to keep everything in one session, so the models are trained
# one at a time and each is persisted to its own .pkl file.
models_by_file = {
    'random_forest.pkl': clf1,
    'svc.pkl': clf3,
    'sgd.pkl': clf4,
    'mlp.pkl': clf5,
    'xgboost.pkl': clf6,
}
for pkl_path, fitted_model in models_by_file.items():
    joblib.dump(fitted_model, pkl_path)

print("Models have been saved successfully!")

Models have been saved successfully!


## 1.4. Ensembling
#### Create a hard and soft voting classifier using the models you have trained. You can use VotingClassifier. Check its performance on the validation set. Do you get better or worse performance than any of the individual classifiers?

In [None]:
import joblib

# The ExtraTrees model is deliberately not loaded: its pickle was about 4 GB,
# too large to use here.
# et_clf = joblib.load('extra_trees.pkl')

# Restore the individually trained models saved earlier in this notebook.
rf_clf, svc_clf, sgd_clf, mlp_clf, xgb_clf = (
    joblib.load(pkl_path)
    for pkl_path in ('random_forest.pkl', 'svc.pkl', 'sgd.pkl', 'mlp.pkl', 'xgboost.pkl')
)

In [None]:
# Soft voting ensemble: only the three probability-producing models take part.
soft_members = [('rf', rf_clf), ('xgb', xgb_clf), ('mlp', mlp_clf)]
voting_clf_soft = VotingClassifier(estimators=soft_members, voting='soft')

# Fit on the standardised features with the zero-based labels.
voting_clf_soft.fit(X_train_scaled, y_train_adjusted)

# Validation accuracy of the ensemble.
soft_preds = voting_clf_soft.predict(X_val_scaled)
soft_acc = accuracy_score(y_val_adjusted, soft_preds)
print(f'Soft Voting Classifier Validation Accuracy: {soft_acc}')

Parameters: { "use_label_encoder" } are not used.



Soft Voting Classifier Validation Accuracy: 0.9377917858910092


In [None]:
# Hard (majority) voting ensemble over all five models.
hard_members = [
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('svc', svc_clf),
    ('sgd', sgd_clf),
    ('mlp', mlp_clf),
]
voting_clf_hard = VotingClassifier(estimators=hard_members, voting='hard')

# Fit on the standardised features with the zero-based labels.
voting_clf_hard.fit(X_train_scaled, y_train_adjusted)

# Validation accuracy of the ensemble.
hard_preds = voting_clf_hard.predict(X_val_scaled)
hard_acc = accuracy_score(y_val_adjusted, hard_preds)
print(f'Hard Voting Classifier Validation Accuracy: {hard_acc}')

Parameters: { "use_label_encoder" } are not used.



Hard Voting Classifier Validation Accuracy: 0.853649878444956


In [None]:
# Score both fitted ensembles on the held-out test split (standardised features,
# zero-based labels). Results are reported hard-voting first, then soft-voting.
final_preds_hard = voting_clf_hard.predict(X_test_scaled)
final_preds_soft = voting_clf_soft.predict(X_test_scaled)

final_acc_hard = accuracy_score(y_test_adjusted, final_preds_hard)
final_acc_soft = accuracy_score(y_test_adjusted, final_preds_soft)

print(f'Final Test Set Accuracy (Hard Voting): {final_acc_hard}')
print(f'Final Test Set Accuracy (Soft Voting): {final_acc_soft}')

Final Test Set Accuracy (Hard Voting): 0.8552360954536458
Final Test Set Accuracy (Soft Voting): 0.9381857611249279


#### Check if any of the models hurts the performance of the ensemble. You can access the estimators of the ensemble using estimators_ attribute. If so, drop those using set_params and reevaluate.

In [None]:
# NOTE: refitting the ensembles takes about an hour each time, so the votes are
# reproduced by hand from the individually trained (pickled) models instead.
import numpy as np
from scipy.stats import mode
from sklearn.metrics import accuracy_score

# Every vote below is expressed in the label space of y_val_adjusted (y_val - 1).
#
# Each model must be queried with the same kind of features it was fitted on:
# the pickled random forest was fitted on the *unscaled* X_train, while the
# other models were fitted on the standardised features.

# For soft voting, only the 3 probability-producing models are included.
rf_probs = rf_clf.predict_proba(X_val)  # Random Forest (unscaled features)
xgb_probs = xgb_clf.predict_proba(X_val_scaled)  # XGBoost
mlp_probs = mlp_clf.predict_proba(X_val_scaled)  # MLP

# For hard voting all models are included. SVC and SGD were fitted on the
# original labels (y_train), so their predictions are shifted by one to match
# y_val_adjusted before they are mixed with the other votes.
svc_hard_preds = svc_clf.predict(X_val_scaled) - 1  # SVC
sgd_hard_preds = sgd_clf.predict(X_val_scaled) - 1  # SGD


all_probs_soft = [rf_probs, xgb_probs, mlp_probs]
model_names_soft = ['rf', 'xgb', 'mlp']

# Column i of each probability matrix is taken to be adjusted label i
# (assumes every model saw the same contiguous set of classes).
all_probs_hard = [
    np.argmax(rf_probs, axis=1),  # Random Forest (hard predictions from probs)
    np.argmax(xgb_probs, axis=1),  # XGBoost (hard predictions from probs)
    np.argmax(mlp_probs, axis=1),  # MLP (hard predictions from probs)
    svc_hard_preds,  # SVC
    sgd_hard_preds,  # SGD
]

# Soft Voting (only 3 models): average the probabilities, pick the best class.
combined_probs_soft = np.mean(all_probs_soft, axis=0)
soft_preds = np.argmax(combined_probs_soft, axis=1)
soft_acc = accuracy_score(y_val_adjusted, soft_preds)
print(f"Soft Voting Validation Accuracy (3 models): {soft_acc}")

# Hard Voting (all models): most frequent label per validation row.
all_hard_preds = np.array(all_probs_hard).T  # rows = samples, columns = models
hard_preds = mode(all_hard_preds, axis=1).mode.flatten()
hard_acc = accuracy_score(y_val_adjusted, hard_preds)
print(f"Hard Voting Validation Accuracy (all models): {hard_acc}")




Soft Voting Validation Accuracy (3 models): 0.9203760676405413
Hard Voting Validation Accuracy (all models): 0.7160129945569157


In [None]:
from scipy.stats import mode
from sklearn.metrics import accuracy_score
import numpy as np

# Leave-one-model-out check: how does each ensemble score on the validation set
# when a single member is removed?
#
# All votes are expressed in the label space of y_val_adjusted (y_val - 1).
# The probabilities are recomputed here so the cell does not depend on arrays
# left over from another cell. Each model is queried with the features it was
# fitted on: the pickled random forest on the unscaled X_val, the others on
# the standardised X_val_scaled.
soft_probs = [
    rf_clf.predict_proba(X_val),  # Random Forest (unscaled features)
    xgb_clf.predict_proba(X_val_scaled),  # XGBoost
    mlp_clf.predict_proba(X_val_scaled),  # MLP
]
soft_model_names = ['rf', 'xgb', 'mlp']

# Column i of each probability matrix is taken to be adjusted label i.
# SVC and SGD were fitted on the original labels (y_train), so their predictions
# are shifted by one before being mixed with the other votes.
hard_preds = [
    np.argmax(soft_probs[0], axis=1),  # Random Forest (hard predictions from probs)
    np.argmax(soft_probs[1], axis=1),  # XGBoost (hard predictions from probs)
    np.argmax(soft_probs[2], axis=1),  # MLP (hard predictions from probs)
    svc_clf.predict(X_val_scaled) - 1,  # SVC (hard predictions)
    sgd_clf.predict(X_val_scaled) - 1,  # SGD (hard predictions)
]
hard_model_names = ['rf', 'xgb', 'mlp', 'svc', 'sgd']
hard_preds = np.array(hard_preds).T  # rows = samples, columns = models

# Evaluating Soft Voting Without Each Model
print("\nSoft Voting Evaluation Without Each Model:")
for i, name in enumerate(soft_model_names):
    # Average the probabilities of every model except the i-th one.
    reduced_soft_probs = [p for j, p in enumerate(soft_probs) if j != i]
    combined_reduced_probs = np.mean(reduced_soft_probs, axis=0)
    soft_reduced_preds = np.argmax(combined_reduced_probs, axis=1)
    soft_reduced_acc = accuracy_score(y_val_adjusted, soft_reduced_preds)
    print(f"Soft Voting Validation Accuracy without {name}: {soft_reduced_acc}")

# Evaluating Hard Voting Without Each Model
print("\nHard Voting Evaluation Without Each Model:")
for i, name in enumerate(hard_model_names):
    # Drop the i-th model's column and take the majority of the remaining votes.
    # With four voters ties can occur; scipy's mode then returns the smallest label.
    reduced_hard_preds = np.delete(hard_preds, i, axis=1)
    hard_reduced_preds = mode(reduced_hard_preds, axis=1).mode.flatten()
    hard_reduced_acc = accuracy_score(y_val_adjusted, hard_reduced_preds)
    print(f"Hard Voting Validation Accuracy without {name}: {hard_reduced_acc}")


Soft Voting Evaluation Without Each Model:
Soft Voting Validation Accuracy without rf: 0.9234525935328414
Soft Voting Validation Accuracy without xgb: 0.9234310793657624
Soft Voting Validation Accuracy without mlp: 0.8194315957057723

Hard Voting Evaluation Without Each Model:
Hard Voting Validation Accuracy without rf: 0.8148705922850197
Hard Voting Validation Accuracy without xgb: 0.4925130698565005
Hard Voting Validation Accuracy without mlp: 0.49274972569436976
Hard Voting Validation Accuracy without svc: 0.8760676405412965
Hard Voting Validation Accuracy without sgd: 0.8756481142832555


# 2. Random Forest
#### In this assignment, you are expected to build a random forest that classifies a toy dataset.

## 2.1. Load dataset
#### You will need to read the data from the file (data.csv). It contains 15000 samples and two features for each sample.

In [None]:
import pandas as pd

# data.csv has no header row, so the three columns are named explicitly.
data = pd.read_csv('data.csv', header=None)
data.columns = ['x1', 'x2', 'z']

# Parse the target as float and divide by 100 (presumably the raw values are
# 0 / 100 so this yields 0 / 1 class labels -- confirm against the file).
data = data.assign(z=lambda frame: frame['z'].astype(float) / 100)


## 2.2. Prepare dataset
#### Split the data into train and test sets with 0.2 test size.

In [None]:
from sklearn.model_selection import train_test_split

# Features are the two coordinate columns; the target is 'z'.
feature_columns = ['x1', 'x2']
X, y = data[feature_columns], data['z']

# Single 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 2.3. Modeling
#### Train a DecisionTreeClassifier on the data. Use GridSearchCV to tune the hyperparameters.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored for the decision tree.
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator; the seed keeps the fitted trees reproducible.
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)


Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


#### Train the best model on the whole train set (do you need to?) and evaluate the model on the test set.

In [None]:
from sklearn.metrics import accuracy_score

# Take the best estimator exposed by the grid search and score it on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Test set accuracy with the best model:", accuracy)


Test set accuracy with the best model: 0.8556666666666667


#### Generate 1,200 subsets of the training set, each containing 100 randomly chosen instances. You can use ShuffleSplit.

In [None]:
from sklearn.model_selection import ShuffleSplit

# Splitter that yields 1,200 random subsets of 100 training rows each.
N_SUBSETS = 1200
SUBSET_SIZE = 100
ss = ShuffleSplit(n_splits=N_SUBSETS, train_size=SUBSET_SIZE, random_state=42)


#### Train one tree on each subset, using the best model you previously found. Evaluate the performance of the trees using the test set. Did you get lower or higher accuracy? Why?

In [None]:
import numpy as np
from tqdm import tqdm

# Fit one decision tree per ShuffleSplit subset, using the tuned hyperparameters,
# and record each tree's accuracy on the full test set.
tuned_params = grid_search.best_params_
accuracies = []

for subset_rows, _ in tqdm(ss.split(X_train)):
    X_subset, y_subset = X_train.iloc[subset_rows], y_train.iloc[subset_rows]

    clf = DecisionTreeClassifier(random_state=42, **tuned_params).fit(X_subset, y_subset)

    y_pred = clf.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

print("Average accuracy over 1,200 trees:", np.mean(accuracies))


1200it [00:07, 155.99it/s]

Average accuracy over 1,200 trees: 0.7957122222222223





#### For each instance in the test set, predict its class using 1200 trees, and keep only the most frequent prediction. You can use mode from scipy.stats. Evaluate these predictions. Did you get lower or higher accuracy?

In [None]:
from scipy.stats import mode

# Collect the test-set predictions of one tree per ShuffleSplit subset, each
# fitted with the tuned hyperparameters.
all_predictions = []

# `total` lets tqdm draw a real progress bar; ss.split() is a generator with no length.
for train_index, _ in tqdm(ss.split(X_train), total=ss.get_n_splits()):
    X_subset = X_train.iloc[train_index]
    y_subset = y_train.iloc[train_index]

    clf = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
    clf.fit(X_subset, y_subset)

    y_pred = clf.predict(X_test)
    all_predictions.append(y_pred)

all_predictions = np.array(all_predictions)  # Shape: (number of trees, number of test instances)

# Majority vote per test instance, computed once. np.ravel flattens the result
# whether or not this scipy version keeps the reduced axis in `.mode`.
ensemble_predictions = np.ravel(mode(all_predictions, axis=0).mode)

ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
print("Ensemble test set accuracy:", ensemble_accuracy)

1200it [00:06, 188.88it/s]


Ensemble test set accuracy: 0.857
