In [1]:
import pandas as pd

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
covertype = fetch_ucirepo(id=31) 
  
# data (as pandas dataframes) 
X = covertype.data.features 
y = covertype.data.targets 

In [5]:
# metadata 
print(covertype.metadata) 
  
# variable information 
print(covertype.variables) 

{'uci_id': 31, 'name': 'Covertype', 'repository_url': 'https://archive.ics.uci.edu/dataset/31/covertype', 'data_url': 'https://archive.ics.uci.edu/static/public/31/data.csv', 'abstract': 'Classification of pixels into 7 forest cover types based on attributes such as elevation, aspect, slope, hillshade, soil-type, and more.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 581012, 'num_features': 54, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['Cover_Type'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C50K5N', 'creators': ['Jock Blackard'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting forest cover type from cartographic variables only (no remotely sensed data).  The actual forest cover type for a given observation (30 x 30 meter cell) was determined from

In [10]:
def simple_heuristic(X):
    X_step = X + 1
    y = pd.DataFrame(X_step.product(axis='columns')%7 + 1)
    return y

In [18]:
import numpy as np
sh_y = np.array(simple_heuristic(X))
score_model_0 = (sh_y == y.values).sum() / len(y)
print(f"Model_0 score: {score_model_0}")
sh_y

Model_0 score: 0.14624827025947829


array([[6],
       [5],
       [4],
       ...,
       [5],
       [5],
       [6]])

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
len(X_train), len(X_test)

(464809, 116203)

In [43]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
model_1 = AdaBoostClassifier(algorithm="SAMME", random_state=42)
model_2 = RandomForestClassifier(max_depth=15, max_features=10, random_state=42)

model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)

score_model_1 = model_1.score(X_test, y_test)
score_model_2 = model_2.score(X_test, y_test)

print(f"Model_1 score: {score_model_1}")
print(f"Model_2 score: {score_model_2}")

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Model_1 score: 0.6309819884168223
Model_2 score: 0.8401590320387597


In [44]:
import torch
from torch import nn

class NeuralNetworkModel(nn.Module):
    def __init__(self, hidden_units):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=54, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.Linear(in_features=hidden_units, out_features=7)
        )
        
    def forward(self, x):
        return self.layers(x)

In [50]:
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long).squeeze()
y_test = torch.tensor(y_test.values, dtype=torch.long).squeeze()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([464809, 54]),
 torch.Size([116203, 54]),
 torch.Size([464809]),
 torch.Size([116203]))

In [54]:
y_train -= 1
y_test -= 1

In [66]:
loss_fn = nn.CrossEntropyLoss()

def train(model:torch.nn.Module,
               loss_fn:torch.nn.Module,
               optimizer:torch.optim.Optimizer,
               epochs:int):
    
    test_acc = 0
    test_loss = 0
    for epoch in range(epochs):
        model.train()
        y_logits = model(X_train)
        y_preds = torch.softmax(y_logits, dim=1).argmax()
        
        loss = loss_fn(y_logits, y_train)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if epoch % 20 == 0:
            curr_loss, curr_acc = test(model, loss_fn)
            if curr_acc < test_acc: # checking if model is overfitting
                print(f"Model started to overfit, ending teatching at epoch: {epoch}")
                return epoch
    return epochs

def test(model:torch.nn.Module,
               loss_fn:torch.nn.Module):
    model.eval()
    with torch.inference_mode():
        y_logits = model(X_test)
        test_loss = loss_fn(y_logits, y_test)
        y_preds = torch.argmax(torch.softmax(y_logits, dim=1), dim=1)
        test_acc = (y_preds == y_test).sum().item() / len(y_test)
        
    return test_loss, test_acc

In [67]:
def set_best_hyperparameters(hidden_units, lr, epochs):
    return {"hidden_units":hidden_units, "lr":lr, "epochs":epochs}

In [None]:
torch.manual_seed(42)

best_hyperparameters = set_best_hyperparameters(0, 0, 0)
best_test_acc = 0

for hidden_units in range(8, 13):
    model_5 = NeuralNetworkModel(hidden_units=hidden_units)
    lr = 0.001
    while lr <= 0.1:
        optimizer = torch.optim.Adam(params=model_5.parameters(), lr=lr, weight_decay=1e-5)
        for epochs in range(100, 1000, 100):
            num_ep = train(model=model_5, loss_fn=loss_fn, optimizer=optimizer, epochs=epochs)
            test_loss, test_acc = test(model=model_5, loss_fn=loss_fn)
            
            print(f"Number of epochs: {epochs} | Hidden_units: {hidden_units} | Learing rate: {lr}")
            print(f"Test loss: {test_loss}  |  Test accuracy: {test_acc}")
            print("--------------------------------------------------------------------------------\n")
            
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                best_hyperparameters = set_best_hyperparameters(hidden_units, lr, num_ep)
                
            if num_ep < epochs: # this means, that model started to overfit
                break
                
        lr *= 10
        
print(f"Best test acc: {best_test_acc} with hyperparameters: {best_hyperparameters}")

Number of epochs: 100 | Hidden_units: 8 | Learing rate: 0.001
Test loss: 1.5417283773422241  |  Test accuracy: 0.3915991841862947
--------------------------------------------------------------------------------
Number of epochs: 200 | Hidden_units: 8 | Learing rate: 0.001
Test loss: 1.1495548486709595  |  Test accuracy: 0.49057253255079475
--------------------------------------------------------------------------------
Number of epochs: 300 | Hidden_units: 8 | Learing rate: 0.001
Test loss: 1.1195231676101685  |  Test accuracy: 0.49762054335946576
--------------------------------------------------------------------------------
Number of epochs: 400 | Hidden_units: 8 | Learing rate: 0.001
Test loss: 1.0857106447219849  |  Test accuracy: 0.500253866079189
--------------------------------------------------------------------------------
Number of epochs: 500 | Hidden_units: 8 | Learing rate: 0.001
Test loss: 0.908551037311554  |  Test accuracy: 0.6548453998605888
--------------------------