In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import time
import torch
from torch.optim import Adam
from torch.nn import MSELoss
from torch import nn
from torch.utils.data import DataLoader, Dataset

In [2]:
def calculate_running_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func``, prints the elapsed time in seconds, and returns whatever
        ``func`` returned.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments during a long training run.
        start = time.perf_counter()
        # Keyword arguments must be forwarded too; dropping them would make
        # calls such as f(X, y=...) fail or silently use defaults.
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"Function runned in {elapsed} seconds.")
        return result
    return wrapper

In [3]:
@calculate_running_time
def train_forest(X, y):
    """Fit a default-configured random-forest regressor on ``(X, y)`` and return it."""
    # ``fit`` returns the estimator itself, so the fitted model is returned directly.
    return RandomForestRegressor().fit(X, y)

In [4]:
class DeepLearningModel(nn.Module):
    """Fully connected regressor mapping 8 input features to one output.

    The layer widths expand 8 -> 16 -> 32 -> 64 and contract 64 -> 32 -> 16 -> 8.
    A fresh ``activation_function()`` instance follows every linear layer
    except the final 8 -> 1 output layer.
    """

    def __init__(self, activation_function):
        """Build the layer stack.

        Args:
            activation_function: Zero-argument callable (e.g. ``nn.ReLU``)
                returning the activation module inserted after each hidden
                linear layer.
        """
        super().__init__()
        widths = (8, 16, 32, 64, 32, 16, 8)
        stack = []
        # Consecutive width pairs give the (in, out) sizes of each hidden layer.
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(activation_function())
        # Output head: a single value per sample, with no activation after it.
        stack.append(nn.Linear(widths[-1], 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, X):
        """Pass ``X`` through the layer stack and return the result."""
        return self.layers(X)

In [5]:
class DataProvider(Dataset):
    """In-memory ``Dataset`` that stores features and targets as float32 tensors."""

    def __init__(self, X, y):
        """Convert ``X.values`` and ``y.values`` to float32 tensors once, up front."""
        super().__init__()
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (size of the first dimension of ``self.X``)."""
        return self.X.size(0)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

In [6]:
def build_model(activation_function):
    """Create a new ``DeepLearningModel`` using ``activation_function`` between its layers."""
    network = DeepLearningModel(activation_function)
    return network

In [7]:
def train_model_for_num_of_epochs(epochs: int, trained_models: dict, data: DataLoader):
    """Train a fresh ReLU network for ``epochs`` passes over ``data`` and register it.

    A new model is created with ``build_model(nn.ReLU)`` and optimised with
    Adam (learning rate 1e-4) against mean squared error. The weights come
    from torch's global RNG, so call ``torch.manual_seed`` beforehand if runs
    need to be reproducible.

    Args:
        epochs: Number of full passes over ``data``. Zero or a negative value
            performs no optimisation steps; the untrained model is still
            registered and the reported loss is ``nan``.
        trained_models: Dict updated in place. The result is stored under
            ``f"Model-{epochs}"`` as a dict with the keys ``"Time"`` (seconds
            spent in this function up to registration), ``"Epochs"``,
            ``"Model"``, ``"Optimizer"`` and ``"Loss function"``.
        data: Loader yielding ``(inputs, targets)`` batches. A trailing
            dimension is appended to the targets before the loss is computed
            (assumes targets arrive as one value per sample, so they line up
            with the model output — confirm for other datasets).

    Returns:
        None. The printed loss is that of the last mini-batch processed, not
        an average over the final epoch.
    """
    time_start = time.time()

    model_name = f"Model-{epochs}"
    loss_func = MSELoss()
    model = build_model(nn.ReLU)
    optimizer = Adam(model.parameters(), lr=0.0001)

    # Stays None when no batch is ever processed (epochs <= 0 or empty loader),
    # so the report below cannot hit an unbound variable.
    loss = None

    for _ in range(epochs):
        for inp, target in data:
            optimizer.zero_grad()
            target = torch.unsqueeze(target, -1)
            prediction = model(inp)
            loss = loss_func(prediction, target)
            loss.backward()
            optimizer.step()

    final_loss = loss.item() if loss is not None else float("nan")
    print(f"Model name {model_name}: Loss: {final_loss}")

    trained_models[model_name] = {
        "Time": (time.time() - time_start),
        "Epochs": epochs,
        "Model": model,
        "Optimizer": optimizer,
        "Loss function": loss_func
    }

In [8]:
def get_results(data: DataLoader, model: nn.Module):
    """Print the mean squared error of ``model`` over every sample in ``data``.

    The squared errors are summed over all batches and divided by the total
    number of target values, so a smaller final batch does not skew the
    result (averaging per-batch means would over-weight it).

    Args:
        data: Iterable of ``(inputs, targets)`` batches. A trailing dimension
            is appended to the targets before comparison (assumes one target
            value per sample, matching the model output — confirm for other
            datasets).
        model: Module used to produce the predictions. It is switched to eval
            mode for the duration of the call and restored afterwards.

    Returns:
        None. The MSE is printed; ``nan`` is printed when ``data`` yields no
        samples.
    """
    # Sum (rather than average) inside each batch so batches of different
    # sizes can be combined exactly.
    loss_func = MSELoss(reduction="sum")
    squared_error_sum = 0.0
    target_count = 0

    was_training = model.training
    model.eval()
    try:
        # Evaluation needs no gradients; skipping graph construction saves
        # memory and time.
        with torch.no_grad():
            for inp, target in data:
                target = torch.unsqueeze(target, -1)
                preds = model(inp)
                squared_error_sum += loss_func(preds, target).item()
                target_count += target.numel()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    mse = squared_error_sum / target_count if target_count else float("nan")
    print(mse)
        

In [9]:
# Load the preprocessed crabs table; the path is relative to the notebook's working directory.
data = pd.read_csv("./data/crabs_preprocessed.csv")

In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,0,1,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,1,1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,2,2,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,3,0,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,4,1,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [11]:
# Remove the "Unnamed: 0" column (presumably an index written out by an earlier
# to_csv — verify). Re-binding the result and ignoring a missing label makes
# re-running this cell a no-op instead of a KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [12]:
# Target is the "Age" column; features are every other column except the "id" row identifier.
y = data["Age"]
X = data.drop(columns=["Age", "id"])

In [13]:
# Hold out 20% of the rows for testing; shuffle=False keeps the original row
# order, so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [14]:
# Fit the random-forest baseline on the training split; the timing decorator prints the elapsed seconds.
trained_forest = train_forest(X_train, y_train)

Function runned in 43.62094759941101 seconds.


In [15]:
# Predict the target for the held-out test rows with the fitted forest.
forest_predictions = trained_forest.predict(X_test)

In [16]:
# Test-set mean squared error of the forest baseline.
forest_results = mean_squared_error(y_test, forest_predictions)

In [17]:
forest_results

4.495658220241712

In [18]:
# Despite its name, `train_data_loader` is the Dataset; `train_iterator` is the DataLoader that batches it.
train_data_loader = DataProvider(X=X_train, y=y_train)
# NOTE(review): shuffle=False serves the batches in the same row order every
# epoch — confirm this is intended, since mini-batch training is usually shuffled.
train_iterator = DataLoader(train_data_loader, batch_size=32, shuffle=False)

In [19]:
# Registry of trained networks; train_model_for_num_of_epochs adds one entry per call, keyed "Model-<epochs>".
models = {}

In [20]:
# Train one independent model per epoch budget. Each call builds a fresh
# network and stores it in `models` under "Model-<epochs>".
for epoch_budget in (10, 50, 100, 200, 500):
    train_model_for_num_of_epochs(epoch_budget, models, train_iterator)

Model name Model-10: Loss: 1.472991704940796
Model name Model-50: Loss: 5.26702880859375
Model name Model-100: Loss: 1.342608094215393
Model name Model-200: Loss: 1.1044083833694458
Model name Model-500: Loss: 7.0500640869140625


In [21]:
models["Model-10"]["Time"]

46.442111015319824

In [22]:
models["Model-50"]["Time"]

227.11348628997803

In [23]:
models["Model-100"]["Time"]

435.26624155044556

In [24]:
models["Model-200"]["Time"]

843.9717423915863

In [25]:
models["Model-500"]["Time"]

2202.2015614509583

In [26]:
# Wrap the held-out split in the same Dataset type used for training.
test_data_loader = DataProvider(X=X_test, y=y_test)
# No batch_size is passed, so DataLoader uses its default of one sample per batch.
test_iterator = DataLoader(test_data_loader)

In [28]:
get_results(test_iterator, models["Model-10"]["Model"])

4.4503527408925265


In [29]:
get_results(test_iterator, models["Model-50"]["Model"])

11.25330949265991


In [30]:
get_results(test_iterator, models["Model-100"]["Model"])

4.271466554444728


In [32]:
get_results(test_iterator, models["Model-200"]["Model"])

4.2238706977625355


In [31]:
get_results(test_iterator, models["Model-500"]["Model"])

9.941166406296347
