# Formalne metode u softverskom inženjerstvu
## ```Projektni zadatak broj 2 - Andrej Tomić 1106/22```

### Neuronska mreža

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, TensorDataset
from sklearn.datasets import fetch_rcv1
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

# Učitavanje RCV1 skupa podataka
rcv1 = fetch_rcv1()

# Ekstrahovanje podataka i ciljeva (koristimo 10000 uzoraka za manju memorijsku potrošnju)                                                                    
X = rcv1.data[20000:30000].toarray()
y = rcv1.target[20000:30000].toarray()

# Standardizacija podataka
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Konverzija u Tensor format
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Kreiranje TensorDataset-a
dataset = TensorDataset(X_tensor, y_tensor)

# Podela na trening i test skup (50% za trening, 50% za test)
train_size = len(dataset) // 2
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Kreiranje DataLoader-a
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class SimpleNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, output_size)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

# Inicijalizacija mreže
input_size = X_tensor.shape[1]
output_size = y_tensor.shape[1]
model = SimpleNN(input_size, output_size)

criterion = nn.BCELoss()  # Binarna klasifikacija
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
best_recall = 0.0
best_precision = 0.0

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluacija na testnom skupu
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = (outputs > 0.5).float()
            all_preds.append(preds)
            all_labels.append(labels)
    
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)
    
    precision = precision_score(all_labels.cpu(), all_preds.cpu(), average='micro')
    recall = recall_score(all_labels.cpu(), all_preds.cpu(), average='micro')

    print(f'Epoch {epoch+1}/{num_epochs}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    
    # Sačuvaj model ako ima bolje performanse
    if precision > best_precision:
        best_precision = precision
        best_recall = recall
        torch.save(model.state_dict(), 'best_model.pth')


print(f"\n\nNajbolja preciznost: {best_precision:.4f}")
print(f"Najbolji odziv: {best_recall:.4f}")
print('Trening neuronske mreže završen.')

Epoch 1/10, Precision: 0.7978, Recall: 0.4734
Epoch 2/10, Precision: 0.8640, Recall: 0.5754
Epoch 3/10, Precision: 0.8262, Recall: 0.6412
Epoch 4/10, Precision: 0.8340, Recall: 0.6544
Epoch 5/10, Precision: 0.8354, Recall: 0.6553
Epoch 6/10, Precision: 0.8307, Recall: 0.6597
Epoch 7/10, Precision: 0.8313, Recall: 0.6616
Epoch 8/10, Precision: 0.8462, Recall: 0.6250
Epoch 9/10, Precision: 0.8344, Recall: 0.6547
Epoch 10/10, Precision: 0.8288, Recall: 0.6553


Najbolja preciznost: 0.8640
Najbolji odziv: 0.5754
Trening neuronske mreže završen.


### LightGBM

In [4]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna

y_train_np = np.asarray(y[:train_size].argmax(axis=1)).ravel()
y_test_np = np.asarray(y[train_size:].argmax(axis=1)).ravel()

X_train_search, X_valid_search, y_train_search, y_valid_search = train_test_split(X, y, test_size=0.5, random_state=42)

# Konvertovanje oznaka u 1-D numpy niz
y_train_search = np.argmax(y_train_search, axis=1)
y_valid_search = np.argmax(y_valid_search, axis=1)

# Definicija objective funkcije za Optuna
def objective(trial):
    # Definicija prostora pretrage hiperparametara
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": y.shape[1],
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "verbose": -1  # Uključi verbose opciju ovde
    }

    # Kreiraj LightGBM dataset
    dtrain = lgb.Dataset(X_train_search, label=y_train_search)
    dvalid = lgb.Dataset(X_valid_search, label=y_valid_search)

    # Kreiraj Optuna pruning callback
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss", valid_name="valid_0")

    # Treniraj model
    model = lgb.train(param, dtrain, valid_sets=[dvalid], callbacks=[pruning_callback])

    # Evaluiraj model na validacionom skupu
    y_pred = model.predict(X_valid_search)
    y_pred_labels = np.argmax(y_pred, axis=1)
    accuracy = accuracy_score(y_valid_search, y_pred_labels)

    return accuracy

# Kreiraj Optuna studiju i optimizuj
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Definiši najbolje pronađene parametre od strane Optuna
best_params = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "num_class": output_size,
    "lambda_l1": study.best_params["lambda_l1"], 
    "lambda_l2": study.best_params["lambda_l2"],
    "num_leaves": study.best_params["num_leaves"],
    "feature_fraction": study.best_params["feature_fraction"],
    "bagging_fraction": study.best_params["bagging_fraction"],
    "bagging_freq": study.best_params["bagging_freq"],
    "min_child_samples": study.best_params["min_child_samples"],
}

# Train LightGBM model with the best parameters on the full training set
dtrain_full = lgb.Dataset(X[:train_size], label=y_train_np)
best_model = lgb.train(best_params, dtrain_full)

# Računanje preciznosti i odziva na test skupu
y_pred = best_model.predict(X[train_size:], num_iteration=best_model.best_iteration)
y_pred_binary = np.argmax(y_pred, axis=1)
precision_lgbm = precision_score(y_test_np, y_pred_binary, average='weighted', zero_division=0)
recall_lgbm = recall_score(y_test_np, y_pred_binary, average='weighted', zero_division=0)

print(f"\n\nPreciznost na test skupu: {precision_lgbm}")
print(f"Odziv na test skupu: {recall_lgbm}")
print('Trening LightGBM modela završen.')

[I 2024-06-26 09:46:22,178] A new study created in memory with name: no-name-0526f2c5-3d61-41cd-8727-0f86af8855af
[I 2024-06-26 09:47:01,008] Trial 0 finished with value: 0.7744 and parameters: {'lambda_l1': 0.564830610876858, 'lambda_l2': 0.014845666863006183, 'num_leaves': 202, 'feature_fraction': 0.8969880195218041, 'bagging_fraction': 0.4123559945662514, 'bagging_freq': 5, 'min_child_samples': 30}. Best is trial 0 with value: 0.7744.
[I 2024-06-26 09:47:45,757] Trial 1 finished with value: 0.7728 and parameters: {'lambda_l1': 3.3423090833078304e-05, 'lambda_l2': 1.4343593766578642e-07, 'num_leaves': 207, 'feature_fraction': 0.7449490654863375, 'bagging_fraction': 0.9826465352862168, 'bagging_freq': 4, 'min_child_samples': 70}. Best is trial 1 with value: 0.7728.
[I 2024-06-26 09:48:32,430] Trial 2 finished with value: 0.773 and parameters: {'lambda_l1': 3.424988314899009e-05, 'lambda_l2': 5.570278067137863e-07, 'num_leaves': 202, 'feature_fraction': 0.5780809759832684, 'bagging_fra

Number of finished trials: 10
Best trial:
  Value: 0.7728
  Params: 
    lambda_l1: 3.3423090833078304e-05
    lambda_l2: 1.4343593766578642e-07
    num_leaves: 207
    feature_fraction: 0.7449490654863375
    bagging_fraction: 0.9826465352862168
    bagging_freq: 4
    min_child_samples: 70


Preciznost na test skupu: 0.7271395787113216
Odziv na test skupu: 0.7398
Trening LightGBM modela završen.


### Poređenje - Neuronska mreža vs LightGBM

In [3]:
if best_precision > precision_lgbm:
    print("Neuronska mreža je postigla veću preciznost na testnom skupu.")
elif best_precision < precision_lgbm:
    print("LightGBM je postigao veću preciznost na testnom skupu.")
else:
    print("Oba modela su postigla istu preciznost na testnom skupu.")

Neuronska mreža je postigla veću preciznost na testnom skupu.
