In [None]:
import torch
import numpy as np
import json
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

# Load the training and gold splits from their JSON files.
# The encoding is pinned to UTF-8 so that any non-ASCII text decodes the same
# way regardless of the platform's locale default.
with open(
    'LemmatizedData/ate_absita_training_extracted.json', 'r', encoding='utf-8'
) as file:
    data_train = json.load(file)

with open(
    'LemmatizedData/ate_absita_gold_extracted.json', 'r', encoding='utf-8'
) as file:
    data_test = json.load(file)


# Inputs and targets are read from the 'sentences' and 'scores' keys.
X_train = data_train['sentences']
y_train = data_train['scores']

X_test = data_test['sentences']
y_test = data_test['scores']

# Build a TF-IDF vectorizer to represent the texts as numeric features.
# The vocabulary is fitted on the training texts only; the test texts are
# transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Convert the sparse matrices (densified first) and the score lists into
# float32 PyTorch tensors.
X_train_tensors = torch.as_tensor(X_train_vectors.toarray(), dtype=torch.float32)
X_test_tensors = torch.as_tensor(X_test_vectors.toarray(), dtype=torch.float32)
y_train_tensors = torch.as_tensor(y_train, dtype=torch.float32)
y_test_tensors = torch.as_tensor(y_test, dtype=torch.float32)

# Neural network definition
class SentimentClassifier(nn.Module):
    """Two-layer network: Linear(input_size, 64) -> ReLU -> Linear(64, 1)."""

    def __init__(self, input_size):
        """Create the two linear layers.

        Args:
            input_size: Number of features in each input vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Return the raw output of the second layer; its last dimension is 1."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Model initialisation: the input width is the number of feature columns.
model = SentimentClassifier(input_size=X_train_tensors.shape[1])

# Loss function and optimiser
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Model training: mini-batches are consecutive slices of the training tensors,
# visited in the same order every epoch.
batch_size = 30
num_epochs = 10
total_samples = X_train_tensors.shape[0]
total_batches = (total_samples + batch_size - 1) // batch_size  # ceiling division

for epoch in range(num_epochs):
    model.train()

    # Train on each batch
    for batch in range(total_batches):
        start_idx = batch * batch_size
        end_idx = min((batch + 1) * batch_size, total_samples)

        batch_inputs = X_train_tensors[start_idx:end_idx]
        batch_targets = y_train_tensors[start_idx:end_idx]

        # Clear gradients from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also collapse the batch dimension when the last batch holds a
        # single sample, leaving a 0-d prediction against a 1-d target.
        loss = loss_function(outputs.squeeze(1), batch_targets)
        loss.backward()
        optimizer.step()

def convert_to_sentiment(score, threshold_pos, threshold_neg):
    """Map a numeric score to a sentiment label.

    Returns 'positivo' when ``score`` is strictly greater than
    ``threshold_pos``; otherwise 'negativo' when it is strictly less than
    ``threshold_neg``; otherwise 'neutrale'.
    """
    # The positive check takes precedence, exactly as in an if/elif chain.
    if score > threshold_pos:
        return 'positivo'
    return 'negativo' if score < threshold_neg else 'neutrale'
    
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted to float64 NumPy arrays first, so plain
    Python sequences are accepted and integer inputs cannot overflow or wrap
    around during the subtraction and squaring.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of reference values; combined with
            ``predictions`` under NumPy broadcasting rules.

    Returns:
        The square root of the mean squared difference, as a NumPy float64.
    """
    predicted = np.asarray(predictions, dtype=np.float64)
    expected = np.asarray(targets, dtype=np.float64)
    return np.sqrt(((predicted - expected) ** 2).mean())

# Evaluate the model's performance on the test set
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensors)
    # squeeze(1) removes only the size-1 output dimension, so a test set with
    # a single sample still yields a 1-D array that can be iterated below.
    predictions = predictions.squeeze(1).numpy()
    # Both thresholds are 3, so only a score of exactly 3 maps to 'neutrale'.
    sentiments = [convert_to_sentiment(score, 3, 3) for score in predictions]
    # Keep the result under its own name: assigning it to `rmse` would replace
    # the rmse() function with a number and break any later call.
    test_rmse = rmse(predictions, y_test)
    print("RMSE:", test_rmse)