## Amazon Dataset Test

In [1]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Run only once if not already downloaded
nltk.download('stopwords')

# Setup
file_path = "train.ft.txt"
max_per_class = 25000
min_text_len = 3

# Initialize
data = []
label_counts = {0: 0, 1: 0}
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Step 1: Read and filter lines
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Reading file"):
        match = re.match(r"__label__(\d) (.+)", line)
        if not match:
            continue

        label_raw, text = int(match.group(1)), match.group(2).strip()
        if label_raw not in [1, 2]:
            continue

        label = 0 if label_raw == 1 else 1
        if label_counts[label] >= max_per_class:
            continue

        # Split title + rest of review
        parts = text.split(":", 1)
        if len(parts) == 2:
            title, body = parts
            full_text = (title.strip() + " " + body.strip()).strip()
        else:
            full_text = text.strip()

        # Filter short/empty reviews
        if len(full_text) < min_text_len:
            continue

        data.append((label, full_text))
        label_counts[label] += 1

        if label_counts[0] >= max_per_class and label_counts[1] >= max_per_class:
            break

# Step 2: Create DataFrame
df = pd.DataFrame(data, columns=["label", "review"])
print(f"Loaded {len(df)} reviews (balanced dataset).")
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Reading file: 51224it [00:00, 294461.77it/s]


Loaded 50000 reviews (balanced dataset).


Unnamed: 0,label,review
0,1,Stuning even for the non-gamer This sound trac...
1,1,The best soundtrack ever to anything. I'm read...
2,1,Amazing! This soundtrack is my favorite music ...
3,1,Excellent Soundtrack I truly like this soundtr...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


In [2]:
df_imdb = pd.read_csv("IMDB Dataset.csv")  
df_imdb = df_imdb.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

# Encode labels: positive -> 1, negative -> 0
df_imdb['label'] = df_imdb['sentiment'].map({'positive': 1, 'negative': 0})
df_imdb = df_imdb.drop(columns=['sentiment'])
print("Total samples:", len(df_imdb))
df_imdb.head()

Total samples: 50000


Unnamed: 0,review,label
0,I really liked this Summerslam due to the look...,1
1,Not many television shows appeal to quite as m...,1
2,The film quickly gets to a major chase scene w...,0
3,Jane Austen would definitely approve of this o...,1
4,Expectations were somewhat high for me when I ...,0


In [3]:
df = pd.concat([df, df_imdb], axis=0)

In [4]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print("Total samples:", len(df))
df.head()


Total samples: 100000


Unnamed: 0,label,review
0,0,It was funny because the whole thing was so un...
1,1,I've read innumerable reviews talking about th...
2,1,Great Foot Cream Recently purchased a jar of t...
3,0,I watched this movie a couple months ago when ...
4,1,I just re-watched a few episodes of this serie...


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

# Apply cleaning with progress bar
tqdm.pandas(desc="Cleaning text")
df["cleaned_review"] = df["review"].progress_apply(clean_text)

print(df.columns)


Cleaning text: 100%|██████████| 100000/100000 [01:21<00:00, 1219.72it/s]

Index(['label', 'review', 'cleaned_review'], dtype='object')





In [6]:
df = df.drop(columns=['review'])
print(df.columns)

Index(['label', 'cleaned_review'], dtype='object')


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(textos):
    vectorizer = CountVectorizer()
    actual = []
    for i in range(len(textos)):
        actual.append(textos[i])

    X = vectorizer.fit_transform(actual)
    diccionario = vectorizer.get_feature_names_out()
    bow = X.toarray()
    return diccionario, bow

In [8]:
diccionario, X = bag_of_words(df['cleaned_review'])

MemoryError: Unable to allocate 147. GiB for an array with shape (100000, 197787) and data type int64

In [15]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),   # Unigram + bigram + trigram
    analyzer='word',
    strip_accents='unicode',
    min_df=2,           # Ignore terms appearing in less than 3 reviews
    max_df=0.9          # Ignore terms appearing in more than 80% of reviews
)

# Apply TF-IDF
X = vectorizer.fit_transform(tqdm(df["cleaned_review"], desc="Vectorizing with TF-IDF"))
print(f"TF-IDF matrix shape: {X.shape}")



Vectorizing with TF-IDF:   0%|          | 0/100000 [00:00<?, ?it/s][A
Vectorizing with TF-IDF:   1%|          | 558/100000 [00:00<00:18, 5347.48it/s][A
Vectorizing with TF-IDF:   1%|          | 1096/100000 [00:00<00:18, 5301.07it/s][A
Vectorizing with TF-IDF:   2%|▏         | 1627/100000 [00:00<00:20, 4896.24it/s][A
Vectorizing with TF-IDF:   2%|▏         | 2161/100000 [00:00<00:20, 4871.20it/s][A
Vectorizing with TF-IDF:   3%|▎         | 2675/100000 [00:00<00:20, 4787.08it/s][A
Vectorizing with TF-IDF:   3%|▎         | 3181/100000 [00:00<00:20, 4714.19it/s][A
Vectorizing with TF-IDF:   4%|▎         | 3699/100000 [00:00<00:20, 4710.92it/s][A
Vectorizing with TF-IDF:   4%|▍         | 4171/100000 [00:00<00:20, 4661.68it/s][A
Vectorizing with TF-IDF:   5%|▍         | 4638/100000 [00:00<00:20, 4643.85it/s][A
Vectorizing with TF-IDF:   5%|▌         | 5103/100000 [00:01<00:22, 4205.59it/s][A
Vectorizing with TF-IDF:   6%|▌         | 5572/100000 [00:01<00:21, 4337.33it/s][A
Vecto

TF-IDF matrix shape: (100000, 1419267)


In [8]:
df.to_csv("cleaned_reviews_amazon.csv", index=False)
print("File saved as 'cleaned_reviews_amazon.csv'")


File saved as 'cleaned_reviews_amazon.csv'


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, df["label"], test_size=0.30, random_state=42, stratify=df["label"]
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print(f"Train size: {X_train.shape[0]}")
print(f"Validation size: {X_val.shape[0]}")
print(f"Test size: {X_test.shape[0]}")


Train size: 70000
Validation size: 15000
Test size: 15000


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

# Logistic Regression
lr = LogisticRegression(max_iter=1000, n_jobs=-1)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
print("\nLogistic Regression:")
print(classification_report(y_val, y_pred_lr))

# Linear SVM via SGDClassifier
svm = SGDClassifier(loss="hinge", max_iter=1000, random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_val)
print("\nLinear SVM (SGD):")
print(classification_report(y_val, y_pred_svm))



Logistic Regression:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      7500
           1       0.88      0.89      0.88      7500

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000


Linear SVM (SGD):
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      7500
           1       0.87      0.88      0.88      7500

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

# Definiamo il modello base
sgd = SGDClassifier(random_state=42)

# Definiamo la distribuzione degli iperparametri
param_dist = {
    "loss": ["hinge", "log_loss", "modified_huber"],
    "alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "penalty": ["l2", "l1", "elasticnet"],
    "max_iter": [2000, 2500, 3000, 4000, 5000],
    "learning_rate": ["optimal", "invscaling", "adaptive"],
    "eta0": [0.01, 0.1, 0.5, 1],
}

# Scorer: f1 macro per bilanciare classi
scorer = make_scorer(f1_score, average='macro')

# Setup RandomizedSearch
random_search = RandomizedSearchCV(
    sgd,
    param_distributions=param_dist,
    n_iter=50,  # puoi aumentare se hai tempo
    scoring=scorer,
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42,
)

# Fit
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [12]:
print("Best parameters found:")
print(random_search.best_params_)

best_model = random_search.best_estimator_

# Valutazione su validation set
y_val_pred = best_model.predict(X_val)
print("\nValidation set evaluation:")
print(classification_report(y_val, y_val_pred))


Best parameters found:
{'penalty': 'l2', 'max_iter': 3000, 'loss': 'hinge', 'learning_rate': 'adaptive', 'eta0': 0.01, 'alpha': 1e-05}

Validation set evaluation:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      7500
           1       0.89      0.89      0.89      7500

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [13]:
y_test_pred = best_model.predict(X_test)
print("\nFinal evaluation on test set:")
print(classification_report(y_test, y_test_pred))



Final evaluation on test set:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      7500
           1       0.89      0.90      0.90      7500

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [13]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
print("\nNaive Bayes:")
print(classification_report(y_val, nb.predict(X_val)))



Naive Bayes:
              precision    recall  f1-score   support

           0       0.86      0.89      0.88      7500
           1       0.89      0.86      0.87      7500

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [14]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(n_estimators=300, random_state=42)
lgb_model.fit(X_train.toarray(), y_train)

print("\nLightGBM:")
print(classification_report(y_val, lgb_model.predict(X_val.toarray())))


MemoryError: Unable to allocate 5.54 TiB for an array with shape (70000, 10878793) and data type float64

In [40]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers


# Parameters
max_words = 20000
max_len = 200
embedding_dim = 100

# Tokenization
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned_review"])

sequences = tokenizer.texts_to_sequences(df["cleaned_review"])
padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

# Train/Val/Test split
X_train_nn, X_temp_nn, y_train_nn, y_temp_nn = train_test_split(padded, df["label"], test_size=0.30, stratify=df["label"])
X_val_nn, X_test_nn, y_val_nn, y_test_nn = train_test_split(X_temp_nn, y_temp_nn, test_size=0.50, stratify=y_temp_nn)

l2_lambda = 0.001
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    Conv1D(filters=8, kernel_size=3, use_bias=False), # Often disable bias before BN
    BatchNormalization(),
    Activation('relu'), # Activation after BN
    GlobalMaxPooling1D(),
    Dense(16, use_bias=False), # Often disable bias before BN
    BatchNormalization(),
    Activation('relu'), # Activation after BN
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# EarlyStopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# Training
history = model.fit(
    X_train_nn,
    y_train_nn,
    epochs=25,
    batch_size=128,
    validation_data=(X_val_nn, y_val_nn),
    callbacks=[early_stop],
    verbose=1
)


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 200, 100)          2000000   
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 198, 8)            2400      
_________________________________________________________________
batch_normalization_11 (Batc (None, 198, 8)            32        
_________________________________________________________________
activation_10 (Activation)   (None, 198, 8)            0         
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 8)                 0         
_________________________________________________________________
dense_20 (Dense)             (None, 16)                128       
_________________________________________________________________
batch_normalization_12 (Batc (None, 16)              

In [41]:
from sklearn.metrics import classification_report

y_pred_cnn = (model.predict(X_test_nn) > 0.5).astype("int32")
print("\nCNN classification report:")
print(classification_report(y_test_nn, y_pred_cnn))



CNN classification report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      7500
           1       0.83      0.87      0.85      7500

    accuracy                           0.84     15000
   macro avg       0.85      0.84      0.84     15000
weighted avg       0.85      0.84      0.84     15000



In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# Model definition
rnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    Bidirectional(LSTM(32, return_sequences=False)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Training
history_rnn = rnn_model.fit(
    X_train_nn, y_train_nn,
    epochs=10,
    batch_size=128,
    validation_data=(X_val_nn, y_val_nn),
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping


In [43]:
from sklearn.metrics import classification_report

y_pred_nn = (rnn_model.predict(X_test_nn) > 0.5).astype("int32")
print("\nRNN LSTM classification report:")
print(classification_report(y_test_nn, y_pred_nn))



RNN LSTM classification report:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      7500
           1       0.85      0.91      0.88      7500

    accuracy                           0.87     15000
   macro avg       0.88      0.87      0.87     15000
weighted avg       0.88      0.87      0.87     15000



In [44]:
import numpy as np

embedding_index = {}
glove_path = "glove.6B.100d.txt"  # Cambia path se serve

with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

print(f"Loaded {len(embedding_index)} word vectors from GloVe.")


Loaded 400000 word vectors from GloVe.


In [45]:
# Numero parole da usare e tokenizer già esistente
word_index = tokenizer.word_index
num_tokens = min(max_words, len(word_index)) + 1  # +1 per OOV

embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [46]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM, BatchNormalization, Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Parameters
max_words = 30000
max_len = 200
embedding_dim = 100

# Tokenization
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned_review"])

sequences = tokenizer.texts_to_sequences(df["cleaned_review"])
padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

# Train/Val/Test split
X_train_nn, X_temp_nn, y_train_nn, y_temp_nn = train_test_split(padded, df["label"], test_size=0.30, stratify=df["label"])
X_val_nn, X_test_nn, y_val_nn, y_test_nn = train_test_split(X_temp_nn, y_temp_nn, test_size=0.50, stratify=y_temp_nn)

# Model definition
model = Sequential([
    # Embedding layer
    Embedding(input_dim=num_tokens,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=True),  # Fine-tuning gli embeddings

    # Convolutional layers (2 stacked layers)
    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    BatchNormalization(),  # Normalizza gli output
    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    GlobalMaxPooling1D(),  # Poolea la feature map per ottenere il massimo da ciascuna feature

    # Reshape output per passare al layer LSTM
    Reshape((1, 128)),  # Aggiungi una dimensione per i timesteps, che è necessaria per l'LSTM

    # LSTM layer per catturare le dipendenze temporali
    LSTM(64, return_sequences=False),

    # Fully connected layer
    Dense(128, activation='relu'),
    Dropout(0.5),  # Regularizzazione

    # Output layer
    Dense(1, activation='sigmoid')  # Output binario per la classificazione
])

# Compilazione del modello
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)

# Training del modello
history = model.fit(X_train_nn, y_train_nn,
                    epochs=50,
                    batch_size=128,
                    validation_data=(X_val_nn, y_val_nn),
                    callbacks=[early_stop],
                    verbose=1)



Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 200, 100)          2000100   
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 200, 128)          64128     
_________________________________________________________________
batch_normalization_13 (Batc (None, 200, 128)          512       
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 200, 128)          82048     
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
reshape (Reshape)            (None, 1, 128)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)              

In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# Model definition
rnn_model = Sequential([
    Embedding(input_dim=num_tokens,
          output_dim=embedding_dim,
          weights=[embedding_matrix],
          input_length=max_len,
          trainable=True),    
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Training
history_rnn = rnn_model.fit(
    X_train_nn, y_train_nn,
    epochs=10,
    batch_size=128,
    validation_data=(X_val_nn, y_val_nn),
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Restoring model weights from the end of the best epoch.
Epoch 00005: early stopping
