In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from sklearn.metrics import classification_report
import numpy as np

In [2]:
# 1. Read the train and test CSV files (train first, then test).
# engine="python" asks pandas to use its Python parsing engine for both files.
train_df, test_df = (
    pd.read_csv(csv_path, engine="python")
    for csv_path in ("date/train.csv", "date/test.csv")
)

In [3]:
# 2. Combine 'Summary' and 'Text' into a single 'combined_text' column.
for frame in (train_df, test_df):
    # Fill missing values with empty strings before the two columns are
    # cast to str and joined.
    for column in ("Summary", "Text"):
        frame[column] = frame[column].fillna("")
    # Summary and review body are joined with a single space.
    frame["combined_text"] = (
        frame["Summary"].astype(str) + " " + frame["Text"].astype(str)
    )

In [4]:
# 3. Clean the scores: keep only rows whose 'Score' is exactly 1, 2, 3, 4 or 5.
VALID_SCORES = [1, 2, 3, 4, 5]


def clean_scores(df):
    """Return a new frame holding only the rows of ``df`` with a valid score.

    'Score' is parsed with ``pd.to_numeric(errors="coerce")``, so values that
    cannot be parsed become NaN. Rows are kept only when the parsed value is
    one of ``VALID_SCORES``; this drops NaN, out-of-range values and
    fractional values such as 4.5 (which a plain ``astype(int)`` would
    silently truncate to 4). The surviving scores are then cast to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Score' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the valid rows; ``df`` itself is not modified.
    """
    scores = pd.to_numeric(df["Score"], errors="coerce")
    keep = scores.isin(VALID_SCORES)
    # .copy() so the column assignment below (and any later one) writes to an
    # independent frame rather than to a slice of the input.
    cleaned = df.loc[keep].copy()
    cleaned["Score"] = scores.loc[keep].astype(int)
    return cleaned


# Assign the result back to each name explicitly: filtering a loop variable
# would only rebind that variable and leave the frames unchanged.
train_df = clean_scores(train_df)
test_df = clean_scores(test_df)

In [5]:
# 4. Keep only rows whose score is one of 1-5.
# The filter is assigned back to each name explicitly: filtering a loop
# variable would only rebind that variable and leave these frames unchanged.
# .copy() makes each result an independent frame, so adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df["Score"].isin([1, 2, 3, 4, 5])].copy()
test_df = test_df[test_df["Score"].isin([1, 2, 3, 4, 5])].copy()

In [6]:
# 5. Encode the scores as integer class labels.
# The encoder is fitted on the training scores only.
le = LabelEncoder().fit(train_df["Score"])
train_df["label"] = le.transform(train_df["Score"])
# The test set must be transformed with the same fitted encoder.
test_df["label"] = le.transform(test_df["Score"])

In [7]:
# 6. Tokenize the text.
# The vocabulary is built from the training texts only; num_words=10000 is the
# vocabulary cap passed to the tokenizer and "<OOV>" is its out-of-vocabulary
# token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(train_df["combined_text"])

In [8]:
# 7. Convert the texts to integer sequences and pad them with maxlen=200.
# Both splits go through the tokenizer fitted on the training texts.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_df["combined_text"]), maxlen=200
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_df["combined_text"]), maxlen=200
)


In [9]:
# Extract the encoded labels as NumPy arrays for training and evaluation.
y_train = train_df["label"].to_numpy()
y_test = test_df["label"].to_numpy()

In [10]:
# 8. LSTM model
# Seed the Python, NumPy and TensorFlow random generators so that re-running
# the notebook starts weight initialisation, dropout and batch shuffling from
# the same state.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Explicit input declaration (sequence length 200, matching the maxlen
    # used for padding). This replaces Embedding's deprecated `input_length`
    # argument while still letting the model be built before summary().
    tf.keras.Input(shape=(200,)),
    # input_dim matches the tokenizer's num_words (10000).
    Embedding(input_dim=10000, output_dim=64),
    LSTM(64),
    Dropout(0.5),
    Dense(32, activation='relu'),
    # One softmax output unit per class known to the label encoder.
    Dense(len(le.classes_), activation='softmax')
])



In [11]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (the targets are the integer labels produced by the label encoder) and
# accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Print the layer-by-layer overview of the model.
model.summary()

In [12]:
# 9. Training
# validation_split=0.1 asks Keras to hold out that fraction of the training
# arrays for the per-epoch validation metrics.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.1,
)

Epoch 1/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 114ms/step - accuracy: 0.7008 - loss: 0.8344 - val_accuracy: 0.7551 - val_loss: 0.6541
Epoch 2/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m432s[0m 135ms/step - accuracy: 0.7648 - loss: 0.6297 - val_accuracy: 0.7698 - val_loss: 0.6253
Epoch 3/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 116ms/step - accuracy: 0.7854 - loss: 0.5793 - val_accuracy: 0.7785 - val_loss: 0.6054
Epoch 4/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m397s[0m 124ms/step - accuracy: 0.8049 - loss: 0.5331 - val_accuracy: 0.7871 - val_loss: 0.5929
Epoch 5/5
[1m3198/3198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 126ms/step - accuracy: 0.8207 - loss: 0.4959 - val_accuracy: 0.7901 - val_loss: 0.5890


<keras.src.callbacks.history.History at 0x25352d732f0>

In [13]:
# 10. Evaluation
# The predicted class of each test sample is the index of the largest value
# in its row of model outputs.
y_pred = np.argmax(model.predict(X_test), axis=1)

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 14ms/step


In [14]:
# Report precision/recall/F1 for the encoded labels that occur in y_test,
# displayed under their original score values (recovered via the encoder).
unique_labels = np.unique(y_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        labels=unique_labels,
        target_names=[str(score) for score in le.inverse_transform(unique_labels)],
    )
)

              precision    recall  f1-score   support

           1       0.71      0.77      0.74     10515
           2       0.54      0.37      0.44      5937
           3       0.54      0.54      0.54      8460
           4       0.62      0.43      0.51     16026
           5       0.87      0.94      0.90     72743

    accuracy                           0.79    113681
   macro avg       0.66      0.61      0.63    113681
weighted avg       0.78      0.79      0.78    113681

