In [10]:
# import pandas as pd

# # Load the first CSV
# df_comments = pd.read_parquet('data/comments.parquet.brotli', engine='pyarrow')

# # Load the second CSV
# df_roberta = pd.read_csv('data/comment_sentiment.csv')

# # Merge the two dataframes on the 'comment_id' column
# merged_df = pd.merge(df_comments, df_roberta, on='comment_id', how='inner')

# # Save the merged dataframe to a new CSV
# merged_df.to_csv('data/ffnn.csv', index=False)

# print("Merged CSV created successfully as 'ffnn.csv'.")


Merged CSV created successfully as 'ffnn.csv'.


### **FFNN Model**

In [119]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [127]:
# Step 1: Load the dataset
data = pd.read_csv('data/comment_preprocessing.csv')

In [128]:
# Assuming the CSV has columns: 'comment' (text) and 'sentiment' (-1, 0, 1)
comments = data['comment_content']
sentiments = data['RoBERTa_sentiment']

In [131]:
# Convert labels to categorical format
y = to_categorical(sentiments, num_classes=3)  # Convert to one-hot encoding


In [133]:
print(sentiments.value_counts())

print("Unique values in sentiments:", sentiments.unique())
print("Unique values in RoBERTa_sentiment:", data['RoBERTa_sentiment'].unique())


RoBERTa_sentiment
-1    457913
 0    410038
 1    170166
Name: count, dtype: int64
Unique values in sentiments: [ 0 -1  1]
Unique values in RoBERTa_sentiment: [ 0 -1  1]


In [135]:
# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [137]:
# Step 4: Build the FFNN Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128),  # Embedding layer
    Flatten(),  # Flatten the output from the embedding layer
    Dense(128, activation='relu'),  # Hidden layer 1
    Dense(64, activation='relu'),  # Hidden layer 2
    Dense(3, activation='softmax')  # Output layer for multi-class classification
])

In [139]:
# Step 5: Compile the Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [141]:
# Step 6: Train the Model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/10
[1m20763/20763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 6ms/step - accuracy: 0.7005 - loss: 0.6802 - val_accuracy: 0.7523 - val_loss: 0.5842
Epoch 2/10
[1m20763/20763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 6ms/step - accuracy: 0.7894 - loss: 0.5063 - val_accuracy: 0.7564 - val_loss: 0.5846
Epoch 3/10
[1m20763/20763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 6ms/step - accuracy: 0.8520 - loss: 0.3708 - val_accuracy: 0.7528 - val_loss: 0.6536
Epoch 4/10
[1m20763/20763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 6ms/step - accuracy: 0.8937 - loss: 0.2740 - val_accuracy: 0.7457 - val_loss: 0.7730
Epoch 5/10
[1m20763/20763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 6ms/step - accuracy: 0.9187 - loss: 0.2123 - val_accuracy: 0.7409 - val_loss: 0.9738
Epoch 6/10
[1m20763/20763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 6ms/step - accuracy: 0.9358 - loss: 0.1707 - val_accuracy: 0.7410 - val_loss:

In [143]:
# Step 7: Evaluate the Model
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels
y_test_classes = np.argmax(y_test, axis=1)  # Convert one-hot back to class labels

[1m6489/6489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 797us/step


In [145]:
# Print classification report
print(classification_report(y_test_classes, y_pred_classes, target_names=['Negative', 'Neutral', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.70      0.73      0.71     81713
     Neutral       0.70      0.64      0.67     33906
    Positive       0.77      0.77      0.77     92005

    accuracy                           0.73    207624
   macro avg       0.72      0.71      0.72    207624
weighted avg       0.73      0.73      0.73    207624

