### Importing Libraries

In [27]:
import pandas as pd
import numpy as np
import nltk
import re
import tensorflow as tf
import pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.utils import to_categorical

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()

# Extract precision, recall, and f1-score from classification report
report = classification_report(Y_test, y_pred_labels, output_dict=True)
for label, metrics in report.items():
    if isinstance(metrics, dict):  # Only process label metrics, not summary stats
        precision = metrics.get('precision', 0)
        recall = metrics.get('recall', 0)
        print(f'Label {label}: Precision = {precision:.2f}, Recall = {recall:.2f}')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Dataset

In [8]:
data = pd.read_csv("/content/Reddit_Data.csv")

print(data.head())
print("\n",data.shape)
data.isnull().sum()

                                                text  category
0   family mormon have never tried explain them t...         1
1  buddhism has very much lot compatible with chr...         1
2  seriously don say thing first all they won get...        -1
3  what you have learned yours and only yours wha...         0
4  for your own benefit you may want read living ...         1

 (37249, 2)


Unnamed: 0,0
text,100
category,0


### Data Cleaning

In [24]:
stop_words = set(stopwords.words('english'))

# Function
def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = " ".join([word for word in text.split() if word not in stop_words])
    text = text.lower()
    return text

# Applying function to dataset
data['cleaned_text'] = data['text'].apply(clean_text)

# Tokenize
data['tokens'] = data['cleaned_text'].apply(word_tokenize)

# Print
data['tokens'].head()

Unnamed: 0,tokens
0,"[family, mormon, never, tried, explain, still,..."
1,"[buddhism, much, lot, compatible, christianity..."
2,"[seriously, say, thing, first, get, complex, e..."
3,"[learned, want, teach, different, focus, goal,..."
4,"[benefit, may, want, read, living, buddha, liv..."


### Feature Extraction

In [25]:
# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features = 5000)

# Fit and transform the text data
X = vectorizer.fit_transform(data['cleaned_text'])
Y = data['category']

# Save
pickle.dump(vectorizer, open('tfidf_vectorizer.pkl', 'wb'))

# Print
print(X.shape)

(37249, 5000)


### Train-Test Split

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# Print
print(f"Training data shape: {X_train.shape} \nTest data shape: {X_test.shape}")

Training data shape: (29799, 5000) 
Test data shape: (7450, 5000)


### Convert labels to categorical

In [32]:
# Adjust labels for 3 classes: negative, neutral, positive
Y_train_encoded = to_categorical(Y_train + 1, num_classes=3)
Y_test_encoded = to_categorical(Y_test + 1, num_classes=3)


### Building Model

In [33]:
model = tf.keras.Sequential()

# Input layer
model.add(tf.keras.layers.Dense(256, input_shape = (X_train.shape[1],), activation = 'relu'))

# Hidden layers
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation = 'relu'))
model.add(tf.keras.layers.Dropout(0.5))

# Output layer
model.add(tf.keras.layers.Dense(3, activation = 'softmax'))


# Compile
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Print
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [30]:
# Convert sparse matrices to dense
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

### Train the Model

In [34]:
history = model.fit(X_train_dense, Y_train_encoded, epochs = 30, batch_size = 32, validation_data = (X_test_dense, Y_test_encoded), verbose =1)

# Save
model.save('sentiment_analysis.h5')

Epoch 1/30
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - accuracy: 0.6322 - loss: 0.7917 - val_accuracy: 0.8442 - val_loss: 0.4388
Epoch 2/30
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 21ms/step - accuracy: 0.8901 - loss: 0.3266 - val_accuracy: 0.8510 - val_loss: 0.4303
Epoch 3/30
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19ms/step - accuracy: 0.9351 - loss: 0.2161 - val_accuracy: 0.8510 - val_loss: 0.4585
Epoch 4/30
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 21ms/step - accuracy: 0.9535 - loss: 0.1572 - val_accuracy: 0.8428 - val_loss: 0.5227
Epoch 5/30
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19ms/step - accuracy: 0.9680 - loss: 0.1133 - val_accuracy: 0.8448 - val_loss: 0.5775
Epoch 6/30
[1m932/932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - accuracy: 0.9777 - loss: 0.0795 - val_accuracy: 0.8420 - val_loss: 0.6658
Epoch 7/30
[1m9



### Evaluation

In [35]:
# Accuracy
loss, accuracy = model.evaluate(X_test_dense, Y_test_encoded)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Classification Report
Y_pred = model.predict(X_test_dense)
y_pred_labels = np.argmax(Y_pred, axis=1) - 1

# Classification report
print("Classification Report:")
print(classification_report(Y_test, y_pred_labels))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(Y_test, y_pred_labels))

# Extract precision, recall, and f1-score from classification report
report = classification_report(Y_test, y_pred_labels, output_dict=True)
for label, metrics in report.items():
    if isinstance(metrics, dict):  # Only process label metrics, not summary stats
        precision = metrics.get('precision', 0)
        recall = metrics.get('recall', 0)
        print(f'Label {label}: Precision = {precision:.2f}, Recall = {recall:.2f}')


[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8441 - loss: 1.2633
Test Accuracy: 83.91%
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Classification Report:
              precision    recall  f1-score   support

          -1       0.77      0.72      0.74      1667
           0       0.86      0.90      0.88      2615
           1       0.86      0.85      0.85      3168

    accuracy                           0.84      7450
   macro avg       0.83      0.82      0.83      7450
weighted avg       0.84      0.84      0.84      7450

Confusion Matrix:
[[1208  165  294]
 [ 111 2357  147]
 [ 260  222 2686]]


In [38]:
def predict_sentiment(text):
# Clean the input text using the previously defined clean_text function
  cleaned_text = clean_text(text)

  # Transform the cleaned text using the vectorizer
  vectorized_text = vectorizer.transform([cleaned_text]).toarray()

  # Predict sentiment
  prediction = model.predict(vectorized_text)
  sentiment_class = np.argmax(prediction)  # Get the predicted class (0, 1, 2)

  # Map the sentiment class to labels
  sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
  return sentiment_mapping[sentiment_class]


In [46]:
# Test examples
examples = [
    "Good",
    "Fine",
    "Moderate"
]

for example in examples:
    sentiment = predict_sentiment(example)
    print(f"Text: {example}\nPredicted Sentiment: {sentiment}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Text: Good
Predicted Sentiment: Positive

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Text: Fine
Predicted Sentiment: Positive

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Text: Moderate
Predicted Sentiment: Neutral

