<a href="https://colab.research.google.com/github/OPtimus4139/Sentiment_Analysis/blob/main/Children_Stories_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/B.Tech_Course_Project/Children_Stories_Sentiment_Analysis
! ls

/content/drive/MyDrive/B.Tech_Course_Project/Children_Stories_Sentiment_Analysis
100-Stories-with-sentiment-analysis.xlsx


In [40]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [4]:
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
file_path = '100-Stories-with-sentiment-analysis.xlsx'
data = pd.read_excel(file_path)

In [56]:
# Initialize VADER sentiment analyzer and scaling
sia = SentimentIntensityAnalyzer()
scaler = MinMaxScaler()

# Fill missing 'story' values with an empty string
data['processed_text'] = data['processed_text'].fillna("")

# Scale valence, arousal, and dominance
data[['valence', 'arousal', 'dominance']] = scaler.fit_transform(data[['valence', 'arousal', 'dominance']])

# Text cleaning function
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\<.*?\>', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    return " ".join(tokens)

# Apply text cleaning
data['cleaned_text'] = data['processed_text'].apply(clean_text)

In [57]:
data.columns

Index(['Unnamed: 0', 'url', 'length', 'title', 'text_no', 'author', 'story',
       'valence', 'arousal', 'dominance',
       ...
       '292', '293', '294', '295', '296', '297', '298', '299',
       'sentiment_label', 'cleaned_text'],
      dtype='object', length=320)

In [59]:
# Sentiment labeling
def label_sentiment(text):
    polarity_score = sia.polarity_scores(text)['compound']
    if polarity_score > 0.5:
        return 1
    elif polarity_score < 0.5:
        return 0

data['sentiment_label'] = data['cleaned_text'].apply(label_sentiment)

In [60]:
X = data[[str(i) for i in range(300)]]  # Selecting columns 0 to 299 as features
y = data['sentiment_label']  # The target label column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Print the shapes to confirm
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (79, 1, 300)
X_test shape: (20, 1, 300)
y_train shape: (79,)
y_test shape: (20,)


In [61]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Flatten, Dense
from sklearn.ensemble import RandomForestClassifier

In [51]:
model = Sequential()
model.add(Input(shape=(1, 300)))
model.add(LSTM(64, return_sequences=True,))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # For binary classification

In [52]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2s/step - accuracy: 0.5053 - loss: 0.6931 - val_accuracy: 0.4375 - val_loss: 0.6934
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.5367 - loss: 0.6930 - val_accuracy: 0.4375 - val_loss: 0.6938
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.5367 - loss: 0.6928 - val_accuracy: 0.4375 - val_loss: 0.6942
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.4950 - loss: 0.6933 - val_accuracy: 0.4375 - val_loss: 0.6943
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.5367 - loss: 0.6925 - val_accuracy: 0.4375 - val_loss: 0.6946
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - accuracy: 0.5367 - loss: 0.6924 - val_accuracy: 0.4375 - val_loss: 0.6949
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x781c0406f730>

In [54]:
# Extract LSTM features
feature_extractor = Sequential(model.layers[:-3])  # Excluding the last Dense layers
X_train_lstm_features = feature_extractor.predict(X_train)
X_train_lstm_features = X_train_lstm_features.reshape(X_train_lstm_features.shape[0], -1)

# If needed, reshape X_test_lstm_features as well (assuming you extract features for the test set)
X_test_lstm_features = feature_extractor.predict(X_test)
X_test_lstm_features = X_test_lstm_features.reshape(X_test_lstm_features.shape[0], -1)

# Train a Random Forest Classifier on the reshaped features
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_train_lstm_features, y_train, test_size=0.2, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_rf, y_train_rf)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 726ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


In [55]:
# Make predictions and evaluate
y_pred_rf = rf_clf.predict(X_test_rf)
accuracy = accuracy_score(y_test_rf, y_pred_rf)
print(f'Random Forest Classifier Accuracy: {accuracy:.4f}')

Random Forest Classifier Accuracy: 0.6250
