In [4]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from sklearn.model_selection import train_test_split

In [5]:
# Load the dataset
df = pd.read_csv('../improved_dataset.csv')

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upend\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Load English stopwords
english_stopwords = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert text to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove digits and single characters
    text = re.sub(r'\b\w\b|\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text and remove stopwords
    words = [word for word in text.split() if word not in english_stopwords]
    
    # Stem the words
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    processed_text = ' '.join(stemmed_words)
    
    return processed_text

# Apply the preprocess_text function to the "text" column of your DataFrame
df["text"] = df["text"].apply(preprocess_text)


In [8]:
# Split the dataset into features and labels
X = df['text']
y = df['label']

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# Pad sequences to ensure uniform length
X_padded = pad_sequences(X_seq)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [11]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_length = X_padded.shape[1]

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(128))
model.add(Dropout(0.5)) # Adding dropout with a rate of 0.5 (adjust as needed)
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
model.summary()

In [13]:
vocab_size

7823

In [14]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 95ms/step - accuracy: 0.5920 - loss: 0.6691 - val_accuracy: 0.7003 - val_loss: 0.5544
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.8153 - loss: 0.4604 - val_accuracy: 0.7709 - val_loss: 0.4352
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.9151 - loss: 0.2324 - val_accuracy: 0.7840 - val_loss: 0.4459
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.9587 - loss: 0.1513 - val_accuracy: 0.7932 - val_loss: 0.5130
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.9814 - loss: 0.0682 - val_accuracy: 0.7932 - val_loss: 0.6621
Epoch 6/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 80ms/step - accuracy: 0.9898 - loss: 0.0422 - val_accuracy: 0.7958 - val_loss: 0.8233
Epoch 7/10
[1m48/48[0m [32m━━━━

<keras.src.callbacks.history.History at 0x2b45eb95960>

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7949 - loss: 0.8484
Test Accuracy: 0.7801046967506409


In [19]:
# Preprocess the input text
def preprocess_input(text):
    # Tokenize the text
    text_seq = tokenizer.texts_to_sequences([text])
    # Pad sequences to ensure uniform length
    text_padded = pad_sequences(text_seq, maxlen=max_length)
    return text_padded

# Function to make predictions
def predict_sentiment(text):
    # Preprocess the input
    text_padded = preprocess_input(text)
    # Make predictions
    prediction = model.predict(text_padded)
    return prediction[0][0]

# Example usage
text = "I feel more sad"
prediction = predict_sentiment(text)
def classify_stress(prediction, threshold=0.5):
    if prediction >= threshold:
        return str(f"Stressful+{prediction}")
    else:
        return str(f"Not Stressful+{prediction}")

# Example usage
stress_classification = classify_stress(prediction)
print("Classification:", stress_classification)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Classification: Stressful+0.8774896264076233
