In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer

# Load the dataset
df = pd.read_csv(r"C:\Users\Vidya\Desktop\vidya\text.csv")

# Preprocess text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)         # Tokenization
    stop_words = set(stopwords.words('english'))  # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()            # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    preprocessed_text = ' '.join(stemmed_tokens)
    return preprocessed_text

# Apply text preprocessing
df['preprocessed_text'] = df['text'].apply(preprocess_text)



In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['label'], test_size=0.2, random_state=42)



In [18]:
print(X_train)
print(y_train)
print(X_test)
print(y_test)

146122    ive blab enough tonight im tire ive feel prett...
334806    woke realli earli morn drove feel ecstat every...
182273    feel never gave rest day megabrick feel stubbo...
198898            feel restless teari flat sad strang today
80779                      feel like im doom ive even began
                                ...                        
259178                                 feel love fell belov
365838    realiz often time isnt reaction start feel pro...
131932                                          feel enviou
146867    im still impati frequent irrit time inexplic h...
121958                                     feel weird apart
Name: preprocessed_text, Length: 333447, dtype: object
146122    0
334806    1
182273    3
198898    4
80779     0
         ..
259178    2
365838    1
131932    3
146867    1
121958    4
Name: label, Length: 333447, dtype: int64
36130     id say mayb made feel foolish would reeeeeeall...
138065    join ld church admit feel somewhat asham 

In [14]:
# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences and pad them
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=100, padding='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=100, padding='post')

# Build a Sequential model with bidirectional LSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=16, input_shape=(100,)))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dense(units=6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()



  super().__init__(**kwargs)


In [19]:
print(X_train_padded)
print(y_train)
print(X_test_padded)
print(y_test)

[[ 21  83 513 ...   0   0   0]
 [313   6 754 ...   0   0   0]
 [  1  51 743 ...   0   0   0]
 ...
 [  1 627   0 ...   0   0   0]
 [  3  20 494 ...   0   0   0]
 [  1 174 846 ...   0   0   0]]
146122    0
334806    1
182273    3
198898    4
80779     0
         ..
259178    2
365838    1
131932    3
146867    1
121958    4
Name: label, Length: 333447, dtype: int64
[[ 145   30  201 ...    0    0    0]
 [1085  774  292 ...    0    0    0]
 [ 223  292   54 ...    0    0    0]
 ...
 [ 376   46 1070 ...    0    0    0]
 [1611  606  315 ...    0    0    0]
 [  95    1  299 ...    0    0    0]]
36130     0
138065    0
146440    3
103337    0
315528    1
         ..
7818      0
398821    1
235291    1
148780    3
409429    0
Name: label, Length: 83362, dtype: int64


In [11]:
# Train the model
history = model.fit(X_train_padded, y_train, epochs=10, validation_data=(X_test_padded, y_test))



Epoch 1/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 30ms/step - accuracy: 0.7999 - loss: 0.5075 - val_accuracy: 0.9197 - val_loss: 0.1594
Epoch 2/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 30ms/step - accuracy: 0.9245 - loss: 0.1469 - val_accuracy: 0.9215 - val_loss: 0.1505
Epoch 3/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 30ms/step - accuracy: 0.9283 - loss: 0.1329 - val_accuracy: 0.9212 - val_loss: 0.1409
Epoch 4/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 30ms/step - accuracy: 0.9300 - loss: 0.1270 - val_accuracy: 0.9217 - val_loss: 0.1398
Epoch 5/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 30ms/step - accuracy: 0.9318 - loss: 0.1209 - val_accuracy: 0.9193 - val_loss: 0.1408
Epoch 6/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 30ms/step - accuracy: 0.9346 - loss: 0.1155 - val_accuracy: 0.9212 - val

In [12]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'\nTest Accuracy (BiLSTM): {accuracy}')


[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 10ms/step - accuracy: 0.9111 - loss: 0.1558

Test Accuracy (BiLSTM): 0.9112305641174316


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

 

In [16]:
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)


In [20]:
random_forest.fit(X_train_padded, y_train)


In [22]:
y_pred = random_forest.predict(X_test_padded)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

Test Accuracy: 0.43555816799021135
              precision    recall  f1-score   support

           0       0.47      0.52      0.49     24201
           1       0.43      0.71      0.54     28164
           2       0.21      0.07      0.11      6929
           3       0.50      0.16      0.24     11441
           4       0.36      0.12      0.18      9594
           5       0.17      0.06      0.09      3033

    accuracy                           0.44     83362
   macro avg       0.36      0.27      0.27     83362
weighted avg       0.42      0.44      0.39     83362

