In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

# Load NLTK resources
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vidya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:



# Function to preprocess text
def preprocess_text(text):
    # Initialize stop words
    stop_words = set(stopwords.words('english'))
    
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters and replace with space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Tokenize text
    words = text.split()
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Stemming
    stemmer = SnowballStemmer("english")
    words = [stemmer.stem(word) for word in words]
    # Join the words back into a string
    text = ' '.join(words)
    return text

file_path = r"C:\Users\Vidya\Desktop\vidya\text.csv"
data = pd.read_csv(file_path)


# Preprocess the text
data['text'] = data['text'].apply(preprocess_text)

# Assuming your data is in the format provided
tweets = data['text']
labels = data['label']




In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,feel realli helpless heavi heart,4
1,1,ive enjoy abl slouch relax unwind frank need l...,0
2,2,gave internship dmrg feel distraught,4
3,3,dont know feel lost,0
4,4,kindergarten teacher thorough weari job taken ...,4


In [7]:
data.shape

(416809, 3)

In [8]:
# Preprocess the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
max_sequence_length = max([len(x) for x in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)



In [9]:
sequences

[[1, 6, 213, 1257, 133],
 [21,
  169,
  114,
  12246,
  267,
  4939,
  1651,
  27,
  81,
  67,
  53,
  82,
  2634,
  11415,
  136,
  29,
  60,
  1,
  26,
  810,
  51,
  6,
  38,
  16],
 [755, 4248, 26262, 1, 998],
 [28, 4, 1, 177],
 [4524,
  849,
  1908,
  2681,
  226,
  626,
  856,
  3741,
  1107,
  358,
  964,
  67,
  10,
  847,
  31,
  739,
  1634],
 [282, 1, 65, 647],
 [15, 12, 6457, 15, 361, 84, 264, 1685, 227, 1, 2, 479, 117, 992],
 [199, 101, 1, 528, 141, 654, 2845, 244, 23, 99],
 [3, 954, 46, 7, 617, 1, 166],
 [19, 379, 469, 94, 1, 543],
 [35, 398, 55, 5, 537, 49, 781, 1167, 1, 250, 13, 695, 834, 2189, 173],
 [3, 1, 13, 2, 556, 1203, 1918, 13, 19462],
 [1785, 1609, 3, 1, 665, 59, 15],
 [1,
  2,
  2062,
  1839,
  538,
  1546,
  11,
  10739,
  48,
  321,
  1442,
  727,
  10739,
  3305,
  722,
  564,
  538,
  1501,
  3226,
  1240,
  577,
  1224],
 [1, 4872, 3264, 515, 207, 4, 92, 131, 154, 276, 3374],
 [134, 4525, 181, 1340, 24, 46, 97, 104, 327, 1, 27, 46, 97, 56],
 [1, 767],
 [1

In [10]:
padded_sequences

array([[    0,     0,     0, ...,   213,  1257,   133],
       [    0,     0,     0, ...,     6,    38,    16],
       [    0,     0,     0, ..., 26262,     1,   998],
       ...,
       [    0,     0,     0, ...,     7,  1404,   354],
       [    0,     0,     0, ..., 51531,   114,    31],
       [    0,     0,     0, ...,     1,   507,    14]])

In [11]:
encoded_labels

array([4, 0, 4, ..., 5, 3, 5], dtype=int64)

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, encoded_labels, test_size=0.2, random_state=42)



In [30]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[   0    0    0 ...  523 7223  972]
 [   0    0    0 ...  284  714   11]
 [   0    0    0 ...  278  265 5922]
 ...
 [   0    0    0 ...    0    1  631]
 [   0    0    0 ...  265 4970  105]
 [   0    0    0 ...    1  166  851]]
[[    0     0     0 ...    15 28510   658]
 [    0     0     0 ...  4636  2342   688]
 [    0     0     0 ...   748  1012   105]
 ...
 [    0     0     0 ...     1   515  1768]
 [    0     0     0 ...     1  1136   428]
 [    0     0     0 ...  1064   142    16]]
[0 1 3 ... 3 1 4]
[0 0 3 ... 1 3 0]


In [14]:
# Define the LSTM model
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_sequence_length, )))
model.add(LSTM(units=128))
model.add(Dense(units=6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



  super().__init__(**kwargs)


In [31]:
model.summary()

In [15]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))



Epoch 1/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m874s[0m 84ms/step - accuracy: 0.8553 - loss: 0.3510 - val_accuracy: 0.9232 - val_loss: 0.1443
Epoch 2/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m885s[0m 85ms/step - accuracy: 0.9259 - loss: 0.1345 - val_accuracy: 0.9233 - val_loss: 0.1384
Epoch 3/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m889s[0m 85ms/step - accuracy: 0.9311 - loss: 0.1208 - val_accuracy: 0.9210 - val_loss: 0.1393
Epoch 4/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m885s[0m 85ms/step - accuracy: 0.9362 - loss: 0.1097 - val_accuracy: 0.9169 - val_loss: 0.1458
Epoch 5/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m913s[0m 88ms/step - accuracy: 0.9373 - loss: 0.1046 - val_accuracy: 0.9114 - val_loss: 0.1501
Epoch 6/10
[1m10421/10421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m925s[0m 88ms/step - accuracy: 0.9411 - loss: 0.0976 - val_accuracy: 0.9106 - val

<keras.src.callbacks.history.History at 0x1f3e6235090>

In [16]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 21ms/step - accuracy: 0.9036 - loss: 0.2076
Test Loss: 0.21085695922374725, Test Accuracy: 0.9036731123924255


In [None]:
###evaluation

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [14]:
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

In [15]:
random_forest.fit(X_train, y_train)

In [16]:
y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

Test Accuracy: 0.4157409851011252
              precision    recall  f1-score   support

           0       0.46      0.50      0.48     24201
           1       0.41      0.70      0.52     28164
           2       0.19      0.06      0.10      6929
           3       0.44      0.12      0.19     11441
           4       0.32      0.09      0.15      9594
           5       0.14      0.05      0.07      3033

    accuracy                           0.42     83362
   macro avg       0.33      0.25      0.25     83362
weighted avg       0.39      0.42      0.37     83362

