In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, SpatialDropout1D



In [2]:

df = pd.read_csv(r"C:\Users\Vidya\Desktop\vidya\text.csv")

# Preprocessing
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])
X = df['text']
y = df['label']

# Text preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    
    # Remove non-alphabetic words
    stemmed_tokens = [word for word in stemmed_tokens if word.isalpha()]
    
    # Join the remaining words back into a string
    preprocessed_text = ' '.join(stemmed_tokens)
    
    return preprocessed_text

# Apply text preprocessing
X_preprocessed = X.apply(preprocess_text)



In [3]:
# Tokenization
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_preprocessed)
X_seq = tokenizer.texts_to_sequences(X_preprocessed)
X_pad = pad_sequences(X_seq, maxlen=max_len)



In [4]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)




In [7]:
# Define the model
embedding_dim = 100
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_shape=(max_len,)))
model.add(SpatialDropout1D(0.2))
model.add(GRU(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [9]:
# Train the model
epochs = 10
batch_size = 64
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))


Epoch 1/10
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 49ms/step - accuracy: 0.6692 - loss: 25.4067 - val_accuracy: 0.7927 - val_loss: 0.6217
Epoch 2/10
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 47ms/step - accuracy: 0.8532 - loss: 0.4351 - val_accuracy: 0.8072 - val_loss: 0.5596
Epoch 3/10
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 47ms/step - accuracy: 0.8770 - loss: 0.3601 - val_accuracy: 0.8362 - val_loss: 0.4615
Epoch 4/10
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 48ms/step - accuracy: 0.9037 - loss: 352.9110 - val_accuracy: 0.8471 - val_loss: 0.4252
Epoch 5/10
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 49ms/step - accuracy: 0.9011 - loss: 0.2862 - val_accuracy: 0.8530 - val_loss: 0.3997
Epoch 6/10
[1m5211/5211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 49ms/step - accuracy: 0.9072 - loss: 0.2604 - val_accuracy: 0.8539 - val_loss: 0.

<keras.src.callbacks.history.History at 0x1bc11718b50>

In [10]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 9ms/step - accuracy: 0.8675 - loss: 0.3412
Test Accuracy: 0.8681893348693848


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [6]:
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

In [7]:
random_forest.fit(X_train, y_train)

In [8]:
y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

Test Accuracy: 0.422002831026127
              precision    recall  f1-score   support

           0       0.46      0.50      0.48     24201
           1       0.42      0.70      0.52     28164
           2       0.21      0.07      0.11      6929
           3       0.47      0.14      0.22     11441
           4       0.34      0.11      0.16      9594
           5       0.16      0.06      0.08      3033

    accuracy                           0.42     83362
   macro avg       0.34      0.26      0.26     83362
weighted avg       0.40      0.42      0.38     83362

