In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers import Embedding
from keras.preprocessing import sequence
import re
import nltk
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upend\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df_train=pd.read_csv('dreaddit-train.csv')
df_test=pd.read_csv('dreaddit-test.csv')

In [50]:
# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Load English stopwords
english_stopwords = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert text to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove digits and single characters
    text = re.sub(r'\b\w\b|\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text and remove stopwords
    words = [word for word in text.split() if word not in english_stopwords]
    
    # Stem the words
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    processed_text = ' '.join(stemmed_words)
    
    return processed_text

# Apply the preprocess_text function to the "text" column of your DataFrame
df_train["text"] = df_train["text"].apply(preprocess_text)

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences

# Initialize the TF-IDF vectorizer
max_features = 10000  # Maximum number of features (words) to keep in the vocabulary
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)

# Fit and transform the text data to TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(df_train['text'])

# Convert the TF-IDF matrix to an array
X_tfidf_array = X_tfidf.toarray()

# Pad sequences to ensure uniform length
max_sequence_length = 100  # Adjust this according to your data
X_padded = pad_sequences(X_tfidf_array, maxlen=max_sequence_length)

# X_padded now contains your TF-IDF features with padded sequences


In [52]:
X_padded

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, df_train['label'], test_size=0.2, random_state=42)

In [54]:
embedding_dim = 100  # Dimension of word embeddings
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=1, activation='sigmoid'))

In [55]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [56]:
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 lstm_2 (LSTM)               (None, 128)               117248    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1117377 (4.26 MB)
Trainable params: 1117377 (4.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [57]:
epochs = 10  # Number of training epochs
batch_size = 64  # Batch size
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2a09f075360>

In [58]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.6908356547355652
Test Accuracy: 0.5369718074798584


In [59]:
# Assuming 'model' is your trained LSTM model and 'tokenizer' is your Tokenizer

# Define a function to preprocess a single input text
def preprocess_input(text):
    # Tokenize the input text
    text_sequence = tokenizer.texts_to_sequences([text])
    # Pad the sequence
    text_sequence_padded = pad_sequences(text_sequence, maxlen=max_sequence_length)
    return text_sequence_padded

# Define a function to predict sentiment for a single input text
def predict_sentiment(input_text):
    # Preprocess the input text
    input_text_preprocessed = preprocess_input(input_text)
    # Make predictions
    predictions = model.predict(input_text_preprocessed)
    # Assuming 0 indicates negative sentiment and 1 indicates positive sentiment
    sentiment = "Positive" if predictions[0] > 0.5 else "Negative"
    return sentiment

# Example usage:
input_text = "I want to die"
sentiment = predict_sentiment(input_text)
print("Sentiment:", sentiment)


Sentiment: Positive
