In [65]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [66]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ROMIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROMIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
df=pd.read_csv('Twitter_Data.csv')
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [68]:
#Change our dependent variable to categorical. (0 to “Neutral,”-1 to “Negative”, 1 to “Positive”)
df['category']=df['category'].map({0: 'Neutral', -1: 'Negative', 1: 'Positive'})

In [69]:
#Missing Value Analysis
df.isnull()

Unnamed: 0,clean_text,category
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
162975,False,False
162976,False,False
162977,False,False
162978,False,False


In [70]:
df.dropna(inplace=True)#drop missing values

In [71]:
# Text cleaning
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def clean_text(text):
    # Tokenize words
    words = word_tokenize(text)
    # Remove punctuation and convert to lowercase
    words = [ps.stem(word.lower()) for word in words if word.isalnum()]
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned_text'] = df['clean_text'].apply(clean_text)

# Step 5: Create a new column for the length of each sentence
df['sentence_length'] = df['cleaned_text'].apply(lambda x: len(x.split()))

In [72]:
X = df['cleaned_text']#X is independent dataframe
Y = df['category']#Y is dependent dataframe

In [73]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
encoded_sentences = tokenizer.texts_to_sequences(X)

# Add padding from the front side
max_length = 10  # Define your desired sequence length
padded_sequences = pad_sequences(encoded_sentences, maxlen=max_length, padding='pre')

In [74]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [77]:
labels=df['category'].map({'Neutral':0,'Negative':-1,'Positive':1})
labels

0        -1
1         0
2         1
3         1
4         1
         ..
162975   -1
162976   -1
162977    0
162978    0
162979    1
Name: category, Length: 162969, dtype: int64

In [78]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [79]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1297cd70280>

In [80]:
# Predict on test data
predictions = model.predict(X_test)

# Normalize predictions to binary values (0 or 1)
normalized_predictions = [1 if prediction >= 0.5 else 0 for prediction in predictions]

# Calculate accuracy and other metrics
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, normalized_predictions)
class_report = classification_report(y_test, normalized_predictions)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(class_report)


Accuracy: 0.5462355034668958
Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      7152
           0       0.44      0.82      0.57     11067
           1       0.74      0.61      0.67     14375

    accuracy                           0.55     32594
   macro avg       0.39      0.48      0.41     32594
weighted avg       0.47      0.55      0.49     32594



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
