In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import re
import string
from bs4 import BeautifulSoup
from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud


In [2]:
reddit_data = pd.read_csv('/content/Reddit_Data.csv')
reddit_data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [3]:
reddit_data.shape

(37249, 2)

In [4]:
reddit_data.duplicated().sum()

449

In [5]:
reddit_data.drop_duplicates(inplace=True)

In [6]:
reddit_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36800 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  36799 non-null  object
 1   category       36800 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 862.5+ KB


In [7]:
reddit_data.isnull().sum()

Unnamed: 0,0
clean_comment,1
category,0


In [8]:
reddit_data.dropna(inplace=True)

In [9]:
twitter_data = pd.read_csv('/content/Twitter_Data.csv')
twitter_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [10]:
twitter_data.duplicated().sum()

1

In [11]:
twitter_data.drop_duplicates(inplace=True)

In [12]:
twitter_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162979 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162972 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [13]:
twitter_data.isnull().sum()

Unnamed: 0,0
clean_text,3
category,7


In [14]:
twitter_data.dropna(inplace=True)

In [15]:
twitter_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [16]:
reddit_data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [17]:
reddit_data.rename(columns = {'clean_comment':'clean_text'}, inplace=True)

In [18]:
reddit_data.head()

Unnamed: 0,clean_text,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [19]:
final_data = pd.concat([reddit_data, twitter_data], axis = 0)
final_data.head()

Unnamed: 0,clean_text,category
0,family mormon have never tried explain them t...,1.0
1,buddhism has very much lot compatible with chr...,1.0
2,seriously don say thing first all they won get...,-1.0
3,what you have learned yours and only yours wha...,0.0
4,for your own benefit you may want read living ...,1.0


In [20]:
final_data.shape

(199768, 2)

In [21]:
final_data.reset_index(inplace=True, drop=True)

In [22]:
final_data.index

RangeIndex(start=0, stop=199768, step=1)

In [23]:
final_data.isnull().sum()

Unnamed: 0,0
clean_text,0
category,0


In [24]:
final_data['clean_text'][123]

'modiji you use condom wait '

In [25]:
final_data['clean_text'][3345]

'another user feels betrayed and wounded because the great chief modi4pm has misused his trust saar had much trust you upvoted every post yours believed every word you said laughed every joke yours defended every time some one suspected you sent threatening pms people went raving about you every one dug wiki articles prove how you were right everything when you were not making fun them suggested your name for presidentship modship thought you were best thing happen humanity since jesus complaints are lengthy and many but the feeling betrayal broken promises bleeding heart wounded animal only once did trust someone and was you the greatest trust the one pose strangers because they don know and hence won judge why chief why why did you this right now lie low snakes belly don want live any more hahaahahahaha this some funny shit that good recap ping pon\n'

In [26]:
final_data.shape

(199768, 2)

In [27]:
final_data['category'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
category,Unnamed: 1_level_1
1.0,0.440611
0.0,0.34034
-1.0,0.219049


In [30]:
from tqdm import tqdm

final_text = []

for sentence in tqdm(final_data['clean_text'].values):
  sentence = BeautifulSoup(sentence, 'html.parser').get_text()
  sentence = re.sub(r'http\S+|www\S+|https\S+', '', sentence, flags=re.MULTILINE)
  sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
  sentence = sentence.translate(str.maketrans('', '', string.punctuation))
  sentence = sentence.lower().strip()
  sentence_words = nltk.word_tokenize(sentence)
  stop_words = set(stopwords.words('english'))
  filtered_words = [word for word in sentence_words if word not in stop_words]
  lemmatizer = WordNetLemmatizer()
  lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
  final_text.append(' '.join(lemmatized_words))


100%|██████████| 199768/199768 [01:28<00:00, 2248.01it/s]


In [31]:
final_text[111]

'bjp forward caste leader advani rajnath joshi naidu gadkari jaitley yashwant sushma accept modi prime minister candidate collude congress cbi malign dump eliminate modi appropriate time pretext godhra riot till bjp use modi getting urban india vote election google corrupt money forward caste people saradha group financial scandal billion igi airport scam billion coal mining scam billion karnataka wakf board land scam billion andhra pradesh land scam billion service tax central excise duty fraud billion gujarat psu financial irregularity billion fdes maharashtra stamp duty scam million highway scam million ministry external affair gift scam fde himachal pradesh pulse scam month flying club fraud million jammu kashmir cricket association scam million punjab paddy scam million arvind joshi tinu joshi million uttar pradesh seed scam million obsolete french fighter jet billion nhrm billion goa mining scam million noida corporation farm land scandal million bellary mine scandal billion kash

In [32]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(final_data['category'])
y

array([2, 2, 0, ..., 1, 1, 2])

In [74]:
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, SimpleRNN, LSTM, GRU, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences


In [65]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_text)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

117295

In [66]:
sequences = tokenizer.texts_to_sequences(final_text)
sequences

[[89,
  32574,
  55,
  704,
  992,
  71,
  13205,
  20796,
  11,
  11,
  5,
  328,
  3002,
  5378,
  10172,
  37,
  3291,
  4488,
  26779,
  17578,
  4917,
  6155,
  892,
  42,
  3677,
  1383],
 [3677,
  54,
  122,
  12104,
  6069,
  831,
  1908,
  3646,
  1433,
  600,
  42,
  1433,
  2648,
  3927,
  42,
  20,
  76,
  208,
  42,
  183,
  65,
  1191,
  26,
  120,
  3927,
  42,
  16399,
  262,
  3927,
  42,
  16399,
  141,
  2864,
  4429,
  3677,
  108,
  22,
  231,
  351,
  262,
  6070,
  32575,
  1471,
  1191,
  2306,
  7,
  8129,
  369,
  182,
  252,
  60,
  56,
  1191,
  180,
  262,
  4430,
  13206,
  3677,
  262,
  65,
  730,
  4430,
  1381,
  266,
  26,
  22,
  172,
  1598,
  42,
  4429,
  39,
  34,
  2143,
  68,
  838,
  352,
  578,
  92,
  208,
  3647,
  590,
  1289,
  7012,
  1816,
  2577,
  7137,
  161,
  32576,
  3821,
  4429,
  152,
  225,
  624,
  225,
  332,
  1162,
  13206,
  4429,
  3163,
  107,
  5,
  3260,
  7138,
  7378,
  22,
  184,
  2143,
  4672,
  1348,
  129,
  26

In [67]:
max_length = max(len(seq) for seq in sequences)
max_length

879

In [68]:
min_length = min(len(seq) for seq in sequences)
min_length

0

In [69]:
sent_length = 30
X_padded = pad_sequences(sequences, maxlen=sent_length, padding='post')

In [70]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42, stratify = y)

In [75]:
model = Sequential()
model.add(Input(shape=(sent_length,)))
model.add(Embedding(input_dim=vocab_size, output_dim=20, input_length=sent_length))
model.add(LSTM(units=128, return_sequences=True))
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=32, return_sequences=False))
model.add(Dense(units=3, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [76]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m4995/4995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 10ms/step - accuracy: 0.7438 - loss: 0.6224 - val_accuracy: 0.8759 - val_loss: 0.3731
Epoch 2/10
[1m4995/4995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 10ms/step - accuracy: 0.8965 - loss: 0.3180 - val_accuracy: 0.8954 - val_loss: 0.3263
Epoch 3/10
[1m4995/4995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 11ms/step - accuracy: 0.9266 - loss: 0.2381 - val_accuracy: 0.8956 - val_loss: 0.3274
Epoch 4/10
[1m4995/4995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 10ms/step - accuracy: 0.9428 - loss: 0.1904 - val_accuracy: 0.8829 - val_loss: 0.3631
Epoch 5/10
[1m4995/4995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 10ms/step - accuracy: 0.9525 - loss: 0.1543 - val_accuracy: 0.8717 - val_loss: 0.4162
Epoch 6/10
[1m4995/4995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 10ms/step - accuracy: 0.9623 - loss: 0.1238 - val_accuracy: 0.8626 - val_loss: 0.4663
Epoc

<keras.src.callbacks.history.History at 0x7c3be88c8e90>

In [77]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.8386895060539246
