In [54]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import tensorflow
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [55]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## data preprocessing

In [56]:
data = pd.read_csv("/content/Reddit_Data.csv")

In [57]:
data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [58]:
data.isnull().sum()

Unnamed: 0,0
clean_comment,100
category,0


In [59]:
data = data.dropna()

In [60]:
data.isnull().sum()

Unnamed: 0,0
clean_comment,0
category,0


In [61]:
data.shape

(37149, 2)

In [62]:
data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


## independent and dependent feature split

In [63]:
x = data.drop("category",axis=1)
y = data["category"]

In [64]:
x.shape

(37149, 1)

In [65]:
y.shape

(37149,)

## one hot representation

In [66]:
corpus = []
lemmatizer = WordNetLemmatizer()
for i in x.index:
  words = re.sub("[^a-zA-Z]"," ",x.loc[i,"clean_comment"])
  words = words.lower()
  words = words.split()
  words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words("english")]
  words = " ".join(words)
  corpus.append(words)

In [67]:
corpus

['family mormon never tried explain still stare puzzled time time like kind strange creature nonetheless come admire patience calmness equanimity acceptance compassion developed thing buddhism teach',
 'buddhism much lot compatible christianity especially considering sin suffering almost thing suffering caused wanting thing want going getting thing wrong way christian would mean wanting thing coincide god wanting thing coincide without aid jesus buddhism could also seen proof god mighty omnipotence certainly christian lucky one christ side everyone else well many christian believe god grace salvation buddhism god way showing grace upon others would also help study thing jesus said see buddha made similar claim rich man getting heaven joke basically advocating rid material possession fact distinctly remembered jesus making someone cry someone asked achieve salvation jesus replied live like buddhist roughly translated also point buddha rarely spoke anything god theory personally knew wel

In [68]:
voc_size = 5000
one_hot_repr = [one_hot(word,voc_size) for word in corpus]

In [69]:
one_hot_repr[0]

[3239,
 461,
 4299,
 4928,
 4234,
 2205,
 4890,
 1953,
 4735,
 4735,
 4935,
 4344,
 1369,
 4811,
 205,
 2940,
 2917,
 4312,
 1804,
 65,
 4976,
 1420,
 3140,
 3517,
 2525,
 2485]

In [70]:
corpus[0]

'family mormon never tried explain still stare puzzled time time like kind strange creature nonetheless come admire patience calmness equanimity acceptance compassion developed thing buddhism teach'

## Padding

In [71]:
sent_length = 0
for sent in one_hot_repr:
  if len(sent) > sent_length:
    sent_length = len(sent)
sent_length

879

In [72]:
embedded_docs = pad_sequences(one_hot_repr,padding="pre",maxlen=sent_length)
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## lSTM rnn model with embedded layer

In [94]:
Embedded_vector = 40
model = Sequential()
model.add(Embedding(voc_size,Embedded_vector,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,"sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [95]:
model.summary()

In [96]:
x = np.array(embedded_docs)
y = np.array(y)

## train test split

In [97]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

## model training

In [98]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 59ms/step - accuracy: 0.3941 - loss: 0.3695 - val_accuracy: 0.5874 - val_loss: -0.9347
Epoch 2/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 58ms/step - accuracy: 0.5836 - loss: -1.4649 - val_accuracy: 0.4801 - val_loss: -1.5412
Epoch 3/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - accuracy: 0.5604 - loss: -4.2410 - val_accuracy: 0.5575 - val_loss: -5.5212
Epoch 4/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 56ms/step - accuracy: 0.5677 - loss: -8.7834 - val_accuracy: 0.5463 - val_loss: -7.0208
Epoch 5/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 58ms/step - accuracy: 0.5773 - loss: -11.8100 - val_accuracy: 0.5783 - val_loss: 0.5233
Epoch 6/10
[1m389/389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 57ms/step - accuracy: 0.5252 - loss: -5.2932 - val_accuracy: 0.5869 - val_loss: -7.6003
Epoch

<keras.src.callbacks.history.History at 0x7ff664704340>

In [99]:
model.summary()

In [100]:
pred = model.predict(x_test)

[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step
