# **RNN**

In [79]:
# importing the libraries for the RNN
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import MinMaxScaler

In [80]:
# importing the data
df = pd.read_csv("/content/spam.csv", encoding='latin-1')

In [81]:
# printing the first 5
df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [82]:
#printing the last 5 from the data
df.tail()

Unnamed: 0,class,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [83]:
df.columns

Index(['class', 'message'], dtype='object')

In [84]:
df.dtypes

Unnamed: 0,0
class,object
message,object


In [85]:
df.shape

(5572, 2)

In [86]:
print(df.isnull().sum())

class      0
message    0
dtype: int64


In [87]:
print(df.duplicated().sum())

403


In [88]:
df[df.duplicated()]

Unnamed: 0,class,message
102,ham,As per your request 'Melle Melle (Oru Minnamin...
153,ham,As per your request 'Melle Melle (Oru Minnamin...
206,ham,"As I entered my cabin my PA said, '' Happy B'd..."
222,ham,"Sorry, I'll call later"
325,ham,No calls..messages..missed calls
...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...
5535,ham,"I know you are thinkin malaria. But relax, chi..."
5539,ham,Just sleeping..and surfing
5553,ham,Hahaha..use your brain dear


In [89]:
# droping the duplicated values
df.drop_duplicates(inplace=True)

In [90]:
df.duplicated().sum()

np.int64(0)

In [91]:
print(df.value_counts('class'))

class
ham     4516
spam     653
Name: count, dtype: int64


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   class    5169 non-null   object
 1   message  5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


In [93]:
print(df.describe())

       class                     message
count   5169                        5169
unique     2                        5169
top      ham  Rofl. Its true to its name
freq    4516                           1


In [94]:
import string
df['class'] = df['class'].map({'ham': 0, 'spam': 1})

# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['message'] = df['message'].apply(clean_text)

In [95]:
# we taken 5000
max_words = 5000
max_len = 100

In [96]:
# Tokenized
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['message'])
X = tokenizer.texts_to_sequences(df['message'])
X = pad_sequences(X, maxlen=max_len)
y = df['class'].values

In [97]:
# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
# Build the RNN Model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential

model = Sequential([
    Embedding(input_dim=max_words, output_dim=64),
    SimpleRNN(64, return_sequences=True),
    SimpleRNN(32),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [99]:
# Compile the Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [100]:
# Model Summary
model.summary()

In [101]:
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 59ms/step - accuracy: 0.8010 - loss: 0.4339 - val_accuracy: 0.9516 - val_loss: 0.1426
Epoch 2/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 0.9741 - loss: 0.1062 - val_accuracy: 0.9794 - val_loss: 0.0705
Epoch 3/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - accuracy: 0.9881 - loss: 0.0587 - val_accuracy: 0.9843 - val_loss: 0.0530
Epoch 4/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step - accuracy: 0.9957 - loss: 0.0248 - val_accuracy: 0.9819 - val_loss: 0.0605
Epoch 5/5
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - accuracy: 0.9953 - loss: 0.0231 - val_accuracy: 0.9722 - val_loss: 0.0808


In [102]:
# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9716 - loss: 0.1099
Test Loss: 0.11006432771682739
Test Accuracy: 0.9719535708427429


In [103]:
model.summary()