In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.rename(columns={'v1':'target', 'v2':'text'}, inplace=True)
df

Unnamed: 0,target,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
df = df[['target', 'text']]
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = le.fit_transform(df['target'])


Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [None]:
vacob_size = 1000

In [None]:
ecoded_doc = [one_hot(d, vacob_size) for d in df['text']]
ecoded_doc

[[193,
  367,
  44,
  713,
  110,
  51,
  371,
  661,
  57,
  676,
  208,
  722,
  815,
  869,
  501,
  311,
  73,
  37,
  169,
  484],
 [291, 472, 991, 206, 535, 300],
 [34,
  979,
  661,
  960,
  402,
  238,
  555,
  723,
  320,
  383,
  401,
  630,
  491,
  744,
  366,
  569,
  749,
  383,
  723,
  990,
  723,
  931,
  979,
  797,
  723,
  55,
  885,
  324,
  41,
  238,
  714],
 [535, 204, 31, 65, 740, 558, 535, 926, 985, 920, 31],
 [691, 683, 540, 437, 715, 693, 723, 726, 715, 172, 502, 811, 315],
 [318,
  845,
  73,
  791,
  320,
  228,
  823,
  820,
  199,
  106,
  219,
  165,
  773,
  946,
  727,
  304,
  451,
  558,
  710,
  699,
  356,
  162,
  999,
  291,
  397,
  723,
  399,
  723,
  723,
  585,
  191,
  723,
  687],
 [83,
  662,
  301,
  755,
  413,
  727,
  723,
  421,
  595,
  727,
  227,
  138,
  727,
  727,
  996,
  439],
 [517,
  187,
  392,
  192,
  410,
  927,
  411,
  185,
  941,
  973,
  617,
  238,
  228,
  900,
  517,
  392,
  614,
  699,
  736,
  460,
  513,
  1

In [None]:
max_length = 50
padded_doc = pad_sequences(ecoded_doc, maxlen=max_length, padding='post')
padded_doc

array([[193, 367,  44, ...,   0,   0,   0],
       [291, 472, 991, ...,   0,   0,   0],
       [ 34, 979, 661, ...,   0,   0,   0],
       ...,
       [954, 769, 661, ...,   0,   0,   0],
       [415, 107,  62, ...,   0,   0,   0],
       [ 73, 216, 741, ...,   0,   0,   0]], dtype=int32)

In [None]:
labels = df['target'].values
labels

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(padded_doc, labels, test_size=0.2, random_state=42)

In [None]:
#build RNN
model = Sequential()
model.add(Embedding(vacob_size, 10, input_length=max_length))
model.add(SimpleRNN(128))
model.add(Dense(1, activation='sigmoid'))



In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.8553 - loss: 0.4234
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.8725 - loss: 0.3853
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.8592 - loss: 0.4105
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8609 - loss: 0.4084
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8645 - loss: 0.4000


<keras.src.callbacks.history.History at 0x7903e94d6560>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8760 - loss: 0.3796
Test Accuracy: 0.865470826625824


In [None]:
def predict_data(model,sentence,vocab_size,max_length):
  encoded_sentence = one_hot(sentence,vocab_size)
  padded_sentence = pad_sequences([encoded_sentence], maxlen=max_length, padding='post')
  prediction = model.predict(padded_sentence)
  return 'spam' if prediction > 0.5 else 'ham'


In [None]:
new__sentence = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question"


In [None]:
print(predict_data(model,new__sentence,vacob_size,max_length))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
ham
