In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score

In [33]:
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]

In [34]:
df.columns = ['label', 'text']

In [35]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [37]:
df.tail()

Unnamed: 0,label,text
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [38]:
df.shape

(5572, 2)

In [39]:
df.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [40]:
df['label'] = LabelEncoder().fit_transform(df['label'])

In [41]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

In [42]:
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label'].values

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [45]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.8255 - loss: 0.5274 - val_accuracy: 0.9444 - val_loss: 0.1610
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9737 - loss: 0.1073 - val_accuracy: 0.9749 - val_loss: 0.0753
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9929 - loss: 0.0255 - val_accuracy: 0.9821 - val_loss: 0.0678
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9978 - loss: 0.0123 - val_accuracy: 0.9794 - val_loss: 0.0734
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9980 - loss: 0.0083 - val_accuracy: 0.9830 - val_loss: 0.0766
Epoch 6/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9992 - loss: 0.0049 - val_accuracy: 0.9794 - val_loss: 0.0877
Epoch 7/10
[1m140/140[

<keras.src.callbacks.history.History at 0x1f84b0ac8e0>

In [47]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  


In [48]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9820627802690582


In [50]:
def test_email(text):
    vector = vectorizer.transform([text]).toarray()
    prediction = model.predict(vector)
    print("Spam" if prediction[0] > 0.5 else "Not Spam")

test_email("Congratulations! You've won a free iPhone. Claim now!")
test_email("Hi team, please find the meeting notes attached.")

    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452ms/step
Spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
Not Spam
