In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# loading the dataset

In [None]:
ds = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='latin1')

In [None]:
ds.head()

In [None]:
ds.info()

# feature selection

In [None]:
ds.isnull().sum()

In [None]:
ds = ds.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])

In [None]:
ds.info()

In [None]:
ds.columns = ['cls','email']

In [None]:
ds.head()

## checking duplicates

In [None]:
ds.duplicated().sum()

In [None]:
ds = ds.drop_duplicates(keep='first')

In [None]:
ds.duplicated().sum()

## encoding categorical values

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
ds['cls'] = le.fit_transform(ds['cls'])

In [None]:
ds.cls.value_counts()

##  ham : 0 ; spam : 1

In [None]:
ds.head()

In [None]:
ham = ds[ds['cls']==0]
spam = ds[ds['cls']==1]
ham.head(),spam.head()

In [None]:
ham_txt = " ".join(ham.email)
ham_txt[0:25]

In [None]:
spam_txt = " ".join(spam.email)
spam_txt[0:25]

# visualization

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure(figsize=(10,7))
wc = WordCloud(width=800,height=650).generate(ham_txt)
plt.imshow(wc)
plt.axis('off')
plt.title('word cloud of ham emails')
plt.show()

In [None]:
plt.figure(figsize=(10,7))
wc = WordCloud(width=800,height=650).generate(spam_txt)
plt.imshow(wc)
plt.axis('off')
plt.title('word cloud of spam emails')
plt.show()

In [None]:
# Bar chart of the number of messages in each encoded class.
plt.figure(figsize=(8,6))
ax = sns.countplot(x=ds['cls'])
ax.set_xlabel('labels')
ax.set_ylabel('count')
ax.set_title('count of classes of emails')
plt.show()

# preprocessing pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
model = Pipeline([
    ('vectorizer',TfidfVectorizer(
    tokenizer = lambda txt:[token.lemma_ for token in nlp(txt)],
    stop_words = 'english',
    ngram_range=(1,2),
    max_features=500))
])

In [None]:
x_prep = model.fit_transform(ds['email'])

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_prep,ds['cls'],test_size=0.2,random_state=42,stratify=ds['cls'])

In [None]:
x_train.shape,x_test.shape

In [None]:
max_words = x_prep.shape[1]  # Number of words in the TF-IDF vocabulary
max_length = x_prep.shape[1]  # Maximum sequence length

# model creation

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout,Dense,Embedding,Masking

In [None]:
# The inputs are TF-IDF feature vectors: floating-point weights, one per
# vocabulary term. An Embedding layer looks rows up by *integer* index, so
# feeding it TF-IDF weights truncates every weight below 1 to index 0, and
# mask_zero=True then masks those positions, leaving the LSTMs with nothing
# to learn from. A feed-forward network consumes the TF-IDF vector directly.
dl_model = Sequential([
    tf.keras.Input(shape=(max_length,)),  # one TF-IDF weight per vocabulary term
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # probability of the positive class (spam = 1)
])

In [None]:
dl_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
x_train = x_train.toarray()
x_test = x_test.toarray()

In [None]:
epochs = 5
batch_size = 64
history = dl_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, y_test))

# learning curve

In [None]:
def _plot_history_metric(history, metric, title, ylabel, n_epochs):
    """Plot one training metric and, when recorded, its validation counterpart."""
    epoch_range = range(1, n_epochs + 1)
    legend = ['Train']
    plt.plot(epoch_range, history.history[metric][:n_epochs])
    val_key = 'val_' + metric
    # Validation metrics only exist when fit() was given validation data.
    if val_key in history.history:
        plt.plot(epoch_range, history.history[val_key][:n_epochs])
        legend.append('val')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(legend, loc='upper left')
    plt.show()


def learning_curve(history, epoch=None):
    """Plot training vs validation accuracy, then training vs validation loss.

    Parameters
    ----------
    history : object with a ``history`` dict (as returned by ``model.fit``)
        Must contain the 'accuracy' and 'loss' series; the 'val_accuracy' and
        'val_loss' series are plotted when present.
    epoch : int, optional
        Number of epochs to show. Defaults to every recorded epoch. Values
        larger than the number of recorded epochs are clipped, so a run that
        stopped early no longer fails with mismatched x/y lengths.
    """
    recorded = len(history.history['loss'])
    n_epochs = recorded if epoch is None else max(0, min(epoch, recorded))

    # training vs validation accuracy
    _plot_history_metric(history, 'accuracy', 'Model Accuracy', 'Accuracy', n_epochs)
    # training vs validation loss
    _plot_history_metric(history, 'loss', 'Model Loss', 'Loss', n_epochs)

In [None]:
# Pass the same `epochs` value used for training rather than repeating the
# literal, so the plot stays in sync if the epoch count is changed.
learning_curve(history, epochs)