<a href="https://colab.research.google.com/github/SumeetsRoorkee/ML_Code/blob/main/Spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tqdm import tqdm
from textblob import TextBlob
import time
import re
import gensim
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
# Enable the tqdm integration for pandas; later cells call `progress_apply`.
tqdm.pandas()
# Kaggle boilerplate: input data files are available in the read-only "../input/" directory.
# The loop at the end of this cell prints the path of every file found under /kaggle/input.
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
import os
# Hide all CUDA devices so TensorFlow runs on the CPU.
# NOTE(review): this is set after the tensorflow imports above; presumably it
# still takes effect because no GPU work has happened yet — confirm, or move
# it above the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the provided train and test CSV files, decoded as ISO-8859-1.
# Later cells read the 'Label' and 'Message_body' columns of these frames.
train_df = pd.read_csv("/kaggle/input/email-classification-nlp/SMS_train.csv", encoding='iso-8859-1')
test_df = pd.read_csv("/kaggle/input/email-classification-nlp/SMS_test.csv", encoding='iso-8859-1')

In [None]:
# Show (rows, columns) of the train frame and then of the test frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Count how many training rows carry each raw label value.
train_df['Label'].value_counts()

In [None]:
from sklearn import preprocessing

# Encode the string labels as integers in a new 'Spam_Label' column.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both frames are guaranteed to share one label -> integer
# mapping. Re-fitting on the test frame would silently yield a different
# mapping whenever its set of labels differed from the training set.
label_encoder = preprocessing.LabelEncoder()
train_df['Spam_Label'] = label_encoder.fit_transform(train_df['Label'])
# `transform` raises ValueError for a label that never occurred in training.
test_df['Spam_Label'] = label_encoder.transform(test_df['Label'])
# Call head() so the cell displays the first rows instead of the bound method.
train_df.head()

In [None]:
# Display the first rows of the test frame (head must be called; the bare
# attribute only shows the bound method's repr).
test_df.head()

In [None]:
# Fetch the NLTK resources used by this notebook: the stop-word corpus read
# below and the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

# Keep the English stop words in a set: the notebook only ever tests tokens
# for membership in it, which is O(1) on a set instead of a linear scan
# over the list that `stopwords.words` returns.
stop_words = set(stopwords.words('english'))

In [None]:
# Stack the train and test frames into one frame so the text preprocessing
# below runs over all rows; ignore_index=True renumbers the rows from 0.
# A fresh train/test split is drawn later with train_test_split.
df = pd.concat([train_df, test_df], ignore_index=True)
df.head()

In [None]:

def proper_text(text):
    """Normalise a raw message before tokenisation.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, runs of digits are deleted, and
    finally whitespace is normalised.

    Whitespace is normalised last on purpose: deleting punctuation and
    digits can leave consecutive or trailing spaces behind (for example
    "a 123 b" becomes "a  b"), so collapsing first would not produce a
    cleanly single-spaced result.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string, single-spaced and without leading or trailing
        whitespace.
    """
    text = text.lower()
    # Remove punctuation. Underscores count as word characters and are kept.
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits.
    text = re.sub(r'\d+', '', text)
    # Collapse every whitespace run to one space and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:

# Clean every message in place with proper_text, showing a tqdm progress bar.
df['Message_body'] = df['Message_body'].progress_apply(proper_text)

In [None]:
def spell_correct(text):
    """Return *text* after passing it through TextBlob's spelling correction."""
    return TextBlob(text).correct().string

In [None]:
# Replace every cleaned message with its spell-corrected version.
# NOTE(review): the corrector may rewrite slang or deliberately misspelled
# spam tokens and is presumably slow over many rows — confirm it helps.
df['Message_body'] = df['Message_body'].progress_apply(spell_correct)

In [None]:
# Inspect the first rows after cleaning and spell correction.
df.head()

In [None]:

def preprocess(text):
    """Tokenise *text* and drop stop words.

    The text is tokenised with gensim's ``simple_preprocess``; a token is
    kept only when it appears neither in gensim's ``STOPWORDS`` nor in the
    notebook-level ``stop_words`` collection.

    Returns:
        The surviving tokens as a list, in their original order.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if not (word in gensim_stopwords or word in stop_words)
    ]

In [None]:
# Tokenise every message and drop stop words; each row of 'token' holds a list of tokens.
df['token'] = df['Message_body'].progress_apply(preprocess)

In [None]:
# Inspect the token list stored under row label 1.
df['token'][1]

In [None]:
# Flatten the per-message token lists into one list of tokens (duplicates kept).
vocab = [word for tokens in df['token'] for word in tokens]

In [None]:
# Peek at the first ten entries of the flattened token list.
vocab[0:10]

In [None]:
# Number of distinct tokens across all messages.
total_words = len(set(vocab))
total_words

In [None]:
# Rebuild one space-separated string per message from its token list.
df['clean_data'] = df['token'].apply(" ".join)
df.head()

In [None]:
# Longest message length, measured in NLTK word tokens; stays at -1 when
# there are no messages at all.
max_len = max(
    (len(nltk.word_tokenize(message)) for message in df.clean_data),
    default=-1,
)
print(max_len)

In [None]:
# Turn the cleaned messages into fixed-length integer sequences for the model.
# (TfidfVectorizer is imported again here but is not used in this cell.)
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): Keras' Tokenizer numbers words from 1 and, as far as its docs
# say, keeps only the num_words-1 most frequent ones, so passing total_words
# presumably drops the rarest word — confirm. Raising it to total_words + 1
# would also require the Embedding input dimension to grow by one.
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(df['clean_data'])
sequence = tokenizer.texts_to_sequences(df['clean_data'])

# Pad (or truncate) every sequence to max_len entries.
X = pad_sequences(sequence, maxlen=max_len)
# Integer class labels produced by the label encoder.
y = df['Spam_Label'].values


In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so reruns produce the same split (and thus
# comparable scores); stratify=y keeps the class ratio of y identical in both
# parts, which matters when one class is much rarer than the other.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Show the shape of each split array, one per line, in the order
# X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

In [None]:
# Binary classifier: embedding -> bidirectional LSTM -> dense ReLU -> sigmoid.
model = Sequential([
    # Maps each integer word index (below total_words) to a 128-dim vector.
    Embedding(input_dim=total_words, output_dim=128),
    Bidirectional(LSTM(128)),
    Dense(128, activation='relu'),
    # Single sigmoid unit: one score in (0, 1) per message.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

In [None]:
# Train for 10 epochs with mini-batches of 64 rows.
# NOTE(review): X_test/y_test are passed as validation data here and are also
# used for the final accuracy below; fine for monitoring, but any tuning done
# from these validation curves would leak into the reported score.
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

In [None]:
# Model outputs for the held-out rows (one sigmoid score per row).
pred = model.predict(X_test)

In [None]:
# Threshold the sigmoid scores at 0.5: a score above 0.5 becomes class 1,
# anything else class 0 (the integer classes are those of 'Spam_Label').
# Vectorised over the whole array; the result is a plain list of Python
# ints with one entry per held-out row.
prediction = (pred.ravel() > 0.5).astype(int).tolist()

In [None]:
# Fraction of held-out rows whose thresholded prediction equals the true label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(list(y_test), prediction)

print("Model Accuracy : ", accuracy)

In [None]:
# Confusion matrix of true vs. predicted classes on the held-out rows
# (rows: true class, columns: predicted class).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(list(y_test), prediction)
plt.figure(figsize = (25, 25))
# fmt="d" prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') would show larger counts in scientific notation.
sns.heatmap(cm, annot = True, fmt = "d")
# Label the axes so the orientation of the matrix is unambiguous.
plt.xlabel("Predicted label")
plt.ylabel("True label")