In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download("all")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
stop_words=stopwords.words("english")

import spacy
nlp=spacy.load("en_core_web_sm")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [13]:
import re

In [14]:
from bs4 import BeautifulSoup

In [15]:
df=pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,not spam,"Go until jurong point, crazy.. Available only ..."
1,not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,not spam,U dun say so early hor... U c already then say...
4,not spam,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [17]:
def preprocessing(text):
  text=text.lower()
  tex=BeautifulSoup(text).get_text()
  text=re.sub("[^a-zA-Z ]+"," ",text)
  text=re.sub("[\s]+"," ",text)
  token=[lemmatizer.lemmatize(i) for i in word_tokenize(text) if i not in stop_words]
  return " ".join(token)


In [18]:
df["Cleaned"]=df["Message"].apply(preprocessing)

  tex=BeautifulSoup(text).get_text()


In [20]:
df["word_count"]=df["Message"].apply(lambda x:len(x.split()))
df["word_count_cleaned"]=df["Cleaned"].apply(lambda x:len(x.split()))
df["char_count"]=df["Message"].apply(lambda x:len(x))
df["char_count_cleaned"]=df["Cleaned"].apply(lambda x:len(x))
df["char_count_without_spaces"]=df["Cleaned"].apply(lambda x:len(re.sub("[\s]+","",x)))
df["num_dig"]=df["Message"].apply(lambda x : sum([1 if w.isdigit() else 0 for w in x.split()]))

In [25]:
pos_d={"noun":["NNP","NN","NNS","NNPS"],"verb":["VBZ","VB","VBD","VBG","VBN"]}
def pos(text,family):
  pos_token=nltk.pos_tag(word_tokenize(text))
  count=0
  for token in pos_token:
    if token[1] in pos_d[family]:
      count+=1

  return count

df["noun count"]=df["Cleaned"].apply(lambda x: pos(x,"noun"))

df["verb count"]=df["Cleaned"].apply(lambda x: pos(x,"verb"))

In [26]:
df.head()

Unnamed: 0,Category,Message,Cleaned,word_count,word_count_cleaned,char_count,char_count_cleaned,char_count_without_spaces,num_dig,noun count,verb count
0,not spam,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,20,16,111,82,67,0,6,2
1,not spam,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,29,23,18,0,3,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,28,21,155,101,81,2,9,4
3,not spam,U dun say so early hor... U c already then say...,u dun say early hor u c already say,11,9,49,35,27,0,3,1
4,not spam,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though,13,7,61,35,29,0,1,1


In [42]:
le=LabelEncoder()
df["Category"]=le.fit_transform(df["Category"])

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(max_features=500)
tf_idf=tf.fit_transform(df["Cleaned"])

In [46]:
type(tf_idf)

scipy.sparse._csr.csr_matrix

In [34]:
from scipy.sparse import hstack,csr_matrix

In [44]:
features=df.iloc[:,3:]
target=df.iloc[:,0]

In [47]:
features=hstack([tf_idf,csr_matrix(features)])

In [48]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(features,target,random_state=20,test_size=0.2,stratify=target)

In [49]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(xtrain,ytrain)
ypred=model.predict(xtest)
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       966
           1       0.91      0.89      0.90       149

    accuracy                           0.97      1115
   macro avg       0.95      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [50]:
from sklearn.tree import DecisionTreeClassifier
model1=DecisionTreeClassifier()
model1.fit(xtrain,ytrain)
ypred=model1.predict(xtest)
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       966
           1       0.86      0.87      0.87       149

    accuracy                           0.96      1115
   macro avg       0.92      0.93      0.92      1115
weighted avg       0.96      0.96      0.96      1115



In [51]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=500)
count_vec=cv.fit_transform(df["Cleaned"])
count_vec

<5572x500 sparse matrix of type '<class 'numpy.int64'>'
	with 27417 stored elements in Compressed Sparse Row format>

In [18]:
# DL

In [56]:
features=df.iloc[:,2]
target=df.iloc[:,0]

In [54]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,Flatten,SimpleRNN,LSTM
from tensorflow.keras.preprocessing.text  import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [59]:
xtrain,xtest,ytrain,ytest=train_test_split(features,target,random_state=1,test_size=0.3,stratify=target)
tok=Tokenizer()
tok.fit_on_texts(xtrain)
vocabulary=tok.word_index
vocab_length=len(vocabulary)
train_seq=tok.texts_to_sequences(xtrain)
doc_length=[]
for i in train_seq:
  doc_length.append(len(i))
max_len=np.quantile(doc_length,0.99)
train_matrix=pad_sequences(train_seq,maxlen=int(max_len))
test_seq=tok.texts_to_sequences(xtest)
test_matrix=pad_sequences(test_seq,maxlen=int(max_len))


In [None]:
model=Sequential()
model.add(Embedding(input_dim=vocab_length+1,output_dim=32,input_length=int(max_len),mask_zero=True))
model.add(Flatten())
model.add(Dense(128,activation="tanh"))
model.add(Dense(64,activation="tanh"))
model.add(Dense(16,activation="tanh"))
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer="adam",loss="binary_crossentropy")
model.fit(train_matrix,ytrain,epochs=10,batch_size=32)
ypred=model.predict(test_matrix)
ypred=np.where(ypred>=0.5,1,0)


In [64]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1448
           1       0.99      0.91      0.95       224

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [67]:
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [72]:
wv=api.load("word2vec-google-news-300")



In [79]:
vector_size = 300
gensim_weight_matrix = np.zeros((vocab_length+1 ,vector_size))
gensim_weight_matrix.shape

for word, index in tok.word_index.items():
    if index < vocab_length+1: # since index starts with zero
        if word in wv:
            gensim_weight_matrix[index] = wv[word]
        else:
            gensim_weight_matrix[index] = np.zeros(300)

In [89]:
EMBEDDING_DIM = 300 # this means the embedding layer will create  a vector in 100 dimension
model1 = Sequential()
model1.add(Embedding(input_dim = vocab_length+1,# the whole vocabulary size
                          output_dim = EMBEDDING_DIM, # vector space dimension
                          input_length= int(max_len), # max_len of text sequence
                          weights = [gensim_weight_matrix],trainable = False))
model1.add(Flatten())
model1.add(Dense(128,activation="tanh"))
model1.add(Dense(64,activation="tanh"))
model1.add(Dense(16,activation="tanh"))
model1.add(Dense(1,activation="sigmoid"))
model1.compile(optimizer="adam",loss="binary_crossentropy")


model1.fit(train_matrix,ytrain,epochs=10,batch_size=32)
ypred=model1.predict(test_matrix)
ypred=np.where(ypred>=0.5,1,0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [90]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1448
           1       0.94      0.88      0.91       224

    accuracy                           0.98      1672
   macro avg       0.96      0.93      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [85]:
embeddings_index={}
f=open("/content/glove.6B.100d.txt",encoding="utf8")

for line in f:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:],dtype="float32")
  embeddings_index[word]=coefs
f.close()

In [87]:
emb_dim=100
emb_matrix=np.zeros((vocab_length+1,emb_dim))

for word, index in tok.word_index.items():
    if index < vocab_length+1: # since index starts with zero
        if word in embeddings_index:
            emb_matrix[index] = embeddings_index[word]
        else:
            emb_matrix[index] = np.zeros(100)

In [91]:
EMBEDDING_DIM = 100 # this means the embedding layer will create  a vector in 100 dimension
model2 = Sequential()
model2.add(Embedding(input_dim = vocab_length+1,# the whole vocabulary size
                          output_dim = EMBEDDING_DIM, # vector space dimension
                          input_length= int(max_len), # max_len of text sequence
                          weights = [emb_matrix],trainable = False))
model2.add(Flatten())
model2.add(Dense(128,activation="tanh"))
model2.add(Dense(64,activation="tanh"))
model2.add(Dense(16,activation="tanh"))
model2.add(Dense(1,activation="sigmoid"))
model2.compile(optimizer="adam",loss="binary_crossentropy")


model2.fit(train_matrix,ytrain,epochs=10,batch_size=32)
ypred=model2.predict(test_matrix)
ypred=np.where(ypred>=0.5,1,0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [92]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1448
           1       0.85      0.89      0.87       224

    accuracy                           0.96      1672
   macro avg       0.92      0.93      0.92      1672
weighted avg       0.96      0.96      0.96      1672

