In [1]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
#storing the dataset to a dataframe named 'data'
data=pd.read_csv('/content/spam.csv',encoding='ISO-8859-1')

In [3]:
#displaying the first few rows
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.shape

(5572, 5)

There are 5572 observations in the dataset.

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
#to find the count of each label
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

**Preprocessing**

In [7]:
#dropping unnecessary columns
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#encoding the dependent feature
data['v1']=data['v1'].map({'ham':0,'spam':1})
data['v1'].value_counts()

0    4825
1     747
Name: v1, dtype: int64

In [9]:
#defining a few functions to clean the text
#to convert text to lower case
def lower_case(text):
  return text.lower()

#to remove special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    return text

#lemmatizing the text
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize(text):
    lemma=WordNetLemmatizer()
    text= ' '.join([lemma.lemmatize(word) for word in text.split()])
    return text

#to remove stopwords
#Setting English stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize

stopword_list=nltk.corpus.stopwords.words('english')
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_text = ' '.join([token for token in tokens if token not in stopword_list])   
    return filtered_text

In [10]:
#applying the cleaning functions 
data['v2']=data['v2'].apply(lower_case)
data['v2']=data['v2'].apply(remove_special_characters)
data['v2']=data['v2'].apply(lemmatize)
data['v2']=data['v2'].apply(remove_stopwords)
data['v2']=data['v2'].apply(lambda x: word_tokenize(x))
data.head()

Unnamed: 0,v1,v2
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, dont, think, go, usf, life, around, though]"


**Modelling**

In [11]:
from gensim.models import Word2Vec

#create a CBOW model from the dataset
cbow_w2v_model=Word2Vec(data['v2'],window=9,min_count=2,sg=0)
#create a skipgram model
skipgram_w2v_model=Word2Vec(data['v2'],window=9,min_count=2,sg=1)

In [12]:
len(cbow_w2v_model.wv.vocab)

3980

There are 3980 unique words in the dataset.

In [13]:
#extract vectors from all the words in the document
def get_embeddings_w2v(doc_tokens,model):
    embeddings=[]

    #iterate over tokens to extract the vectors
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))

    #mean the vectors of the individual words to get the vector of the statement
    return np.mean(embeddings,axis=0)

CBOW Model

In [14]:
#extract vectors from all the words in the document using cbow model
X_w2v_model=data['v2'].apply(lambda x: get_embeddings_w2v(x,cbow_w2v_model))
X_w2v_model = pd.DataFrame(X_w2v_model)
X_w2v_model=X_w2v_model.v2.apply(pd.Series)
print('X shape:',X_w2v_model.shape)

  out=out, **kwargs)


X shape: (5572, 100)


In [15]:
X_w2v_model['v1']=data['v1']

#dropping rows with null values
X_w2v_model=X_w2v_model.dropna()
X_w2v_model.shape

(5545, 101)

In [16]:
#splitting into dependent and independent features
y=X_w2v_model['v1']
x=X_w2v_model.drop('v1',axis=1)

In [17]:
#splitting the dataset for training the model and validating
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

In [18]:
#build a text classification model
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()
gb_model=gb.fit(x_train,y_train)
gb_pred=gb_model.predict(x_test)
gb_acc=accuracy_score(y_test,gb_pred)
print('Confusion matrix is: \n', confusion_matrix(y_test,gb_pred))
print(classification_report(y_test,gb_pred))

Confusion matrix is: 
 [[933  20]
 [ 44 112]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       953
           1       0.85      0.72      0.78       156

    accuracy                           0.94      1109
   macro avg       0.90      0.85      0.87      1109
weighted avg       0.94      0.94      0.94      1109



Skipgram Model

In [19]:
#extract vectors from all the words in the document using skipgram model
X_w2v_model=data['v2'].apply(lambda x: get_embeddings_w2v(x,skipgram_w2v_model))
X_w2v_model = pd.DataFrame(X_w2v_model)
X_w2v_model=X_w2v_model.v2.apply(pd.Series)
X_w2v_model['v1']=data['v1']

#dropping rows with null values
X_w2v_model=X_w2v_model.dropna()

#splitting into dependent and independent features
y=X_w2v_model['v1']
x=X_w2v_model.drop('v1',axis=1)

#splitting the dataset for training the model and validating
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

  out=out, **kwargs)


In [20]:
#build a text classification model
gb=GradientBoostingClassifier()
gb_model=gb.fit(x_train,y_train)
gb_pred=gb_model.predict(x_test)
gb_acc=accuracy_score(y_test,gb_pred)

print('Confusion matrix is: \n', confusion_matrix(y_test,gb_pred))
print(classification_report(y_test,gb_pred))

Confusion matrix is: 
 [[939  14]
 [ 16 140]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       953
           1       0.91      0.90      0.90       156

    accuracy                           0.97      1109
   macro avg       0.95      0.94      0.94      1109
weighted avg       0.97      0.97      0.97      1109



Pretrained Word2Vec Model from Google

In [21]:
file_name='/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz'

In [22]:
#load gensim pretrained model
from gensim.models import KeyedVectors
google_w2vec= KeyedVectors.load_word2vec_format(file_name,binary=True)

In [23]:
#extract vectors from all the words in the document using pretrained model
X_w2v_model=data['v2'].apply(lambda x: get_embeddings_w2v(x,google_w2vec))
X_w2v_model = pd.DataFrame(X_w2v_model)
X_w2v_model=X_w2v_model.v2.apply(pd.Series)
X_w2v_model['v1']=data['v1']

#dropping rows with null values
X_w2v_model=X_w2v_model.dropna()
y=X_w2v_model['v1']
x=X_w2v_model.drop('v1',axis=1)

#splitting the dataset for training the model and validating
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

  import sys
  
  out=out, **kwargs)


In [24]:
#build a text classification model
gb=GradientBoostingClassifier()
gb_model=gb.fit(x_train,y_train)
gb_pred=gb_model.predict(x_test)
gb_acc=accuracy_score(y_test,gb_pred)

print('Confusion matrix is: \n', confusion_matrix(y_test,gb_pred))
print(classification_report(y_test,gb_pred))

Confusion matrix is: 
 [[936   8]
 [ 31 135]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       944
           1       0.94      0.81      0.87       166

    accuracy                           0.96      1110
   macro avg       0.96      0.90      0.93      1110
weighted avg       0.96      0.96      0.96      1110

