**IMPORTING LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec as wtv
from gensim.models import KeyedVectors

**LOADING DATASET**

In [2]:
df = pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [6]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.rename({'v1':'Category', 'v2':'Text'}, axis=1, inplace=True)

In [8]:
import nltk
import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

**PRE-PROCESSING**

In [10]:
def strip_html(text): #html strips removal
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text): #square brackets removal
    return re.sub('\[[^]]*\]', '', text)


def to_lower(text): #converting to lower 
    return text.lower()

In [11]:
#Removing Special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    return(text)

In [12]:
def simple_tokenize(text):    # Text Tokenization
    return nltk.word_tokenize(text)

In [13]:
def simple_lemmatizer(token_list):    #text lematization
    wlemma = WordNetLemmatizer()
    return [wlemma.lemmatize(token) for token in token_list]

In [14]:
def remove_punct(token_list):# Remove Punctuation
    return [token for token in token_list if token not in string.punctuation]

In [15]:
stop_words = stopwords.words('english')       # Stopwords Removal
def remove_stopwords(token_list):
    return [token for token in token_list if token not in stop_words]

In [16]:
# NLTK Preprocessor
def nltk_preprocess(text):
    text = to_lower(text)
    text = remove_special_characters(text)
    text = simple_tokenize(text)
    text = remove_punct(text)
    text = remove_stopwords(text)
    text = simple_lemmatizer(text)
    return text

In [17]:
df['nltktext'] = df['Text'].apply(nltk_preprocess)

In [18]:
df['nltktext'][0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [19]:
tokens = pd.Series(df.nltktext.values)

**CBOW MODEL**

In [20]:
cbow_model = wtv(tokens, size=300, window=9, min_count=2, sg=0) # train a cbow model

In [21]:
def get_embedding_cbow(doc_tokens): # extract vectors from words
    embeddings = []
    model = cbow_model
        
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
    return np.mean(embeddings, axis=0)

In [22]:
df['cbow_vectors'] = df['Text'].apply(lambda x: get_embedding_cbow(x))

  out=out, **kwargs)


In [23]:
df.isnull().sum()

Category         0
Text             0
nltktext         0
cbow_vectors    54
dtype: int64

In [24]:
df = df.dropna().reset_index(drop=True)
df.head(5)

Unnamed: 0,Category,Text,nltktext,cbow_vectors
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...","[0.16384576, -0.041929033, -0.027308647, 0.254..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[0.15813185, -0.040524624, -0.02641406, 0.2443..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[0.1509684, -0.03856838, -0.02535023, 0.234552..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[0.21291782, -0.054247126, -0.03543997, 0.3302..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]","[0.16192235, -0.041395236, -0.027124392, 0.251..."


In [25]:
# create X from w2vec
X_cbow = pd.DataFrame(df['cbow_vectors'].values.tolist())
X_cbow.shape

(5518, 300)

In [26]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df.Category)

In [27]:
from sklearn.model_selection import train_test_split
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cbow, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
model1 = SVC()
# Fit the model on the train dataset
model1 = model1.fit(X_train_cb, y_train_cb)

pred1 = model1.predict(X_test_cb) #predictions

a1 = accuracy_score(y_test_cb, pred1)
print("Accuracy:", a1*100, "%")

Accuracy: 85.59782608695652 %


**SKIPGRAM MODEL**

In [29]:
# train a skipgram model from the given data set
skgram_model = wtv(tokens, size=300, window=9, min_count=2, sg=1)

In [30]:
def get_embedding_sg(doc_tokens):
    embeddings = []
    model = skgram_model
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
   
    return np.mean(embeddings, axis=0)

In [31]:
df['sgram_vectors'] = df['Text'].apply(lambda x: get_embedding_sg(x))

In [32]:
# create X from w2vec
X_skg = pd.DataFrame(df['sgram_vectors'].values.tolist())
X_skg.shape

(5518, 300)

In [33]:
y = le.fit_transform(df.Category)

In [34]:
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_skg, y, test_size=0.2, random_state=42)

In [35]:
# Build a text classification model

model2 = SVC()
model2 = model2.fit(X_train_sg, y_train_sg)
pred2 = model2.predict(X_test_sg)

a2 = accuracy_score(y_test_sg, pred2)
print("Accuracy:", a2*100, "%")

Accuracy: 87.68115942028986 %


**PRETRAINED GOOGLE WORD2VEC MODEL**

In [36]:
file='/content/drive/MyDrive/GoogleNews-vectors-negative300.bin'

In [37]:
# load into gensim pretrained model
google_w2vec = KeyedVectors.load_word2vec_format(file, binary=True)

In [38]:
# extract vectors from all words in doc
def get_embedding_ggl(doc_tokens):
    embeddings = []
    model = google_w2vec
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
   
    return np.mean(embeddings, axis=0)

In [39]:
df['google_vectors'] = df['Text'].apply(lambda x: get_embedding_ggl(x))

  
  import sys


In [40]:
# create X from w2vec
X_ggl = pd.DataFrame(df['google_vectors'].values.tolist())
X_ggl.shape

(5518, 300)

In [41]:
y = le.fit_transform(df.Category)

In [42]:
X_train_gl, X_test_gl, y_train_gl, y_test_gl = train_test_split(X_ggl, y, test_size=0.2, random_state=42)

In [43]:
# Text classification model
model3 = SVC()
model3 = model3.fit(X_train_gl, y_train_gl)
pred3 = model3.predict(X_test_gl)

a3 = accuracy_score(y_test_gl, pred3)
print("Accuracy:", a3*100, "%")

Accuracy: 97.01086956521739 %
