In [None]:
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [None]:
# Colab-only step: open the browser file picker so the dataset can be sent to
# the runtime. The import stays local to this cell because google.colab only
# exists inside a Colab runtime.
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a {filename: file contents} dict, and echoing it would
# dump the raw uploaded bytes into the notebook output.
uploaded = files.upload()

In [None]:
# Read the uploaded SMS spam dataset into a DataFrame.
# The path points at the Colab runtime's /content directory.
SPAM_CSV_PATH = '/content/SPAM.csv'
data = pd.read_csv(SPAM_CSV_PATH)

In [None]:
# Remove rows containing any missing value, then renumber the rows 0..n-1.
# drop=True discards the old index; without it reset_index() would insert the
# old index as an extra 'index' column that no later step uses.
data = data.dropna().reset_index(drop=True)

In [None]:
# Summarize the cleaned frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     5572 non-null   int64 
 1   Category  5572 non-null   object
 2   Message   5572 non-null   object
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


In [None]:
# Replace every run of characters that is neither an ASCII letter nor an
# apostrophe with a single space, then lower-case the message.
_NON_LETTER_RUN = re.compile("[^A-Za-z']+")


def _letters_only(message):
    """Return `message` as lower-case text containing only letters, apostrophes and spaces."""
    return _NON_LETTER_RUN.sub(" ", str(message)).lower()


data['only_alpha'] = data['Message'].map(_letters_only)

In [None]:
# Tokenize each cleaned message with gensim's simple_preprocess, which returns
# a list of lower-cased tokens. Very short tokens are discarded by its defaults
# (in the recorded preview the lone "u" is absent from the token lists).
data['preprocessed_text'] = data['only_alpha'].apply(gensim.utils.simple_preprocess)

In [None]:
# Preview the first rows to eyeball the cleaned text and the token lists.
# NOTE(review): the saved output of this cell already contains a 'label' column,
# which is only created by a later cell - the output was captured after
# out-of-order execution. Re-run the notebook top to bottom before sharing.
data.head()

Unnamed: 0,index,Category,Message,only_alpha,preprocessed_text,label
0,0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...",0
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, oni]",0
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...,"[free, entry, in, wkly, comp, to, win, fa, cup...",1
3,3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[dun, say, so, early, hor, already, then, say]",0
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don't think he goes to usf he lives arou...,"[nah, don, think, he, goes, to, usf, he, lives...",0


In [None]:
# Encode the target column as integers: ham -> 0, spam -> 1.
# Any category missing from the mapping becomes NaN in 'label'.
category_to_label = {'ham': 0, 'spam': 1}
data['label'] = data['Category'].map(category_to_label)

In [None]:
# Hold out 30% of the messages for evaluation; the rest is used for training
# both the Word2Vec embeddings and the classifier.
# random_state pins the shuffle so the split - and every metric computed from
# it - is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data['preprocessed_text'],
    data['label'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Train Word2Vec on the training split only, so no test-set text leaks into
# the embeddings.
#   vector_size=100 -> every word is embedded as a 100-dimensional vector
#   window=5        -> maximum distance between a word and its context words
#   min_count=2     -> words seen fewer than 2 times in the corpus are ignored
#   seed=42, workers=1 -> fixed RNG seed on a single worker thread so training
#                         is repeatable; gensim notes that exact reproducibility
#                         across interpreter launches also needs PYTHONHASHSEED.
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=5,
    min_count=2,
    seed=42,
    workers=1,
)

In [None]:
# Sanity-check the embeddings: list the words whose vectors are closest to 'card'.
# NOTE(review): in the recorded run every neighbour scores ~0.997 and the words
# are unrelated to 'card', which suggests the vectors are barely differentiated
# - presumably the corpus is too small for the default training settings; verify
# before relying on these similarities.
w2v_model.wv.most_similar('card')

[('give', 0.9975357055664062),
 ('she', 0.9974492192268372),
 ('use', 0.9974482655525208),
 ('work', 0.9974368810653687),
 ('part', 0.9974302649497986),
 ('new', 0.9974066615104675),
 ('by', 0.9973984956741333),
 ('can', 0.9973711967468262),
 ('ve', 0.9973655939102173),
 ('home', 0.9973592162132263)]

In [None]:
# Vocabulary size: number of distinct words that received an embedding.
vocabulary = w2v_model.wv.index_to_key
len(vocabulary)

3161

In [None]:
# Keep the vocabulary as a set so the per-token membership checks in the
# vectorization cells are hash lookups instead of list scans.
words = {token for token in w2v_model.wv.index_to_key}

In [None]:
# Replace every training message by the stack of embeddings of its
# in-vocabulary tokens; a message with no known token yields an empty array.
# The result is kept as a plain Python list: messages contain different numbers
# of tokens, and wrapping such ragged arrays in np.array(...) emits
# VisibleDeprecationWarning on older NumPy and raises ValueError on NumPy >= 1.24.
X_train_vect = [
    np.array([w2v_model.wv[token] for token in tokens if token in words])
    for tokens in X_train
]

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])


In [None]:
# Replace every test message by the stack of embeddings of its in-vocabulary
# tokens; a message with no known token yields an empty array.
# The result is kept as a plain Python list: messages contain different numbers
# of tokens, and wrapping such ragged arrays in np.array(...) emits
# VisibleDeprecationWarning on older NumPy and raises ValueError on NumPy >= 1.24.
X_test_vect = [
    np.array([w2v_model.wv[token] for token in tokens if token in words])
    for tokens in X_test
]

  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


In [None]:
# Sanity check: vectorization must yield exactly one entry per test message.
len(X_test_vect), len(X_test)

(1672, 1672)

In [None]:
# Build one fixed-length feature vector per training message by averaging the
# embeddings of its words. Messages with no in-vocabulary word get an all-zero
# vector so every row has the same length.
# The length is read from the trained model instead of repeating the literal
# 100, so the fallback cannot drift out of sync with the Word2Vec vector_size.
embedding_dim = w2v_model.wv.vector_size
X_train_vect_avg = [
    word_vectors.mean(axis=0) if word_vectors.size else np.zeros(embedding_dim, dtype=float)
    for word_vectors in X_train_vect
]

In [None]:
# Build one fixed-length feature vector per test message by averaging the
# embeddings of its words. Messages with no in-vocabulary word get an all-zero
# vector so every row has the same length.
# The length is read from the trained model instead of repeating the literal
# 100, so the fallback cannot drift out of sync with the Word2Vec vector_size.
embedding_dim = w2v_model.wv.vector_size
X_test_vect_avg = [
    word_vectors.mean(axis=0) if word_vectors.size else np.zeros(embedding_dim, dtype=float)
    for word_vectors in X_test_vect
]

In [None]:
# Fit a Random Forest with default hyper-parameters on the averaged sentence vectors.
# random_state fixes the bootstrap sampling and feature selection so the
# reported metrics are repeatable from run to run.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train_vect_avg, y_train.values)

In [None]:
# Use the trained model to predict a 0 (ham) / 1 (spam) label for every
# averaged test-set sentence vector.
y_pred = rf_model.predict(X_test_vect_avg)

In [None]:
# Score the predictions against the held-out labels, with spam (label 1) as the
# positive class:
#   precision - share of messages flagged as spam that really are spam
#   recall    - share of real spam messages that were flagged
#   accuracy  - share of all messages classified correctly
# accuracy_score replaces the hand-rolled (y_pred == y_test).sum() / len(y_pred),
# which relied on an element-wise comparison between a NumPy array and a pandas Series.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 0.911 / Recall: 0.723 / Accuracy: 0.956


In [None]:
# Confusion matrix: rows are true labels and columns are predicted labels, in
# the order ham (0), spam (1):
#   [[true ham,    ham flagged as spam],
#    [missed spam, caught spam        ]]
# confusion_matrix is imported with the other sklearn.metrics names in the
# notebook's import cell rather than mid-notebook.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1444   15]
 [  59  154]]
