### Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")


import spacy
nlp = spacy.load("en_core_web_lg")

import gensim.downloader as api
word = api.load("word2vec-google-news-300")

### Creating a vector for a word boy

In [12]:
word["boy"]

array([ 2.35351562e-01,  1.65039062e-01,  9.32617188e-02, -1.28906250e-01,
        1.59912109e-02,  3.61328125e-02, -1.16699219e-01, -7.32421875e-02,
        1.38671875e-01,  1.15356445e-02,  1.87500000e-01, -2.91015625e-01,
        1.70898438e-02, -1.84570312e-01, -2.87109375e-01,  2.54821777e-03,
       -2.19726562e-01,  1.77734375e-01, -1.20605469e-01,  5.39550781e-02,
        3.78417969e-02,  2.49023438e-01,  1.76757812e-01,  2.69775391e-02,
        1.21093750e-01, -3.51562500e-01, -5.83496094e-02,  1.22070312e-01,
        5.97656250e-01, -1.60156250e-01,  1.08398438e-01, -2.40478516e-02,
       -1.16699219e-01,  3.58886719e-02, -2.37304688e-01,  1.15234375e-01,
        5.27343750e-01, -2.18750000e-01, -4.54101562e-02,  3.30078125e-01,
        3.75976562e-02, -5.51757812e-02,  3.26171875e-01,  6.74438477e-03,
        3.71093750e-01,  3.68652344e-02,  6.68945312e-02,  5.17578125e-02,
       -4.76074219e-02, -7.91015625e-02,  4.46777344e-02,  1.67968750e-01,
        5.51757812e-02, -

#                                            Word2vec Using Gensim

### Reading the dataset

In [22]:
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Preprocessing

In [6]:
def preprocessing(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:    
        if token.is_stop or token.is_punct or token.is_space:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [24]:
df["Message"] = df["Message"].apply(preprocessing)

### converting categorical records into vector

In [7]:
%%time
def vector(text):
    doc = text.split()
    vector = [word[token] for token in doc if token in word]
    return np.mean(vector,axis=0)


Wall time: 0 ns


In [26]:
sep = list(map(lambda x: (x, df[x].fillna(df[x].mode()[0], inplace = True)), df.columns))

df["Category"] = df["Category"].apply(lambda x : 0 if x == "ham" else 1)

col=df.select_dtypes(include="object").columns 

for i in col:
    df[i] = df[i].apply(vector)
df.dropna(inplace=True)

In [27]:
df

Unnamed: 0,Category,Message
0,0,"[-0.012878418, 0.040771484, -0.0042349007, 0.1..."
1,0,"[-0.08577728, 0.057678223, 0.04466756, 0.11112..."
2,1,"[-0.012818813, -0.06542969, -0.07962799, 0.024..."
3,0,"[-0.09277344, 0.094441734, 0.13600667, 0.10632..."
4,0,"[-0.044311523, 0.0456604, 0.039697267, 0.20253..."
...,...,...
5567,1,"[0.017649932, 0.022539925, 0.027890373, 0.0795..."
5568,0,"[-0.01590983, 0.07259115, 0.12625122, 0.099726..."
5569,0,"[0.09309896, 0.09602865, 0.026326498, 0.124674..."
5570,0,"[0.10076226, 0.016004775, -0.023646036, 0.0993..."


### setting an array element in 2D

In [28]:
X = np.stack(df["Message"])
y = np.stack(df["Category"])

### Balancing the dataset

In [29]:
smote = SMOTE(sampling_strategy="minority")
X,y = smote.fit_resample(X,y)

### Spliting the dataset

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.33, 
    random_state=42,stratify=y)

### Using MinMaxScaler negative values convert into positive value

In [31]:
scaler = MinMaxScaler()
scaler_train = scaler.fit_transform(X_train)
scaler_test = scaler.fit_transform(X_test)

### Buliding a model using Navie Baye Multinomial

In [40]:
clf = MultinomialNB()
clf.fit(scaler_train,y_train)
y_pred = clf.predict(scaler_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      1581
           1       0.90      0.87      0.88      1581

    accuracy                           0.89      3162
   macro avg       0.89      0.89      0.89      3162
weighted avg       0.89      0.89      0.89      3162



### Testing the Navie Baye model

In [45]:
l1 = ["You got cash prize of 1000$, pls use the cash coupon code",
        'Niteesh pls come to ground, we can play cricket',
        "Hurry up! you got an offer of upto 20% discount for order delivery, exclusive offer just for you!!!, Don't miss the reward",
        'pls get me some food',
        'Send me the code',
        "Free Prizes, Gift Cards or Coupons Free prizes are quite uncommon. If you get a text about free gifts, think twice before taking action! .."
        "ACTION REQUIRED. Please verify your Bank of America account information to avoid a hold on your account. Click here to verify: https://bit.ly/97qW5R8",
        "You’ve been overcharged for your 2021 taxes. Get your IRS tax refund here: https://bit.lyPt808gF23",
        "Get delivery updates on your USPS order NQ-836491 here: https://bit.ly/Po9808Lq",
        "Congratulations! You’re being offered a no-interest Visa credit card. Click here to claim: https://bit.ly/07tjA786",
        "There’s an issue with your payment information from your recent order YQ-885629. Take action now: https://bit.ly/Hp187Ty19",
        "Wells Fargo: We have detected suspicious activity on your account. Log in at http://bit.ly/9Uy6Qw89 to update your account preferences and protect your information."]

a = []
for i in l1:
    a.append(vector(i))
clf.predict(a)


array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0])

### Buliding a model using KNN

In [34]:
from  sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.80      0.89      1581
           1       0.83      0.99      0.91      1581

    accuracy                           0.90      3162
   macro avg       0.91      0.90      0.90      3162
weighted avg       0.91      0.90      0.90      3162



### Testing the KNN model

In [46]:
l1 = ["You got cash prize of 1000$, pls use the cash coupon code",
        'Niteesh pls come to ground, we can play cricket',
        "Hurry up! you got an offer of upto 20% discount for order delivery, exclusive offer just for you!!!, Don't miss the reward",
        'pls get me some food',
        'Send me the code',
        "Free Prizes, Gift Cards or Coupons Free prizes are quite uncommon. If you get a text about free gifts, think twice before taking action! .."
        "ACTION REQUIRED. Please verify your Bank of America account information to avoid a hold on your account. Click here to verify: https://bit.ly/97qW5R8",
        "You’ve been overcharged for your 2021 taxes. Get your IRS tax refund here: https://bit.lyPt808gF23",
        "Get delivery updates on your USPS order NQ-836491 here: https://bit.ly/Po9808Lq",
        "Congratulations! You’re being offered a no-interest Visa credit card. Click here to claim: https://bit.ly/07tjA786",
        "There’s an issue with your payment information from your recent order YQ-885629. Take action now: https://bit.ly/Hp187Ty19",
        "Wells Fargo: We have detected suspicious activity on your account. Log in at http://bit.ly/9Uy6Qw89 to update your account preferences and protect your information."]

a = []
for i in l1:
    a.append(vector(i))
clf.predict(a)


array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0])