### Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")


import spacy
nlp = spacy.load("en_core_web_lg")

### Checking the similarity between two words

In [2]:
n = nlp("girl")
s = nlp("pen boy toothpaste cat eye")
for token in s:
    print(f"{token.text} - {n.text} = {token.similarity(n)}")

pen - girl = 0.11529760421080253
boy - girl = 0.8607838386221719
toothpaste - girl = 0.1532412669253498
cat - girl = 0.4544697286614181
eye - girl = 0.22109918558775637


#                                            Glove Using Spacy

### Reading the dataset

In [3]:
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Information of the dataset

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### converting categorical records into vector

In [4]:
%%time
df = pd.get_dummies(df,columns=["Category"],drop_first=True)
df["Message"] = df["Message"].apply(lambda x : nlp(x).vector)

Wall time: 2min 22s


In [5]:
df

Unnamed: 0,Message,Category_spam
0,"[0.8000099, 0.9535963, -0.34847602, -0.3791346...",0
1,"[0.16232497, 0.8551012, -0.7290775, -0.6373049...",0
2,"[-0.739487, 0.5289012, -0.13894223, -0.0169310...",1
3,"[-1.8170546, 2.02302, 1.0373635, -1.0052722, 1...",0
4,"[0.18221398, 4.2482843, -3.2048604, -1.1110712...",0
...,...,...
5567,"[-1.1125491, -0.28742734, -1.3039072, 0.625423...",1
5568,"[-1.3827443, 4.3848166, -2.7198246, -1.683979,...",0
5569,"[-1.4874302, -0.14814456, -2.4567807, -1.36188...",0
5570,"[-1.459938, 1.7489644, -2.9308975, -1.8730268,...",0


### setting an array element in 2D

In [6]:
X = np.stack(df["Message"])
y = np.stack(df["Category_spam"])

### Balancing the dataset

In [7]:
smote = SMOTE(sampling_strategy="minority")
X,y = smote.fit_resample(X,y)

### Spliting the dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.33, 
    random_state=42,stratify=y)

### Using MinMaxScaler negative values convert into positive value

In [9]:
scaler = MinMaxScaler()
scaler_train = scaler.fit_transform(X_train)
scaler_test = scaler.fit_transform(X_test)

### Buliding a model using Navie Baye Multinomial

In [10]:
clf = MultinomialNB()
clf.fit(scaler_train,y_train)
y_pred = clf.predict(scaler_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.75      0.83      1593
           1       0.79      0.95      0.86      1592

    accuracy                           0.85      3185
   macro avg       0.87      0.85      0.85      3185
weighted avg       0.87      0.85      0.85      3185



### Testing the Navie Baye model

In [28]:
l1 = ["You got cash prize of 1000$, pls use the cash coupon code",
        'Niteesh pls come to ground, we can play cricket',
        "Hurry up! you got an offer of upto 20% discount for order delivery, exclusive offer just for you!!!, Don't miss the reward",
        'pls get me some food',
        'Send me the code',
        "Free Prizes, Gift Cards or Coupons Free prizes are quite uncommon. If you get a text about free gifts, think twice before taking action! .."
        "ACTION REQUIRED. Please verify your Bank of America account information to avoid a hold on your account. Click here to verify: https://bit.ly/97qW5R8",
        "You’ve been overcharged for your 2021 taxes. Get your IRS tax refund here: https://bit.lyPt808gF23",
        "Get delivery updates on your USPS order NQ-836491 here: https://bit.ly/Po9808Lq",
        "Congratulations! You’re being offered a no-interest Visa credit card. Click here to claim: https://bit.ly/07tjA786",
        "There’s an issue with your payment information from your recent order YQ-885629. Take action now: https://bit.ly/Hp187Ty19",
        "Wells Fargo: We have detected suspicious activity on your account. Log in at http://bit.ly/9Uy6Qw89 to update your account preferences and protect your information."]

a = []
for i in l1:
    a.append(nlp(i).vector)
clf.predict(a)

array([1, 0, 1])

### Buliding a model using KNN

In [30]:
from  sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90      1593
           1       0.85      1.00      0.92      1592

    accuracy                           0.91      3185
   macro avg       0.92      0.91      0.91      3185
weighted avg       0.92      0.91      0.91      3185



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Testing the KNN model

In [33]:
l1 = ["You got cash prize of 1000$, pls use the cash coupon code",
        'Niteesh pls come to ground, we can play cricket',
        "Hurry up! you got an offer of upto 20% discount for order delivery, exclusive offer just for you!!!, Don't miss the reward",
        'pls get me some food',
        'Send me the code',
        "Free Prizes, Gift Cards or Coupons Free prizes are quite uncommon. If you get a text about free gifts, think twice before taking action! .."
        "ACTION REQUIRED. Please verify your Bank of America account information to avoid a hold on your account. Click here to verify: https://bit.ly/97qW5R8",
        "You’ve been overcharged for your 2021 taxes. Get your IRS tax refund here: https://bit.lyPt808gF23",
        "Get delivery updates on your USPS order NQ-836491 here: https://bit.ly/Po9808Lq",
        "Congratulations! You’re being offered a no-interest Visa credit card. Click here to claim: https://bit.ly/07tjA786",
        "There’s an issue with your payment information from your recent order YQ-885629. Take action now: https://bit.ly/Hp187Ty19",
        "Wells Fargo: We have detected suspicious activity on your account. Log in at http://bit.ly/9Uy6Qw89 to update your account preferences and protect your information."]

a = []
for i in l1:
    a.append(nlp(i).vector)
clf.predict(a)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([1, 1, 1, 1])