In [1]:
import pandas as pd

In [2]:
# Load the SMS dataset from the working directory. The file is decoded as
# windows-1252 (presumably because it is not valid UTF-8 -- confirm against the data).
df = pd.read_csv("spam.csv", encoding="windows-1252")
df.head()

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Numeric target column: 1 for 'spam', 0 for 'ham'.
# astype(int) raises if `type` holds any other value (the mapping would yield NaN).
df['Spam'] = (
    df['type']
    .map({'spam': 1, 'ham': 0})
    .astype(int)
)

In [4]:
# Preview the first rows to confirm the new numeric `Spam` column was added.
df.head()

Unnamed: 0,type,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0
1,ham,Ok lar... Joking wif u oni...,,,,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1
3,ham,U dun say so early hor... U c already then say...,,,,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0


In [5]:
# Discard the three unnamed columns, which appear empty in the preview.
# Re-running this cell raises KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [6]:
# Preview the frame after removing the unnamed columns.
df.head()

Unnamed: 0,type,text,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
import spacy
# Load the spaCy pipeline named "en_core_web_lg". The resulting `nlp` object is used
# as a global by lemmatization(), postag() and the vectorisation / prediction cells.
# The model package must already be installed in the kernel's environment.
nlp=spacy.load("en_core_web_lg")

In [8]:
# Word tokenization
def tokenizer(text):
    """Split a message into tokens on runs of whitespace.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated pieces of ``text``; punctuation stays attached
        to the neighbouring word. An empty or all-whitespace string gives ``[]``.
    """
    tokens = text.split()
    return tokens

In [9]:
# Replace each message string in `text` with its list of whitespace-separated tokens.
# NOTE: the column is overwritten in place, so this cell is not re-runnable on its own:
# a second run would pass lists to tokenizer(), which calls .split() on its argument.
df['text']=df['text'].apply(tokenizer)

In [10]:
# Inspect the token list stored for the row labelled 2 (a spam message in the preview above).
df['text'][2]

['Free',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'FA',
 'Cup',
 'final',
 'tkts',
 '21st',
 'May',
 '2005.',
 'Text',
 'FA',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'question(std',
 'txt',
 "rate)T&C's",
 'apply',
 "08452810075over18's"]

In [10]:
# Lemmatization
def lemmatization(text):
    """Return the lemma of every token spaCy finds in the given token list.

    The incoming tokens are joined with single spaces and the resulting string
    is run through the global ``nlp`` pipeline, so spaCy re-tokenizes the text.
    The returned list can therefore have a different length from ``text``
    (e.g. punctuation becomes separate items).

    Parameters
    ----------
    text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        ``token.lemma_`` for each token of the parsed document, in order.
    """
    sentence = " ".join(text)
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return lemmas
    

In [11]:
# Overwrite `text` with the list of lemmas that lemmatization() returns for each token list.
# NOTE: in-place overwrite of the column; the cell expects `text` to hold lists of strings.
df['text']=df['text'].apply(lemmatization)

In [13]:
# Inspect the lemma list stored for the row labelled 0.
df['text'][0]

['go',
 'until',
 'jurong',
 'point',
 ',',
 'crazy',
 '..',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '...',
 'cine',
 'there',
 'get',
 'amore',
 'wat',
 '...']

In [12]:
# POS-based filtering
def postag(text):
    """Parse the tokens with spaCy and drop those tagged SPACE, X or PUNCT.

    The tokens are joined with single spaces and run through the global ``nlp``
    pipeline. Despite the name, no tags are returned: the part-of-speech tag is
    only used to decide which tokens to keep.

    Parameters
    ----------
    text : iterable of str
        Tokens (here: lemmas) of one message.

    Returns
    -------
    list
        The spaCy token objects (not strings) whose ``pos_`` is none of
        ``'SPACE'``, ``'X'`` or ``'PUNCT'``, in document order.
    """
    excluded_pos = ('SPACE', 'X', 'PUNCT')
    doc = nlp(" ".join(text))
    kept = []
    for token in doc:
        if token.pos_ in excluded_pos:
            continue
        kept.append(token)
    return kept

In [13]:
# Overwrite `text` with the spaCy tokens that survive postag()'s part-of-speech filter.
# After this cell the column holds lists of spaCy token objects rather than strings.
df['text']=df['text'].apply(postag)

In [16]:
# Inspect the filtered spaCy tokens stored for the row labelled 2.
df['text'][2]

[free,
 entry,
 in,
 2,
 a,
 wkly,
 comp,
 to,
 win,
 FA,
 Cup,
 final,
 tkts,
 21st,
 May,
 2005,
 text,
 fa,
 to,
 87121,
 to,
 receive,
 entry,
 question(std,
 txt,
 rate)T&C,
 's,
 apply,
 08452810075over18,
 's]

In [14]:
def join_tokens(token_list):
    """Concatenate the ``.text`` of each token into one space-separated string.

    Parameters
    ----------
    token_list : iterable
        Objects exposing a ``text`` attribute (spaCy tokens in this notebook).

    Returns
    -------
    str
        The token texts joined by single spaces; ``''`` for an empty input.
    """
    words = (token.text for token in token_list)
    return ' '.join(words)

In [15]:
# Collapse each list of spaCy tokens back into a single space-separated string,
# so `text` holds plain strings again (overwrites the column in place).
df['text'] = df['text'].apply(join_tokens)

In [19]:
# Preview the cleaned text produced by the tokenize -> lemmatize -> filter -> join steps.
df.head()

Unnamed: 0,type,text,Spam
0,ham,go until jurong point crazy available only in ...,0
1,ham,ok lar joke wif u oni,0
2,spam,free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor U c already then say,0
4,ham,nah I do not think he go to usf he live around...,0


In [16]:
# Embed every cleaned message: parse it with spaCy and keep the document's `.vector`.
# This runs the full pipeline once per row, so the cell can be slow.
df['vector'] = df['text'].apply(lambda sentence: nlp(sentence).vector)

In [17]:
# Preview the frame with the new `vector` column.
df.head()

Unnamed: 0,type,text,Spam,vector
0,ham,go until jurong point crazy available only in ...,0,"[0.44553766, 2.2910447, -0.35224983, -0.037052..."
1,ham,ok lar joke wif u oni,0,"[-0.14939333, 1.0167166, 0.4778967, -1.6510634..."
2,spam,free entry in 2 a wkly comp to win FA Cup fina...,1,"[-0.70049304, 0.88594455, -0.4264396, -0.10242..."
3,ham,U dun say so early hor U c already then say,0,"[-2.2527246, 2.4505346, 1.8332298, -1.0634581,..."
4,ham,nah I do not think he go to usf he live around...,0,"[-0.2763943, 5.2276754, -3.4926186, -1.058722,..."


In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
# Features are the per-message vectors (a 1-D object array), the target is the 0/1 `Spam` column.
x_train, x_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['Spam'],
    test_size=0.2,
    random_state=42,
)

In [19]:
# Show only the first few training vectors; displaying the whole array floods the
# notebook output with every embedding of every training message.
x_train[:3]

array([array([-1.2005311 ,  1.7363573 , -3.082999  , -0.12459709, -1.2368958 ,
        0.8715904 , -1.1755189 ,  4.2484446 , -2.634506  ,  4.032022  ,
        3.9582353 ,  2.9853487 , -2.2579408 , -0.13616475,  2.2714772 ,
        0.24252869,  0.4962448 , -4.506622  , -1.8191705 , -1.8669041 ,
        3.2847116 , -1.012859  , -0.27653733, -3.9796245 , -0.90893376,
       -1.7724179 , -3.0072963 , -0.45335624, -2.6853917 ,  2.3244486 ,
        0.8084486 , -1.6308483 , -2.995071  , -0.64589655,  0.52814895,
       -0.56542   , -3.570391  ,  0.9285657 ,  4.1702204 ,  1.8104442 ,
       -0.22730711,  2.763079  ,  1.1949809 , -1.9722613 ,  2.01119   ,
        1.722896  , -1.0524492 , -4.5510645 , -1.289072  ,  3.3262932 ,
        1.1113604 ,  0.64966047, -0.7811143 , -5.2633624 , -2.0118818 ,
        1.7031233 ,  0.7149386 ,  0.5084024 , -1.1158242 ,  3.307038  ,
        0.71058416, -2.1492925 , -1.1087347 ,  1.13084   , -2.4749167 ,
       -0.7861257 , -1.4521145 , -0.74540854,  0.7570525 

In [20]:
import numpy as np

In [21]:
# x_train / x_test are 1-D arrays whose elements are themselves vectors; stack them
# row-wise into ordinary 2-D matrices (one row per message) for scikit-learn.
x_train_2d = np.stack(x_train, axis=0)
x_test_2d = np.stack(x_test, axis=0)

In [28]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max on the training matrix only, then apply that same
# scaling to both splits so no information from the test set leaks into the fit.
m = MinMaxScaler()
m.fit(x_train_2d)
train = m.transform(x_train_2d)
test = m.transform(x_test_2d)


In [29]:
from sklearn.metrics import classification_report ,accuracy_score

In [58]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest on the scaled training features.
# The forest is randomised (bootstrap samples, feature sub-sampling), so the seed is
# fixed -- to the same value used for the train/test split -- to make the reported
# metrics reproducible under Restart & Run All.
s = RandomForestClassifier(random_state=42)
s.fit(train, y_train)

In [59]:
# Predict labels for the held-out messages using the scaled test matrix `test`.
y_pred=s.predict(test)

In [60]:
# Fraction of held-out messages whose predicted label matches the true label.
print(accuracy_score(y_test,y_pred))

0.9704035874439462


In [61]:
# Per-class precision / recall / F1 and support for the held-out messages
# (class 1 = spam, class 0 = ham, per the mapping used to build `Spam`).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.99      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [64]:

# Classify one unseen message with the trained pipeline.
# Requires tokenizer, lemmatization, postag, join_tokens, `nlp`, the fitted scaler `m`
# and the fitted classifier `s` from the cells above.

s2 = "SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"

# Same text preprocessing that was applied to the training messages.
s2 = tokenizer(s2)
s2 = lemmatization(s2)
s2 = postag(s2)
s2 = join_tokens(s2)

s3 = nlp(s2).vector  # document vector of the cleaned message

# Copy into a plain numpy array
s4 = np.array(s3)

# The classifier was trained on MinMax-scaled features, so the new vector must go
# through the *same fitted* scaler before prediction; feeding the raw vector puts it
# on a different scale from the training data and yields unreliable predictions.
# reshape(1, -1) turns the 1-D vector into the single-row 2-D matrix scikit-learn expects.
prediction = s.predict(m.transform(s4.reshape(1, -1)))

print(prediction)


[0]


In [33]:
import pickle

In [35]:
# Persist the trained classifier. The context manager guarantees the file is flushed
# and closed even if pickling fails (the bare open() call left the handle dangling).
# NOTE: only the classifier `s` is saved. The fitted MinMaxScaler `m` is not, yet inputs
# must be scaled with it before calling predict -- persist it as well if the model is
# to be used outside this notebook.
with open('models.pkl', 'wb') as model_file:
    pickle.dump(s, model_file)