In [1]:
import pandas as pd

df = pd.read_csv('Spam_SMS.csv')
df

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [2]:
df['Message'] = df['Message'].str.lower()

In [3]:
import string
exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

df['Message'] = df['Message'].apply(remove_punc)
df['Message']

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5569    this is the 2nd time we have tried 2 contact u...
5570                  will ü b going to esplanade fr home
5571    pity  was in mood for that soany other suggest...
5572    the guy did some bitching but i acted like id ...
5573                            rofl its true to its name
Name: Message, Length: 5574, dtype: object

In [4]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stop_words:
            new_text.append(' ')
        else:
            new_text.append(word)
    X = new_text[:]
    new_text.clear()
    return " ".join(X)

df['Message'] = df['Message'].apply(remove_stopwords)
print(df['Message'])

0       go   jurong point crazy available     bugis n ...
1                                 ok lar joking wif u oni
2       free entry   2   wkly comp   win fa cup final ...
3                 u dun say   early hor u c already   say
4       nah   dont think   goes   usf   lives around  ...
                              ...                        
5569          2nd time     tried 2 contact u u       £...
5570                        ü b going   esplanade fr home
5571                pity     mood     soany   suggestions
5572      guy     bitching     acted like id   interes...
5573                                 rofl   true     name
Name: Message, Length: 5574, dtype: object


In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['Message'] = df['Message'].apply(lemmatize_text)
print(df['Message'])

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4                nah dont think go usf life around though
                              ...                        
5569    2nd time tried 2 contact u u £750 pound prize ...
5570                          ü b going esplanade fr home
5571                           pity mood soany suggestion
5572    guy bitching acted like id interested buying s...
5573                                       rofl true name
Name: Message, Length: 5574, dtype: object


In [6]:
import gensim.downloader as api
w2v_model = api.load("word2vec-google-news-300") 

In [8]:
import numpy as np
from nltk.tokenize import word_tokenize
import nltk

def message_to_vector(message):
    words = word_tokenize(message.lower())
    vectors = []
    
    for word in words:
        if word in w2v_model:
            vectors.append(w2v_model[word])
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)  

df['Message_Vector'] = df['Message'].apply(message_to_vector)
X = np.vstack(df['Message_Vector'].values)  
X

array([[-0.01980591,  0.05167062,  0.02709961, ..., -0.07992554,
        -0.05235944,  0.03033665],
       [-0.06323496,  0.0803833 ,  0.0609436 , ..., -0.11419169,
        -0.07287598,  0.10921224],
       [ 0.00078852, -0.0298785 , -0.0717199 , ..., -0.12769902,
        -0.09681567,  0.01087705],
       ...,
       [ 0.09309896,  0.09602865,  0.0263265 , ..., -0.07503255,
        -0.04199219, -0.00093587],
       [ 0.09333147,  0.01583426, -0.00273596, ..., -0.0831822 ,
        -0.03322983, -0.01823698],
       [ 0.11197916,  0.0679423 ,  0.08723959, ..., -0.05094401,
        -0.13898213,  0.01904297]])

In [10]:
df['Class'] = df['Class'].astype(str).str.lower().map({'ham': 0, 'spam': 1})
X = np.vstack(df['Message_Vector'].values)
y = df['Class']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1)
X_train.shape

(4459, 300)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=1000)  
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")


Accuracy on Test Set: 94.62%


In [14]:
df['Class']

0       0
1       0
2       1
3       0
4       0
       ..
5569    1
5570    0
5571    0
5572    0
5573    0
Name: Class, Length: 5574, dtype: int64

In [16]:
X = np.vstack(df['Message_Vector'].values)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1)
X_train.shape

(4459, 300)

In [20]:
from nltk.tokenize import word_tokenize
import numpy as np

def predict_message_class(model, w2v_model, message):
    words = word_tokenize(message.lower())
    vectors = [w2v_model[word] for word in words if word in w2v_model]

    if vectors:
        t = np.mean(vectors, axis=0)
    else:
        t = np.zeros(300)

    t = t.reshape(1, -1)  
    return model.predict(t)[0] 


In [22]:
x1 = df['Message'][0]
x2 = df['Class'][0]
print(predict_message_class(model, w2v_model,x1))
print(x2)

0
0
