In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91987\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Step 1: Load the Dataset

In [2]:
import pandas as pd
messages = pd.read_csv(r'C:\Users\91987\Downloads\SMSSpamCollection.txt',delimiter='\t',
                     names=['labels','message'])

In [3]:
messages

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Step 2: Data Preprocessing

In [4]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
# Function to map nltk pos_tag to wordnet pos_tag for accurate lemmatization

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if no match


corpus =[]
for i in range(0,len(messages)):
    #removing non-alphabetic characters and converting into lowercase
    rev = re.sub('[^A-Za-z]',' ',messages['message'][i]).lower()
    rev=rev.split()
    rev=[word for word in rev if not word in stop_words]

    # Apply pos tag
    rev_pos=nltk.pos_tag(rev)

    #Lemmatizar with correct pos tag
    rev=[lemmatizer.lemmatize(word,get_wordnet_pos(pos)) for word,pos in rev_pos]
    rev = ' '.join(rev)
    corpus.append(rev)
    




In [7]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner value network customer select receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitle update late colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cry enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'sear

In [8]:
# Ouput or Dependent Features 

y = pd.get_dummies(messages['labels'])
print(y)

      ham  spam
0       1     0
1       1     0
2       0     1
3       1     0
4       1     0
...   ...   ...
5567    0     1
5568    1     0
5569    1     0
5570    1     0
5571    1     0

[5572 rows x 2 columns]


In [9]:
y=y.iloc[:,0].values
print(y)

[1 1 0 ... 1 1 1]


### Step 3: Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(corpus,y,test_size=0.20)

### Step 4: Creation of BOW (Bag Of Words) Model


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# For Binary BOW enable binary=True

cv = CountVectorizer(max_features=2500,ngram_range=(1,2))


In [12]:
# Independent Features 

X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [13]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
cv.vocabulary_

{'ok': 1495,
 'lol': 1199,
 'would': 2441,
 'mom': 1355,
 'fit': 708,
 'tell': 2071,
 'whole': 2396,
 'family': 668,
 'crazy': 452,
 'tone': 2162,
 'reply': 1744,
 'poly': 1628,
 'mono': 1360,
 'eg': 600,
 'cha': 315,
 'yeah': 2477,
 'slow': 1914,
 'come': 380,
 'stop': 1996,
 'txt': 2210,
 'tone reply': 2166,
 'tone txt': 2167,
 'sorry': 1944,
 'call': 248,
 'later': 1131,
 'sorry call': 1945,
 'call later': 260,
 'free': 733,
 'ringtone': 1770,
 'wait': 2318,
 'collect': 368,
 'simply': 1896,
 'text': 2080,
 'password': 1555,
 'verify': 2291,
 'get': 785,
 'po': 1618,
 'box': 213,
 'free ringtone': 744,
 'po box': 1619,
 'break': 222,
 'list': 1182,
 'reason': 1717,
 'nobody': 1447,
 'town': 2180,
 'vl': 2307,
 'know': 1104,
 'know get': 1106,
 'arm': 95,
 'feeling': 685,
 'weak': 2363,
 'cuz': 474,
 'go': 827,
 'another': 67,
 'time': 2135,
 'da': 475,
 'wait call': 2319,
 'want': 2333,
 'new': 1429,
 'video': 2295,
 'phone': 1575,
 'anytime': 76,
 'network': 1423,
 'mins': 1318,
 '

### Step 5. Model Training  and Classification

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [17]:
y_pred = spam_detect_model.predict(X_test)

### Step 6. Evaluation: Calculating metrics like accuracy etc.

In [18]:
from sklearn.metrics import accuracy_score, classification_report

In [19]:
accuracy = accuracy_score(y_test,y_pred)
accuracy_rounded= round(accuracy,4)
print(accuracy_rounded)

0.9865


In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       147
           1       0.99      0.99      0.99       968

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

