# Import required libraries

In [2]:
import pandas as pd
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Read Data

In [4]:
data = []

with open('SMSSpamCollection', 'r') as file:
    for line in file:
        label, message = line.strip().split('\t', 1)
        data.append((label, message))

df = pd.DataFrame(data, columns=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.shape

(5574, 2)

# Data Preprocessing

In [7]:
df.isnull().sum()

label      0
message    0
dtype: int64

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.shape

(5160, 2)

In [10]:
df.reset_index(drop=True,inplace=True)

In [11]:
sw = set(stopwords.words("english"))
filters = set(stopwords.words("english"))
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
sms = []
for i in range(len(df)):
  cleaned = re.sub('[^a-zA-Z]', ' ', df['message'][i])
  cleaned = cleaned.lower()
  cleaned = word_tokenize(cleaned)
  cleaned = [lemmatizer.lemmatize(word) for word in cleaned if word not in sw]
  cleaned = " ".join(cleaned)
  sms.append(cleaned)

In [14]:
# Words vectorizing
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['message'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_df)

       00  000  000pes  008704050406  0089  0121  01223585236  01223585334  \
0     0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
1     0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
2     0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
3     0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
4     0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
...   ...  ...     ...           ...   ...   ...          ...          ...   
5155  0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
5156  0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
5157  0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
5158  0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   
5159  0.0  0.0     0.0           0.0   0.0   0.0          0.0          0.0   

      0125698789   02  ...  zoe  zogtorius  zoom  zouk  zyada  

# Split Data

In [16]:
x = tfidf_df
y = df.label

In [17]:
# Encoding
le = LabelEncoder()
y = le.fit_transform(y)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

# Model building

### LgisticRegression

In [21]:
LR= LogisticRegression()
LR.fit(x_train,y_train)

In [22]:
LR.score(x_test, y_test)

0.935077519379845

In [23]:
y_pred = LR.predict(x_test)
y_pred_labels = (y_pred > 0.5).astype(int)

report = classification_report(y_test, y_pred_labels)
print(report)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       905
           1       1.00      0.47      0.64       127

    accuracy                           0.94      1032
   macro avg       0.97      0.74      0.80      1032
weighted avg       0.94      0.94      0.92      1032



### XGboost

In [25]:
XGB = XGBClassifier()
XGB.fit(x_train, y_train)

In [26]:
XGB.score(x_test, y_test)

0.9641472868217055

In [27]:
y_pred = XGB.predict(x_test)
y_pred_labels = (y_pred > 0.5).astype(int)

report = classification_report(y_test, y_pred_labels)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       905
           1       0.95      0.75      0.84       127

    accuracy                           0.96      1032
   macro avg       0.96      0.87      0.91      1032
weighted avg       0.96      0.96      0.96      1032



### SVC

In [29]:
svc = SVC(kernel='linear') 
svc.fit(x_train, y_train)

In [30]:
svc.score(x_test, y_test)

0.9748062015503876

In [31]:
y_pred = svc.predict(x_test)
y_pred_labels = (y_pred > 0.5).astype(int)

report = classification_report(y_test, y_pred_labels)
print(report)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       905
           1       0.98      0.81      0.89       127

    accuracy                           0.97      1032
   macro avg       0.98      0.90      0.94      1032
weighted avg       0.97      0.97      0.97      1032

