In [48]:
#importing necessary libraries and Packagaes
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import re

In [12]:
#load data
#url = "https://drive.google.com/drive/folders/16rbc8dm9l-nqPPO0fL7mBdDyJ-ldPzLj?usp=drive_link"
path = "/content/spam.csv"
# path = "/content/spam_ham_dataset.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [14]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30




---
**Data Preprocessing **


---



In [15]:

df['label'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)


In [17]:
df.drop(['Category'], axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,Message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


Text cleaning

In [21]:
def Clean_Text(Text):
    sms = re.sub('[^a-zA-Z]', ' ', Text)  # Keep only alphabets, replace others with space
    sms = sms.lower()                     # Convert to lowercase
    sms = sms.split()                     # Tokenize (split into words)
    sms = ' '.join(sms)                   # Join back into clean string
    return sms

df['CleanMessage'] = df['Message'].apply(Clean_Text)

In [22]:
df.head()

Unnamed: 0,Message,label,CleanMessage
0,"Go until jurong point, crazy.. Available only ...",0,go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry in a wkly comp to win fa cup final ...
3,U dun say so early hor... U c already then say...,0,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",0,nah i don t think he goes to usf he lives arou...


In [26]:
df.drop(['Message'], axis=1, inplace=True)
df.head()

Unnamed: 0,label,CleanMessage
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final ...
3,0,u dun say so early hor u c already then say
4,0,nah i don t think he goes to usf he lives arou...


TF-IDF-> It’s a way to transform text into numbers while highlighting important words.

In [24]:
vectorizer=TfidfVectorizer()

In [25]:
vectorizer

In [27]:
X = vectorizer.fit_transform(df['CleanMessage'])
y = df['label']

In [29]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 72238 stored elements and shape (5572, 7759)>

In [30]:
y

Unnamed: 0,label
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0




---


# ***Train test splitting***

---



In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



---

# **Fitting transformed data into Models**

---



In [57]:
modelNB = MultinomialNB()
modelNB.fit(X_train, y_train)

## Prediction of model with test data

In [58]:
y_pred = modelNB.predict(X_test)

# Printing model accuracy over test data

In [59]:
print("Accuracy",accuracy_score(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy 0.9623205741626795
Confusion Matrix:
 [[1448    0]
 [  63  161]]


In [60]:
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1448
           1       1.00      0.72      0.84       224

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672





---


# Now, Using logistic regressionto analyze performance for different model.



In [40]:
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)
print("\n=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))


=== Logistic Regression ===
Accuracy: 0.9659090909090909
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1448
           1       0.99      0.75      0.86       224

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.96      1672



# Random Forest Classifier

In [43]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [44]:
rf_model.fit(X_train, y_train)

In [45]:
y_pred_rf = rf_model.predict(X_test)

In [49]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.9760765550239234
Confusion Matrix:
 [[1448    0]
 [  40  184]]


# Support Vector Machine (SVM) calssifier

---







In [50]:
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

In [52]:
y_pred_svm = svm_model.predict(X_test)

In [53]:
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

SVM Accuracy: 0.9856459330143541
Confusion Matrix:
 [[1447    1]
 [  23  201]]
