In [3]:
#  SPAM SMS DETECTION

'''
 Build an AI model that can classify SMS messages as spam or
 legitimate. Use techniques like TF-IDF or word embeddings with
 classifiers like Naive Bayes, Logistic Regression, or Support Vector
 Machines to identify spam messages
'''

#importing the data set

import pandas as pd

df = pd.read_csv(r"C:\Users\shima\Downloads\Compressed\archive_4\spam.csv" , encoding= 'latin-1')

print(df.head(10))
df.info()


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   
5  spam  FreeMsg Hey there darling it's been 3 week's n...        NaN   
6   ham  Even my brother is not like to speak with me. ...        NaN   
7   ham  As per your request 'Melle Melle (Oru Minnamin...        NaN   
8  spam  WINNER!! As a valued network customer you have...        NaN   
9  spam  Had your mobile 11 months or more? U R entitle...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
5        NaN        NaN  
6        NaN  

In [4]:
# Droping the irrelevant columns 
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

print(df.head())
print(df.label.value_counts())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label
0    4825
1     747
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

X = df['message']  # The features (SMS text)
y = df['label']    # The target (0 or 1)

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

X_train_tfidf = vectorizer.fit_transform(X_train)


X_test_tfidf = vectorizer.transform(X_test)

print(f"Shape of TF-IDF matrix for training data: {X_train_tfidf.shape}")
print(f"Shape of TF-IDF matrix for testing data: {X_test_tfidf.shape}")


Shape of TF-IDF matrix for training data: (4457, 7472)
Shape of TF-IDF matrix for testing data: (1115, 7472)


In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix ,precision_score , recall_score

# Initializing the Naive Bayes model
nb_model = MultinomialNB()

# Training the model
nb_model.fit(X_train_tfidf, y_train)
y_predict = nb_model.predict(X_test_tfidf)

#Checking model performance
Accuracy = accuracy_score(y_test , y_predict)
Precision = precision_score(y_test, y_predict)
confusion = confusion_matrix(y_test , y_predict)
f1 = f1_score(y_test , y_predict)
recall = recall_score(y_test , y_predict)

print(f"Accuracy of the model is : {Accuracy * 100:.2f}% ")  
print("Precision of the model is : ",Precision)
print("Confusion Matrix of the model is :\n" , confusion)
print("\nF1 Score of the model is :",f1)          #ideal >0.70
print("Recall Score of the model is :" , recall)  #Recall Ideal if  > 0.75


Accuracy of the model is : 96.68% 
Precision of the model is :  1.0
Confusion Matrix of the model is :
 [[965   0]
 [ 37 113]]

F1 Score of the model is : 0.8593155893536122
Recall Score of the model is : 0.7533333333333333


In [9]:
#TRaining with logistic regression

from sklearn.linear_model import LogisticRegression 

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

#Checking model performance
Accuracy = accuracy_score(y_test , y_predict)
Precision = precision_score(y_test, y_predict)
confusion = confusion_matrix(y_test , y_predict)
f1 = f1_score(y_test , y_predict)
recall = recall_score(y_test , y_predict)

print(f"Accuracy of the model is : {Accuracy * 100:.2f}% ")  
print("Precision of the model is : ",Precision)
print("Confusion Matrix of the model is :\n" , confusion)
print("\nF1 Score of the model is :",f1)          #ideal >0.70
print("Recall Score of the model is :" , recall)  #Recall Ideal if  > 0.75


Accuracy of the model is : 96.68% 
Precision of the model is :  1.0
Confusion Matrix of the model is :
 [[965   0]
 [ 37 113]]

F1 Score of the model is : 0.8593155893536122
Recall Score of the model is : 0.7533333333333333


In [None]:
'''
Both the models have same metrics scores, we can use any one of them
'''