# **SPAM SMS DETECTION**

In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk

In [27]:
df = pd.read_csv('spam.csv',encoding = "ISO-8859-1")

In [29]:
display(df)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [30]:
df.shape

(5572, 5)

In [31]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [32]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [33]:
df.drop(columns =['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace =True)


In [34]:
display(df)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [35]:
df.isna().sum()

v1    0
v2    0
dtype: int64

In [36]:
#renaming the column names
df.rename(columns={'v1':'Target','v2':'SMS'},inplace=True)

In [37]:
#Labelencoding ham as 0 ; spam as 1
encode = LabelEncoder()
spam_cleaned['Target'] = encode.fit_transform(df['Target'])
spam_cleaned

Unnamed: 0,Target,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [38]:
spam_cleaned=df 

In [39]:
display(spam_cleaned)

Unnamed: 0,Target,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [40]:
nltk.download('stopwords')

def clean_text(text):
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Stemming
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

# Apply the cleaning function to the text messages
cleaned_texts = spam_cleaned['SMS'].apply(clean_text)

# Display the first few cleaned texts
print(cleaned_texts.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHIVANI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt st m...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: SMS, dtype: object


In [41]:
#to drop duplicates
spam_cleaned.drop_duplicates(keep='first')

Unnamed: 0,Target,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [47]:

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_texts)
Y = spam_cleaned['Target']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Display the shape of the training and testing sets
print('Training set shape:', X_train.shape)
print('Testing set shape:', X_test.shape)


Training set shape: (4179, 6221)
Testing set shape: (1393, 6221)


In [48]:
print(X)

  (0, 5886)	0.19459721085856554
  (0, 188)	0.35227555532712895
  (0, 2185)	0.16514812015268623
  (0, 957)	0.2976038126814356
  (0, 730)	0.33628509567872483
  (0, 2898)	0.28504484907271926
  (0, 6060)	0.23615475543085498
  (0, 2222)	0.19459721085856554
  (0, 732)	0.2976038126814356
  (0, 377)	0.2634906267537017
  (0, 1162)	0.2728131680559813
  (0, 4046)	0.24054119706179236
  (0, 2794)	0.35227555532712895
  (0, 2148)	0.140840528429051
  (1, 3743)	0.5647537939557097
  (1, 5982)	0.4459451111953121
  (1, 2761)	0.47451057922863127
  (1, 2926)	0.4218684931830353
  (1, 3718)	0.2811632882742994
  (2, 260)	0.18734543331896464
  (2, 4313)	0.18099233980499008
  (2, 5629)	0.13714969058149892
  (2, 5072)	0.220381850740506
  (2, 4268)	0.18480736620794727
  (2, 4350)	0.18099233980499008
  :	:
  (5567, 3928)	0.2704984914783844
  (5567, 4313)	0.2859214251331077
  (5568, 1690)	0.6651601234243666
  (5568, 1979)	0.5740011643413078
  (5568, 2431)	0.37453093229027423
  (5568, 2148)	0.29632963790124456
  (556

In [49]:
print(Y)

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Target, Length: 5572, dtype: object


In [56]:
#splitting data into training and testing
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=42)

In [58]:
# LOGISTIC REGRESSION
model = LogisticRegression()
model.fit(X_train,Y_train)
pred = model.predict(X_test)

In [62]:
print("accuracy is ",100*accuracy_score(Y_test,pred))

accuracy is  94.90308686288586
