<a href="https://colab.research.google.com/github/Robin39-AFS/SMS-Spam-Detection/blob/main/SMS_Spam_Detection_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import string
import re

In [None]:
pip install pandas nltk




In [None]:
# Download required NLTK data.
# 'punkt' and 'punkt_tab' both back word_tokenize: recent NLTK releases look up
# 'punkt_tab', older ones use 'punkt'. On a release that does not know one of
# the names, nltk.download reports the problem and returns False instead of
# raising, so requesting both is safe. 'stopwords' provides the English
# stop-word list used during preprocessing.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the raw dataset from Drive; the file is decoded as latin-1.
df = pd.read_csv(
    "/content/drive/MyDrive/SMS SPAM DETECTION/spam.csv",
    encoding='latin-1',
)


In [None]:
# Drop the three unnamed extra columns. errors='ignore' keeps this cell
# re-runnable: df is reassigned here, so a second run would otherwise raise
# KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Save the modified DataFrame to a new CSV file (without the index column)
df.to_csv("/content/drive/MyDrive/SMS SPAM DETECTION/spam_modified.csv", index=False)

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [49]:
# Reload the trimmed dataset written by the previous step, decoded as latin-1.
df = pd.read_csv(
    "/content/drive/MyDrive/SMS SPAM DETECTION/spam_modified.csv",
    encoding='latin-1',
)


In [50]:
# Preview the first rows. A bare last expression lets the notebook use the
# DataFrame's rich table display instead of the plain-text print() form.
df.head()

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [51]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [52]:
# Rename v1 to 'target' and v2 to 'text'.
# The renamed frame is bound back to df rather than mutated with inplace=True,
# so the cell does not modify the previously loaded object in place.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

In [53]:
# Preview the first rows to confirm the columns are now 'target' and 'text'.
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
# Data-quality counts: nulls per column and the number of fully duplicated rows.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

# Report missing values first, then duplicates.
print(f"Missing value\n{missing_per_column}")
print(f"Duplicate value\n{duplicate_row_count}")

Missing value
target    0
text      0
dtype: int64
Duplicate value
0


In [55]:
# Remove duplicates: keep one copy of each fully identical row.
# The index is not reset here, so the surviving rows keep their original labels.
df = df.drop_duplicates()

In [57]:
# Lower-casing helper
def to_lower_case(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

# Tokenization helper
def tokenize_text(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Special-character stripping helper
def remove_special_characters(text):
    """Delete every character that is not in a-z, A-Z, 0-9 or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
# Lookup sets consulted by remove_stopwords_and_punctuation: the characters of
# string.punctuation and NLTK's English stop-word list.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

def remove_stopwords_and_punctuation(text):
    """Return the tokens that are neither stop words nor punctuation marks.

    Args:
        text: Iterable of token strings (despite the name, not a raw string).

    Returns:
        A new list with the retained tokens in their original order. The
        argument itself is left unmodified.
    """
    # Build a fresh list instead of clearing and refilling the argument, so the
    # caller's token list is not altered as a side effect.
    return [
        token for token in text
        if token not in stop_words and token not in punctuation
    ]
# Apply stemming: a single shared PorterStemmer instance used by apply_stemming.
stemmer = PorterStemmer()

def apply_stemming(text):
    """Return the stemmed form of every token.

    Args:
        text: Iterable of token strings (despite the name, not a raw string).

    Returns:
        A new list with ``stemmer.stem`` applied to each token, in the same
        order. The argument itself is left unmodified.
    """
    # A comprehension yields a new list rather than clearing and refilling the
    # caller's list in place.
    return [stemmer.stem(token) for token in text]

In [58]:
# Normalise the raw messages: lower-case, then strip special characters.
df['text'] = df['text'].apply(to_lower_case).apply(remove_special_characters)

# Build the token lists: tokenize, filter stop words / punctuation, then stem.
df['tokens'] = (
    df['text']
    .apply(tokenize_text)
    .apply(remove_stopwords_and_punctuation)
    .apply(apply_stemming)
)

# Re-join each token list into one space-separated string for vectorization.
df['preprocessed_text'] = df['tokens'].apply(' '.join)

# Persist the preprocessed DataFrame (without the index) to a new CSV file.
output_file_path = '/content/drive/MyDrive/SMS SPAM DETECTION/preprocessed_file.csv'
df.to_csv(output_file_path, index=False)

print("Data preprocessing completed and saved successfully.")

Data preprocessing completed and saved successfully.


In [59]:
# Preview the frame with the new 'tokens' and 'preprocessed_text' columns.
df.head()

Unnamed: 0,target,text,tokens,preprocessed_text
0,ham,go until jurong point crazy available only in ...,"[go, jurong, point, crazi, avail, bugi, n, gre...",go jurong point crazi avail bugi n great world...
1,ham,ok lar joking wif u oni,"[ok, lar, joke, wif, u, oni]",ok lar joke wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say so early hor u c already then say,"[u, dun, say, earli, hor, u, c, alreadi, say]",u dun say earli hor u c alreadi say
4,ham,nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goe, usf, live, around, tho...",nah dont think goe usf live around though


In [60]:
# Reload the preprocessed dataset from the CSV written by the preprocessing step.
df = pd.read_csv(
    "/content/drive/MyDrive/SMS SPAM DETECTION/preprocessed_file.csv"
)

In [61]:
# Preview the reloaded frame. After the CSV round trip the 'tokens' column
# holds the string form of each list rather than a real list.
df.head()

Unnamed: 0,target,text,tokens,preprocessed_text
0,ham,go until jurong point crazy available only in ...,"['go', 'jurong', 'point', 'crazi', 'avail', 'b...",go jurong point crazi avail bugi n great world...
1,ham,ok lar joking wif u oni,"['ok', 'lar', 'joke', 'wif', 'u', 'oni']",ok lar joke wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"['free', 'entri', '2', 'wkli', 'comp', 'win', ...",free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say so early hor u c already then say,"['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', ...",u dun say earli hor u c alreadi say
4,ham,nah i dont think he goes to usf he lives aroun...,"['nah', 'dont', 'think', 'goe', 'usf', 'live',...",nah dont think goe usf live around though


In [66]:
# Handle NaN values: TfidfVectorizer needs strings, so missing entries become ''.
# (Presumably these are messages whose preprocessed text was empty and was read
# back from the CSV as NaN -- verify.)
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the df['preprocessed_text'] selection: under pandas Copy-on-Write that
# in-place call acts on a temporary object and leaves df unchanged.
df['preprocessed_text'] = df['preprocessed_text'].fillna('')

# Step 4: Text Vectorization using TfidfVectorizer
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split, so the vocabulary and IDF weights also see the held-out
# messages; consider fitting on the training portion only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['preprocessed_text'])


In [71]:
# Fit the encoder on the label column, then replace the string labels with
# their integer codes.
encoder = LabelEncoder().fit(df['target'])
df['target'] = encoder.transform(df['target'])

In [73]:
# Target vector for the classifiers: the 'target' column of df.
y = df['target']

In [76]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [77]:
# Model training and evaluation: fit each classifier on the training split and
# report its metrics on the held-out split.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC()
}

# (label, scoring function) pairs, printed in this order for every model.
metric_table = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)

for model_name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    print(f"{model_name} Performance:")
    for label, scorer in metric_table:
        print(f"{label}: {scorer(y_test, y_pred)}")
    print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")

Naive Bayes Performance:
Accuracy: 0.966183574879227
Precision: 1.0
Recall: 0.7445255474452555
F1 Score: 0.8535564853556485
Confusion Matrix:
 [[898   0]
 [ 35 102]]

Logistic Regression Performance:
Accuracy: 0.9536231884057971
Precision: 0.989010989010989
Recall: 0.656934306569343
F1 Score: 0.7894736842105264
Confusion Matrix:
 [[897   1]
 [ 47  90]]

Support Vector Machine Performance:
Accuracy: 0.970048309178744
Precision: 1.0
Recall: 0.7737226277372263
F1 Score: 0.8724279835390947
Confusion Matrix:
 [[898   0]
 [ 31 106]]

