In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the SMS spam dataset into a DataFrame, decoding the file as latin-1.
# NOTE(review): '/content/spam.csv' is an absolute path — presumably a Colab
# upload location; confirm or make it configurable before running elsewhere.
df = pd.read_csv('/content/spam.csv', encoding='latin-1')

# Summary of the dataset
- This dataset is a collection of SMS messages gathered for SMS spam research. It contains 5,574 English messages, each tagged as either ham (legitimate) or spam. (The CSV loaded here has 5,572 rows; see `df.shape` below.)

In [None]:
# How big is the dataset
df.shape

(5572, 5)

In [None]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [None]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
4885,ham,Or just do that 6times,,,
236,ham,Or ill be a little closer like at the bus stop...,,,
2169,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, å...",,,
335,ham,"Ta-Daaaaa! I am home babe, are you still up ?",,,
4312,ham,"I wasn't well babe, i have swollen glands at m...",,,


In [None]:
# checking for null values
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

## - The columns `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` are almost entirely null (5522, 5560 and 5566 of 5572 rows respectively), hence they need to be removed from the dataset.

In [None]:
# Drop the three almost-empty "Unnamed" columns.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already
# gone is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

# Now we can rename the columns so that each name describes what the column contains.

In [None]:
# Give the two remaining columns descriptive names.
column_names = {'v1': 'tagged_as', 'v2': 'content'}
df = df.rename(columns=column_names)


In [None]:
df.sample(5)

Unnamed: 0,tagged_as,content
556,ham,Having lunch:)you are not in online?why?
3394,ham,Then i buy.
505,ham,No it's waiting in e car dat's bored wat. Cos ...
4922,ham,Oh yah... We never cancel leh... Haha
1560,ham,"Just got some gas money, any chance you and th..."


### Applying Label Encoder on tagged_as column
- Here `tagged_as` is a categorical column, so label encoding needs to be performed on it (ham → 0, spam → 1).

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in `tagged_as` with integer codes.
# The head() output that follows shows ham encoded as 0 and spam as 1.
le = LabelEncoder()
df['tagged_as'] = le.fit_transform(df['tagged_as'])

In [None]:
df.head()

Unnamed: 0,tagged_as,content
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Feature series: the message text.
X = df['content']
# Target series: the integer-encoded label.
y = df['tagged_as']

### - Convert the text of every SMS to lowercase, then remove URLs, punctuation marks, special characters and stopwords.

In [None]:
# Removing urls

# Imported in this cell so the helper does not depend on a later cell having run.
import re

# Compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_urls(text):
    """Return `text` with every URL removed.

    A URL is any run of non-whitespace characters starting with
    ``http://``, ``https://`` or ``www.``. Matches are deleted outright;
    the whitespace around them is left untouched.
    """
    return _URL_PATTERN.sub('', text)


In [None]:
# Removing  Punctuation Marks and Special Characters
import re

def remove_punctuation_and_special_chars(text):
    """Strip every character that is not an ASCII letter, a digit or a space."""
    disallowed_run = re.compile('[^A-Za-z0-9 ]+')
    return disallowed_run.sub('', text)


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Cleaning the content of messages like converting into lowercase, removing punctuation and special characters, urls etc.
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
  """Return the NLTK English stop-word list as a frozenset, built on first use."""
  return frozenset(stopwords.words('english'))


def text_cleaning(text):
  """Normalise one SMS message before vectorisation.

  Steps, in order: lowercase, strip URLs, strip punctuation / special
  characters, tokenise, drop English stop words, re-join with single spaces.

  URLs are removed *before* punctuation: once ':', '/' and '.' have been
  stripped, the URL pattern can no longer match and fragments such as
  'httpwwwexamplecom' would stay in the text.
  """
  text = text.lower()
  text = remove_urls(text)
  text = remove_punctuation_and_special_chars(text)
  # The stop-word set is cached, so it is not rebuilt for every token.
  stop_words = _english_stopwords()
  # Tokens are already lowercase at this point, so compare them directly.
  kept_words = [word for word in word_tokenize(text) if word not in stop_words]
  return ' '.join(kept_words)


X= X.apply(text_cleaning)
X[:5]


0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: content, dtype: object

### Converting the content into its Lemma form

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Function to lemmatize a single text message
def lemmatize_text(text):
    """Tokenize one message, lemmatize each token and return them space-joined."""
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


In [None]:
# Add a column holding the lemmatized form of each message.
# NOTE(review): this lemmatizes the raw `content` column, not the cleaned
# series `X`, and the TF-IDF step further down is fit on `X` — so
# `content_lemmatized` does not appear to feed the models. Confirm intent.
df['content_lemmatized'] = df['content'].apply(lemmatize_text)
df.sample(5)

Unnamed: 0,tagged_as,content,content_lemmatized
2403,0,Jesus christ bitch I'm trying to give you drug...,Jesus christ bitch I 'm trying to give you dru...
3386,0,So u workin overtime nigpun?,So u workin overtime nigpun ?
2224,0,"I prefer my free days... Tues, wed, fri oso ca...","I prefer my free day ... Tues , wed , fri oso ..."
5555,0,Yeh. Indians was nice. Tho it did kane me off ...,Yeh . Indians wa nice . Tho it did kane me off...
5538,0,I can't believe how attached I am to seeing yo...,I ca n't believe how attached I am to seeing y...


## Converting the content into Vectors using TF-IDF

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned messages and turn each
# message into one TF-IDF row.
# NOTE(review): the vectorizer is fit on every message before the train/test
# split, so the IDF statistics include the held-out messages — consider
# fitting on the training portion only.
tf_idf = TfidfVectorizer()
transformed_tf_idf = tf_idf.fit_transform(X)

# Peek at columns 100-199 of message 2. Slice the sparse result first and
# densify only that small piece, rather than the whole matrix.
transformed_tf_idf[2, 100:200].toarray().ravel()


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# Convert the TF-IDF matrix to a dense NumPy array.
# The hasattr guard makes the cell safe to re-run: an ndarray has no
# .toarray(), so a second execution would otherwise raise AttributeError.
if hasattr(transformed_tf_idf, 'toarray'):
    transformed_tf_idf = transformed_tf_idf.toarray()

## Building the model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state=42 pins the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(transformed_tf_idf, y, test_size=0.2, random_state=42)

In [None]:
# Fit a multinomial Naive Bayes classifier (default settings) on the TF-IDF
# training features. The metric helpers imported here are also used by the
# evaluation cells of the other models further down.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows with the fitted Naive Bayes model.
y_pred = nb.predict(X_test)

In [None]:
# Evaluate Naive Bayes on the test set: overall accuracy, the confusion
# matrix, and the per-class precision / recall / F1 report.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9659192825112107
Confusion Matrix:
 [[965   0]
 [ 38 112]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [None]:
# Fit a support vector classifier with default settings on the same training split.
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows with the fitted SVM.
y_pred_svm = svm.predict(X_test)

# Report accuracy, the confusion matrix and the per-class
# precision / recall / F1 for the SVM predictions.
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy: 0.9668161434977578
Confusion Matrix:
 [[963   2]
 [ 35 115]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.98      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
# Fit a logistic-regression classifier with default settings on the same training split.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:

# Predict labels for the held-out test rows with the logistic-regression model.
# The result gets its own name so the SVM predictions in `y_pred_svm`
# are not overwritten.
y_pred_lr = lr.predict(X_test)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy: 0.9426008968609866
Confusion Matrix:
 [[961   4]
 [ 60  90]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.96      0.60      0.74       150

    accuracy                           0.94      1115
   macro avg       0.95      0.80      0.85      1115
weighted avg       0.94      0.94      0.94      1115



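### Across the three models trained on TF-IDF features, SVM has the highest accuracy (0.967), narrowly ahead of Naive Bayes (0.966), with Logistic Regression behind (0.943). Naive Bayes is the only model with no false positives (spam precision 1.00), so it can be preferred when wrongly flagging a legitimate message is the costlier error; SVM catches slightly more spam (recall 0.77 vs 0.75).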