In [10]:
# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [11]:

import urllib.request

# URL of the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Download and extract dataset
file_name = "smsspamcollection.zip"
urllib.request.urlretrieve(url, file_name)

# Unzipping the dataset
import zipfile
with zipfile.ZipFile(file_name, 'r') as zip_ref:
    zip_ref.extractall("sms_spam_data")

# Load dataset
file_path = "sms_spam_data/SMSSpamCollection"
df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])

# Preview the dataset
print(df.head())

# Convert labels to binary format: spam = 1, ham = 0
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Preview the processed dataset
print(df.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [26]:
# Importing necessary libraries
import urllib.request
import zipfile
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [27]:
# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
file_name = "smsspamcollection.zip"
urllib.request.urlretrieve(url, file_name)



('smsspamcollection.zip', <http.client.HTTPMessage at 0x7979ed2dde70>)

In [28]:
# Extract dataset
with zipfile.ZipFile(file_name, 'r') as zip_ref:
    zip_ref.extractall("sms_spam_data")




In [36]:
df.head()


Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   label            5572 non-null   int64 
 1   message          5572 non-null   object
 2   cleaned_message  5572 non-null   object
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


In [39]:
df.shape


(5572, 3)

In [40]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,0
cleaned_message,0


In [42]:
print('count of label:\n', df.value_counts())

count of label:
 label  message                                                                                                                                                                                                                                cleaned_message                                                                                                                              
0      Sorry, I'll call later                                                                                                                                                                                                                 sorry call later                                                                                                                                 30
       I cant pick the phone right now. Pls send a message                                                                                                                                                               

In [44]:
print('not a spam email ratio i.e.e 0 label:',round(len(df[df['label']==0])/len(df),2))
print('spam email ratio i.e.e 1 label:',round(len(df[df['label']==1])/len(df),2))

not a spam email ratio i.e.e 0 label: 0.87
spam email ratio i.e.e 1 label: 0.13


In [29]:
# Load dataset into a pandas DataFrame
file_path = "sms_spam_data/SMSSpamCollection"
df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])

# Convert labels to binary format: spam = 1, ham = 0
df['label'] = df['label'].map({'spam': 1, 'ham': 0})



In [30]:
# Preprocessing: Cleaning the text
nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove digits
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

df['cleaned_message'] = df['message'].apply(preprocess_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Splitting dataset into training and testing sets
X = df['cleaned_message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)



In [32]:
# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)



In [33]:
# Training a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)



In [34]:
# Making predictions
y_pred = model.predict(X_test_vectorized)



In [35]:
# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 97.56%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1207
           1       0.99      0.82      0.90       186

    accuracy                           0.98      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.98      0.98      0.97      1393

Confusion Matrix:
[[1206    1]
 [  33  153]]
