In [3]:
# IMPORT REQUIRED LIBRARY
import pandas as pd    # Used for loading and handling tabular data


In [4]:
# Load the SMS spam dataset.
# The file is decoded as latin-1 so that special characters in the messages are handled.
# Only two columns are kept: 'v1' (spam/ham label) and 'v2' (the message text),
# renamed to the more descriptive 'label' and 'message'.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')
    [['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Show the first 5 rows as a quick sanity check
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the text labels numerically in a new 'label_num' column: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_codes)

In [10]:
 # Text preprocessing: Clean SMS messages using regex and string library

import re      # Regular expressions for pattern-based cleaning
import string  # for punctuation removal

# Define the text-cleaning (preprocessing) function
def preprocess(text):
    """Normalize one SMS message before vectorization.

    Steps, in order: lowercase the text, delete runs of digits, delete every
    character listed in ``string.punctuation``, then trim leading/trailing
    whitespace. Whitespace inside the message is left untouched, so a removed
    number can leave a double space behind ("win 1000 now" -> "win  now").

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message as a ``str``.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits (e.g., "win 1000" -> "win ")
    # The punctuation must be escaped before it is placed inside a character
    # class: unescaped, the adjacent "\]" in string.punctuation is parsed as an
    # escaped "]", which leaves the backslash itself out of the class.
    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)  # Remove punctuation (e.g., "hello!" -> "hello")
    return text.strip()  # Remove leading/trailing whitespace

# Run every raw message through preprocess() and store the result in a new 'cleaned' column
df['cleaned'] = df['message'].map(preprocess)

# Show the raw and cleaned text side by side for the first few messages
print("\nOriginal vs Cleaned Messages:\n")
preview = df[['message', 'cleaned']].head()
print(preview)


Original vs Cleaned Messages:

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                             cleaned  
0  go until jurong point crazy available only in ...  
1                            ok lar joking wif u oni  
2  free entry in  a wkly comp to win fa cup final...  
3        u dun say so early hor u c already then say  
4  nah i dont think he goes to usf he lives aroun...  


In [14]:
# Import train_test_split from scikit-learn
from sklearn.model_selection import train_test_split

# Step 1: Features are the cleaned SMS texts; targets are the numeric labels (0 = ham, 1 = spam)
X, y = df['cleaned'], df['label_num']

# Step 2: Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the size of each split
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (4457,)
Testing data shape: (1115,)


In [15]:
# Import the TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Create the TF-IDF vectorizer, capping the vocabulary at 3000 terms
tfidf = TfidfVectorizer(max_features=3000)

# Step 2: Learn the vocabulary and IDF weights from the training text only,
# and convert that text into numerical feature vectors
X_train_tfidf = tfidf.fit_transform(X_train)

# Step 3: Vectorize the test text with the already-fitted vectorizer (no re-fitting)
X_test_tfidf = tfidf.transform(X_test)

# Both matrices should have the same number of columns (one per vocabulary term)
print("Shape of TF-IDF training data:", X_train_tfidf.shape)
print("Shape of TF-IDF testing data:", X_test_tfidf.shape)


Shape of TF-IDF training data: (4457, 3000)
Shape of TF-IDF testing data: (1115, 3000)


In [16]:
# Import LogisticRegression and evaluation tools
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Steps 1-2: Create a Logistic Regression model with default settings and
# train it on the TF-IDF training features (fit() returns the fitted estimator)
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Step 3: Predict labels for the held-out test features
y_pred = lr_model.predict(X_test_tfidf)

# Step 4: Compute the evaluation metrics on the test set, then report them
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("✅ Logistic Regression Accuracy:", test_accuracy)
print("\n📊 Classification Report:\n", test_report)
print("\n🧾 Confusion Matrix:\n", test_confusion)


✅ Logistic Regression Accuracy: 0.9650224215246637

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.74      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115


🧾 Confusion Matrix:
 [[965   0]
 [ 39 111]]


In [None]:
import joblib
# Persist the trained classifier and the fitted TF-IDF vectorizer to disk.
# The classifier trained in this notebook is bound to `lr_model`; the cells shown
# never define a `model` variable, so dumping `model` fails on a fresh kernel.
joblib.dump(lr_model, 'spam_model.pkl')
joblib.dump(tfidf, 'vectorizer.pkl')

['vectorizer.pkl']

In [None]:
from google.colab import files
# Trigger a browser download (via Colab) for each saved artifact, model first
for artifact_name in ('spam_model.pkl', 'vectorizer.pkl'):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>