# Download Dataset #

In [4]:
import kagglehub

# Fetch the most recently published version of the SMS Spam Collection dataset.
# `path` keeps whatever dataset_download returns; per the recorded output this
# is the local directory that holds the extracted files.
path = kagglehub.dataset_download(
    "uciml/sms-spam-collection-dataset"
)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 46.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1





# Initializing Data

In [5]:
import pandas as pd

# `path` points to the dataset folder; the CSV itself lives inside it,
# so build the full file path from the folder and the filename.
file_path = f"{path}/spam.csv"

# Columns as named in the CSV -> names used by the rest of the notebook.
# 'v1' is the label and 'v2' is the message text.
column_map = {'v1': 'Category', 'v2': 'Message'}

# Load only the two columns we need instead of reading the extra junk columns
# and discarding them afterwards. Renaming through an explicit mapping (rather
# than assigning `df.columns` positionally) cannot mislabel a column if the
# selection ever changes. encoding='latin-1' is kept as-is: it is noted as
# important for this specific file.
df = (
    pd.read_csv(file_path, encoding='latin-1', usecols=list(column_map))
    .rename(columns=column_map)
)

# usecols keeps the file's own column order, so pin the order explicitly.
df = df[['Category', 'Message']]

# Quick sanity check of the first few rows.
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


# Prepare Data

In [6]:
from sklearn.model_selection import train_test_split

# Features are the raw message texts; targets are the matching labels.
X, y = df['Message'], df['Category']

# Hold out a test partition for evaluation.
# test_size=0.25 spells out scikit-learn's default (used when neither
# test_size nor train_size is given); the fixed random_state keeps the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Train the Model

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Two-stage text classifier:
#   'tfidf'      -> converts each message into TF-IDF features
#   'classifier' -> multinomial Naive Bayes trained on those features
model_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

# Fit both stages on the training split only.
model_pipeline.fit(X_train, y_train)

# Evaluate Model Performance

In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Predict a label for every held-out message.
predictions = model_pipeline.predict(X_test)

# Report overall accuracy first, then the per-class
# precision / recall / F1 breakdown.
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(classification_report(y_true=y_test, y_pred=predictions))

Accuracy: 0.9597989949748744
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       1.00      0.71      0.83       191

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



# Export the model

In [9]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier together) to disk.
# joblib.dump is the last expression, so the notebook echoes its return
# value (the list of written filenames) as the cell output.
joblib.dump(value=model_pipeline, filename='model.pkl')

['model.pkl']