In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [15]:
# Load and Explore the Data
# NOTE: despite its name, `url` holds a local filesystem path to the CSV.
url = '/content/sample_data/spam.csv'

# Read the file as latin-1, rename the raw v1/v2 headers to Label/Message,
# and keep only those two columns. The steps are chained so that no
# intermediate frame stays bound to a name.
data = (
    pd.read_csv(url, encoding='latin-1')
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
    [['Label', 'Message']]
)

# Display the first few rows of the dataset
print(data.head())

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [16]:
#Preprocess the Data
# Encode the text labels as integers: ham -> 0, spam -> 1.
LABEL_CODES = {'ham': 0, 'spam': 1}

# Series.map turns every value that is missing from the dict into NaN, so
# mapping a column that is already encoded (e.g. when this cell is run a
# second time) would wipe out every label. Only encode while the column still
# holds the raw, non-numeric labels, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Label']):
    encoded_labels = data['Label'].map(LABEL_CODES)

    # Fail loudly on labels outside the mapping instead of handing NaN
    # targets to the cells that follow.
    unknown_mask = encoded_labels.isna()
    if unknown_mask.any():
        unknown_values = sorted(data.loc[unknown_mask, 'Label'].astype(str).unique())
        raise ValueError(f'Unexpected values in Label column: {unknown_values}')

    # assign() builds a new frame instead of mutating the loaded one in place;
    # the explicit int64 cast keeps the dtype an all-valid map() would produce.
    data = data.assign(Label=encoded_labels.astype('int64'))

In [17]:
# Feature Extraction and Model Training
# Inputs are the message texts; targets are the values of the Label column.
message_text = data['Message']
spam_flags = data['Label']

# Hold out 30% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    message_text,
    spam_flags,
    test_size=0.3,
    random_state=42,
)

# Two-step pipeline: TF-IDF vectorization of the text, followed by a
# multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit both steps on the training portion only
pipeline.fit(X_train, y_train)

In [18]:
# Evaluate the Model
# Predict labels for the held-out messages
y_pred = pipeline.predict(X_test)

# Compute both metrics first, then report them
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.9599282296650717
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1453
           1       1.00      0.69      0.82       219

    accuracy                           0.96      1672
   macro avg       0.98      0.85      0.90      1672
weighted avg       0.96      0.96      0.96      1672



In [19]:
# Make Predictions
# Two hand-written example messages to run through the fitted pipeline
new_emails = [
    "Congratulations! You've won a $1,000 gift card. Click here to claim your prize.",
    "Hi, just wanted to follow up on our meeting next week. Let me know if you need anything."
]

# Predict, then turn each numeric prediction into a readable label
# (1 -> "Spam", anything else -> "Non-Spam")
predictions = pipeline.predict(new_emails)
readable_labels = []
for predicted in predictions:
    if predicted == 1:
        readable_labels.append("Spam")
    else:
        readable_labels.append("Non-Spam")
print(f'Predictions: {readable_labels}')


Predictions: ['Spam', 'Non-Spam']
