In [1]:
pip install pandas scikit-learn nltk



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import nltk
import string

# Download stopwords
nltk.download('stopwords')
# A set gives constant-time membership checks during preprocessing
stop_words = set(stopwords.words('english'))

# Load the dataset, keeping only the label (v1) and message text (v2) columns.
# rename() returns a new frame, so the label assignment below writes to an
# independent object instead of a column subset of the raw frame (which can
# trigger pandas' SettingWithCopyWarning).
data = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Encode labels as 0 (ham) and 1 (spam)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Series.map turns any value missing from the mapping into NaN; fail here with
# a clear message instead of letting NaN labels reach the classifier.
if data['label'].isna().any():
    raise ValueError("spam.csv contains labels other than 'ham' and 'spam'")

# Preprocess text: remove punctuation and stop words
# Translation table that deletes every ASCII punctuation character in one pass
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text, stopword_set=None):
    """Normalize a message for vectorization.

    The text is lowercased, ASCII punctuation (``string.punctuation``) is
    deleted, and whitespace-separated tokens found in the stop-word set are
    dropped. The surviving tokens are joined with single spaces.

    Args:
        text: The raw message string.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned message as a single string (possibly empty).
    """
    if stopword_set is None:
        # Resolved at call time so the function follows the notebook's current stop_words
        stopword_set = stop_words
    # Punctuation is removed before the stop-word filter, so tokens are compared
    # without apostrophes (e.g. "don't" is looked up as "dont").
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(word for word in stripped.split() if word not in stopword_set)

# Replace each raw message with its cleaned form
data['message'] = data['message'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [3]:
# Convert text to TF-IDF features.
# The vectorizer is fitted on the training messages only; the test messages are
# transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(max_features=3000)  # Limit to 3000 features for efficiency
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [4]:
# Initialize the Naive Bayes classifier (default hyperparameters)
classifier = MultinomialNB()

# Train the classifier on the TF-IDF training features and their 0/1 labels
classifier.fit(X_train_tfidf, y_train)


In [5]:
# Predict labels for the held-out messages
y_pred = classifier.predict(X_test_tfidf)

# Report overall accuracy, then the per-class precision/recall/F1 table
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n {classification_report(y_test, y_pred)}")


Accuracy: 0.9748878923766816
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [6]:
import joblib

# Save the model and TF-IDF vectorizer
# (both files are written relative to the current working directory)
joblib.dump(classifier, "spam_classifier_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

# Load the model and vectorizer later if needed
# Reloading here rebinds `classifier` and `tfidf` to the deserialized copies, so
# any later use of these names runs against what was written to disk.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
classifier = joblib.load("spam_classifier_model.pkl")
tfidf = joblib.load("tfidf_vectorizer.pkl")


In [8]:
import string

# Preprocess text function (same as before)
# Translation table that deletes every ASCII punctuation character in one pass
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text, stopword_set=None):
    """Normalize a message for vectorization.

    The text is lowercased, ASCII punctuation (``string.punctuation``) is
    deleted, and whitespace-separated tokens found in the stop-word set are
    dropped. The surviving tokens are joined with single spaces.

    Args:
        text: The raw message string.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned message as a single string (possibly empty).
    """
    if stopword_set is None:
        # Resolved at call time so the function follows the notebook's current stop_words
        stopword_set = stop_words
    # Punctuation is removed before the stop-word filter, so tokens are compared
    # without apostrophes (e.g. "don't" is looked up as "dont").
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(word for word in stripped.split() if word not in stopword_set)

# Prediction function
def predict_message(model, vectorizer, message):
    """Classify one raw message as ``"spam"`` or ``"ham"``.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        message: The raw message text to classify.

    Returns:
        ``"spam"`` when the model predicts label 1, otherwise ``"ham"``.
    """
    # Apply the same cleaning that was used on the training messages
    cleaned = preprocess_text(message)

    # The vectorizer takes an iterable of documents, hence the one-element list
    features = vectorizer.transform([cleaned])

    # Only one message was passed in, so the first prediction is the answer
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "spam"
    return "ham"

# Example usage: classify a single hand-written message with the model and
# vectorizer currently bound to `classifier` and `tfidf`
custom_message = "Hi when do we meet!!"
result = predict_message(classifier, tfidf, custom_message)
print(f"The message is classified as: {result}")


The message is classified as: ham
