In [10]:
# Import necessary libraries
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.corpus import stopwords
import nltk

In [11]:

# --- 1️ Load and Inspect the Dataset ---
# Load the dataset, decoding the CSV as ISO-8859-1
file_path = "spam.csv"  # Update to the correct file path

# Keep only the label/text columns, give them descriptive names, and drop
# rows where either of the two is missing.
data = (
    pd.read_csv(file_path, encoding="ISO-8859-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .dropna()
)

# Encode labels: 'ham' -> 0, 'spam' -> 1
# Series.map produces NaN for any value that is not a key of the dict, and
# this step runs after dropna(), so unexpected labels are checked explicitly
# instead of being passed on as NaN targets.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
unknown_labels = data.loc[label_codes.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected label values in {file_path!r}: {list(unknown_labels)!r}"
    )
data['label'] = label_codes.astype('int64')


In [12]:
# Download the NLTK stopwords corpus only when it is not already installed,
# so re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nithi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# --- 2️ Text Preprocessing ---
# Translation table that deletes every character in string.punctuation.
# A translate table needs no escaping, unlike an interpolated regex class
# where the "\]" inside string.punctuation is read as an escaped "]" and the
# backslash itself is never removed.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Filled on first use so the stopword corpus is read once rather than once
# per message.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them at most once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text, stop_words=None):
    """
    Normalize a single message for vectorization.

    Steps, in order:
    - Convert to lowercase
    - Remove every character listed in ``string.punctuation``
    - Split on whitespace and drop tokens found in ``stop_words``

    Stopwords are compared after punctuation has been stripped, so a
    contraction is compared without its apostrophe.

    Parameters
    ----------
    text : str
        The raw message.
    stop_words : collection of str, optional
        Tokens to discard. When omitted (``None``), the NLTK English
        stopword list is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string when
        nothing remains.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Lowercase first so the stopword comparison is case-insensitive.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    return " ".join(word for word in cleaned.split() if word not in stop_words)

# Run every message through preprocess_text and store the cleaned text back
# in the 'message' column
data['message'] = data['message'].map(preprocess_text)


In [14]:
# --- 3️ Split Data into Training and Testing Sets ---
# Features are the cleaned messages; targets are the 0/1 labels
X, y = data['message'], data['label']

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [15]:

# --- 4️ Vectorize Text Using TF-IDF ---
# Cap on the number of features handed to TfidfVectorizer
MAX_TFIDF_FEATURES = 5000

# Fit on the training messages only, then reuse the fitted vectorizer to
# encode the held-out messages
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [16]:
# --- 5️ Train Logistic Regression Model ---
# Logistic regression with library-default settings, fitted on the TF-IDF
# training matrix and its labels
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

In [17]:

# --- 6️ Make Predictions and Evaluate the Model ---
# Predict labels for the held-out TF-IDF matrix
y_pred = model.predict(X_test_tfidf)

# Compute each metric once, then report them together
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(" Model Evaluation Results:")
print("Accuracy Score:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

 Model Evaluation Results:
Accuracy Score: 0.9540559942569993

Confusion Matrix:
 [[1198    4]
 [  60  131]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1202
           1       0.97      0.69      0.80       191

    accuracy                           0.95      1393
   macro avg       0.96      0.84      0.89      1393
weighted avg       0.95      0.95      0.95      1393



In [18]:
# --- 7️ Save the Model and Vectorizer for Future Use ---
import joblib

# Output filename -> fitted object; the classifier is written first, then the
# TF-IDF vectorizer
artifacts = {
    "sms_spam_classifier.pkl": model,
    "tfidf_vectorizer.pkl": vectorizer,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("\n Model and Vectorizer Saved Successfully!")


 Model and Vectorizer Saved Successfully!
