# Importing Essential Libraries

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading the dataset and preprocessing

In [18]:
# Loading the dataset in pandas dataframe

emails = pd.read_csv(r"SPAM Data.csv")
emails.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Check for duplicated values

emails["Message"].duplicated().sum()

415

In [20]:
# Removing NAN values and duplicates

emails.dropna(inplace=True)
emails.drop_duplicates(inplace=True)

In [21]:
# Coverting the labels into Binary for model training

emails['Category'] = emails['Category'].map({'spam': 1, 'ham': 0})

In [22]:
# Function for text pre processing
def preprocess_text(text):
    # Normalize text: lower case and remove punctuation
    text = text.lower()
    text = re.sub(r'[\W_]+', ' ', text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the dataset
emails['Processed_Message'] = emails['Message'].apply(preprocess_text)

# Model Training

In [23]:

# Splitting the dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(emails['Processed_Message'], emails['Category'], test_size=0.25, random_state=42)


# Calculating the class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Convert class weights to dictionary format
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# Model pipeline
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight={0: class_weights[0], 1: class_weights[1]}, random_state=42))
])

# Training the dataset on pipeline
model_pipeline.fit(X_train, y_train)


# Hyper parameter Tuning using GridSearch

In [24]:
from sklearn.model_selection import GridSearchCV

# Grid of parameters to search over
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 5]
}

# Grid search
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use the best estimator for further predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200, 'tfidf__max_df': 0.5, 'tfidf__min_df': 5}
Best score: 0.9704526668935675


# Model Evaluation

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

# Detailed classification report
report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print("\nClassification Report:\n", report)


Accuracy: 97.40%

Confusion Matrix:
 [[1113    7]
 [  27  162]]

Classification Report:
               precision    recall  f1-score   support

         Ham       0.98      0.99      0.98      1120
        Spam       0.96      0.86      0.91       189

    accuracy                           0.97      1309
   macro avg       0.97      0.93      0.94      1309
weighted avg       0.97      0.97      0.97      1309



# Model Testing

In [26]:
# Testing on the new data

new_messages = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to this link to claim now.",
    "Hey, are we still on for dinner tonight?",
    "Urgent: Your bank account has been compromised. Please log in using the link below to secure your account.",
    "Hot Deal! Buy one get one free on all our exclusive skincare products. Offer ends soon. Visit our site to order now!",
    "Reminder: The team meeting is scheduled for 10 AM tomorrow in the main conference room. Please be prepared to discuss your current projects.",
    "Parents: School will be closed on Friday due to the severe weather forecast. Please check the school website for further updates.",
    "You have won the National Lottery! To claim your $1,000,000 prize, please send us your personal details and banking information."
]

# Preprocess the new messages
preprocessed_messages = [preprocess_text(msg) for msg in new_messages]

# Use the model pipeline to predict categories
new_predictions = best_model.predict(preprocessed_messages)

print("New Message Predictions:\n")
for message, prediction in zip(new_messages, new_predictions):
    print(f"Message: {message}\nPredicted Category: {'Spam' if prediction == 1 else 'Ham'}\n")


New Message Predictions:

Message: Congratulations! You've won a $1000 Walmart gift card. Go to this link to claim now.
Predicted Category: Spam

Message: Hey, are we still on for dinner tonight?
Predicted Category: Ham

Message: Urgent: Your bank account has been compromised. Please log in using the link below to secure your account.
Predicted Category: Spam

Message: Hot Deal! Buy one get one free on all our exclusive skincare products. Offer ends soon. Visit our site to order now!
Predicted Category: Spam

Message: Reminder: The team meeting is scheduled for 10 AM tomorrow in the main conference room. Please be prepared to discuss your current projects.
Predicted Category: Ham

Message: Parents: School will be closed on Friday due to the severe weather forecast. Please check the school website for further updates.
Predicted Category: Ham

Message: You have won the National Lottery! To claim your $1,000,000 prize, please send us your personal details and banking information.
Predicte

# Saving the Model

In [27]:
import joblib

# Save the model
joblib.dump(best_model, 'spam_classifier_model.pkl')


['spam_classifier_model.pkl']