In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import joblib 
import numpy as np 
from sklearn.preprocessing import StandardScaler

In [2]:

try:
    df = pd.read_csv("C:/Users/ravik/Downloads/dataset_ spam detection/spam.csv", encoding='latin-1')
   
    df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    
    df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
    print("Dataset loaded successfully!")
    print(df.head())
    print("\nDataset Info:")
    df.info()
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Make sure the file is in the correct directory.")
    
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!
  target                                               text
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [3]:
X = df['text']
y = df['target']


y = y.map({'ham': 0, 'spam': 1})

print(f"X (text samples) shape: {X.shape}")
print(f"y (target labels) shape: {y.shape}")
print("\nTarget distribution:")
print(y.value_counts())

X (text samples) shape: (5572,)
y (target labels) shape: (5572,)

Target distribution:
target
0    4825
1     747
Name: count, dtype: int64


In [4]:
# Initialize TF-IDF Vectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)


X_vectorized = vectorizer.fit_transform(X)


print(f"X_vectorized shape: {X_vectorized.shape}")
print("\nFirst 5 TF-IDF values for the first message (example):")
print(X_vectorized[0].toarray()[:, :10]) 

X_vectorized shape: (5572, 5000)

First 5 TF-IDF values for the first message (example):
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [5]:
# train_test_split(X_scaled, y, ...) 
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.3, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (3900, 5000)
X_test shape: (1672, 5000)
y_train shape: (3900,)
y_test shape: (1672,)


In [6]:
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)


print("Training the RandomForestClassifier model...")
model.fit(X_train, y_train)
print("Model training complete!")

Training the RandomForestClassifier model...
Model training complete!


In [7]:
print("Making predictions on the test set...")
y_pred = model.predict(X_test)
print("Predictions complete!")

print("\n--- Model Evaluation ---")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Making predictions on the test set...
Predictions complete!

--- Model Evaluation ---
Confusion Matrix:
 [[1448    0]
 [  39  185]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      1448
           1       1.00      0.83      0.90       224

    accuracy                           0.98      1672
   macro avg       0.99      0.91      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [8]:
# Saving the model
model_filename = 'spam_detection_randomforest_model.pkl'
joblib.dump(model, model_filename)
print(f"Trained model saved as '{model_filename}'")

# Saving the vectorizer 
vectorizer_filename = 'tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_filename)
print(f"TF-IDF Vectorizer saved as '{vectorizer_filename}'")

Trained model saved as 'spam_detection_randomforest_model.pkl'
TF-IDF Vectorizer saved as 'tfidf_vectorizer.pkl'


In [9]:
print("--- Demonstrating Loading and Predicting ---")


loaded_model = joblib.load('spam_detection_randomforest_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
print("Model and vectorizer loaded successfully.")


new_messages = [
    "Congratulations! You've won a FREE iPhone! Click here to claim.",
    "Hey, how are you doing today?",
    "URGENT! Your bank account has been compromised. Log in immediately at fakebank.com",
    "Meeting at 3 PM today.",
    "Free entry to a cash prize draw if you reply to this message now!"
]

print("\nNew messages for prediction:")
for msg in new_messages:
    print(f"- '{msg}'")


new_messages_vectorized = loaded_vectorizer.transform(new_messages)
print(f"\nTransformed new messages shape: {new_messages_vectorized.shape}")


new_predictions = loaded_model.predict(new_messages_vectorized)
print("\nPredictions:")


prediction_labels = np.array(['ham', 'spam'])
for i, prediction in enumerate(new_predictions):
    print(f"'{new_messages[i]}' -> {prediction_labels[prediction]}")

--- Demonstrating Loading and Predicting ---
Model and vectorizer loaded successfully.

New messages for prediction:
- 'Congratulations! You've won a FREE iPhone! Click here to claim.'
- 'Hey, how are you doing today?'
- 'URGENT! Your bank account has been compromised. Log in immediately at fakebank.com'
- 'Meeting at 3 PM today.'
- 'Free entry to a cash prize draw if you reply to this message now!'

Transformed new messages shape: (5, 5000)

Predictions:
'Congratulations! You've won a FREE iPhone! Click here to claim.' -> spam
'Hey, how are you doing today?' -> ham
'URGENT! Your bank account has been compromised. Log in immediately at fakebank.com' -> spam
'Meeting at 3 PM today.' -> ham
'Free entry to a cash prize draw if you reply to this message now!' -> spam
