In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# pip install imbalanced-learn
# pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# If the above import fails, you may need to install the imbalanced-learn package:
# Uncomment the following line and run it:
# !pip install imbalanced-learn

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv("spam.csv", encoding="latin-1")
df = df[['v1', 'v2']]  # Keep only required columns
df.columns = ['label', 'message']

# Convert labels to binary (ham=0, spam=1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Text preprocessing function
lemmatizer = WordNetLemmatizer()
custom_stopwords = set(stopwords.words('english')) - {"not", "no", "free", "win", "urgent"}

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)  # Remove special characters
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in custom_stopwords]
    return " ".join(words)

df['message'] = df['message'].apply(clean_text)

# Convert text into numerical vectors using TF-IDF
vectorizer = TfidfVectorizer(max_features=15000, min_df=2, ngram_range=(1,2))  
X = vectorizer.fit_transform(df['message']).toarray()
y = df['label']

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Updated Model Accuracy: {accuracy * 100:.2f}%")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Updated Model Accuracy: 98.21%


In [5]:
X_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.17426736, 0.20559401, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.1462703 , 0.        , 0.15271491, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.12853103, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(7720, 9766))

In [6]:


import joblib  

# Save the trained model  
joblib.dump(model, 'spam_filter_model.pkl')

# Save the TF-IDF Vectorizer  
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']