In [1]:
import pandas as pd 
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

data=pd.read_csv(r"C:\Users\LENOVO\Downloads\spam.csv", encoding='latin1')
data.head()
                 
                

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [3]:
drop_columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data.drop(drop_columns,axis=1,inplace=True)

In [4]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Rename columns using the rename method
data = data.rename(columns={'v1': 'Label', 'v2': 'Message'})

# Display the updated DataFrame
data.head()


Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.isnull().sum()

Label      0
Message    0
dtype: int64

In [7]:
data['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:

le = LabelEncoder()
data['Label'] = le.fit_transform(data['Label'])


In [9]:
data.head()

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:

# Separate features and labels
X = data['Message']
y = data['Label']


In [14]:
import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Load stop words
nltk_stopwords = set(stopwords.words('english'))

# Define a preprocessing function
def preprocess_text(text):
    # 1. Lowercase the text
    text = text.lower()
    
    # 2. Replace URLs and emails
    text = re.sub(r"http\S+|www\S+|https\S+", "url_placeholder", text, flags=re.MULTILINE)
    text = re.sub(r"\S+@\S+", "email_placeholder", text)
    
    # 3. Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # 4. Tokenize using spaCy
    doc = nlp(text)
    
    # 5. Lemmatize and remove stop words
    tokens = [
        token.lemma_ for token in doc 
        if token.is_alpha and token.text not in nltk_stopwords
    ]
    
    # 6. Join tokens back into a single string
    cleaned_text = " ".join(tokens)
    
    return cleaned_text

# Apply preprocessing to the data
data['Cleaned_Message'] = data['Message'].apply(preprocess_text)

# Separate features and labels
X = data['Cleaned_Message']
y = data['Label']


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
models = {
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'clf__n_estimators': [50, 100, 150],
        'clf__max_depth': [None, 10, 20],
        'clf__min_samples_split': [2, 5],
    }),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [3, 5],
        'clf__learning_rate': [0.1, 0.2],
    }),
    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=1000), {
        'clf__C': [0.1, 1.0, 10.0],
        'clf__penalty': ['l2'],
    })
}

In [19]:
for model_name, (model, param_grid) in models.items():
    print(f"\n=== Tuning {model_name} ===\n")
    
    # Define the pipeline
    pipeline = ImbPipeline([
        ('tfidf', TfidfVectorizer(max_features=1000)),  # TF-IDF Vectorizer
        ('undersample', RandomUnderSampler(random_state=42)),  # Random Undersampling
        ('clf', model)  # Classifier
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(
        pipeline, 
        param_grid, 
        cv=3,  # 3-fold cross-validation
        scoring='f1',  # Optimize for F1-score
        verbose=2, 
        n_jobs=-1  # Use all available processors
    )
    
    # Fit the model using GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Best parameters and estimator
    print("Best Parameters:", grid_search.best_params_)
    print("Best Estimator:", grid_search.best_estimator_)
    
    # Evaluate the model on the test set
    y_pred = grid_search.predict(X_test)
    print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


=== Tuning Random Forest ===

Fitting 3 folds for each of 18 candidates, totalling 54 fits




Best Parameters: {'clf__max_depth': 20, 'clf__min_samples_split': 2, 'clf__n_estimators': 150}
Best Estimator: Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=1000)),
                ('undersample', RandomUnderSampler(random_state=42)),
                ('clf',
                 RandomForestClassifier(max_depth=20, n_estimators=150,
                                        random_state=42))])

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.98      0.98       965
        spam       0.88      0.89      0.88       150

    accuracy                           0.97      1115
   macro avg       0.93      0.94      0.93      1115
weighted avg       0.97      0.97      0.97      1115


=== Tuning XGBoost ===

Fitting 3 folds for each of 8 candidates, totalling 24 fits




Best Parameters: {'clf__learning_rate': 0.2, 'clf__max_depth': 3, 'clf__n_estimators': 100}
Best Estimator: Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=1000)),
                ('undersample', RandomUnderSampler(random_state=42)),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='logloss',
                               feature_types=None,...ne, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=0.2,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                    

