In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
data = pd.read_csv("spam.csv", encoding="latin-1").iloc[:, :2]

In [18]:
data.columns = ['label', 'text']
data.dropna(subset=['text'], inplace=True)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
print(data)

      label                                               text
0         0  Go until jurong point, crazy.. Available only ...
1         0                      Ok lar... Joking wif u oni...
2         1  Free entry in 2 a wkly comp to win FA Cup fina...
3         0  U dun say so early hor... U c already then say...
4         0  Nah I don't think he goes to usf, he lives aro...
...     ...                                                ...
5567      1  This is the 2nd time we have tried 2 contact u...
5568      0              Will Ì_ b going to esplanade fr home?
5569      0  Pity, * was in mood for that. So...any other s...
5570      0  The guy did some bitching but I acted like i'd...
5571      0                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [19]:
import re
import string
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    words = text.split()
    words = [word for word in words if len(word) > 2]  # Remove very short words
    return " ".join(words)

In [20]:
data['clean_text'] = data['text'].apply(clean_text)
print(data)

      label                                               text  \
0         0  Go until jurong point, crazy.. Available only ...   
1         0                      Ok lar... Joking wif u oni...   
2         1  Free entry in 2 a wkly comp to win FA Cup fina...   
3         0  U dun say so early hor... U c already then say...   
4         0  Nah I don't think he goes to usf, he lives aro...   
...     ...                                                ...   
5567      1  This is the 2nd time we have tried 2 contact u...   
5568      0              Will Ì_ b going to esplanade fr home?   
5569      0  Pity, * was in mood for that. So...any other s...   
5570      0  The guy did some bitching but I acted like i'd...   
5571      0                         Rofl. Its true to its name   

                                             clean_text  
0     until jurong point crazy available only bugis ...  
1                                    lar joking wif oni  
2     free entry wkly comp win cu

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
X_train_raw, X_test_raw, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)
X_train_clean, X_test_clean, y_train, y_test = train_test_split(data['clean_text'], data['label'], test_size=0.2, random_state=42)

In [22]:
from sklearn.model_selection import train_test_split
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_clean)
X_test_vec = vectorizer.transform(X_test_clean)

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
models = {
    'NaiveBayes': MultinomialNB(),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [24]:
for model_name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {acc:.4f}")

ensemble_model = VotingClassifier(
    estimators=[('NB', MultinomialNB()), ('RF', RandomForestClassifier(n_estimators=100, random_state=42)), ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))],
    voting='hard'
)
ensemble_model.fit(X_train_vec, y_train)

NaiveBayes Accuracy: 0.9596
RandomForest Accuracy: 0.9686


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.



VotingClassifier(estimators=[('NB', MultinomialNB()),
                             ('RF', RandomForestClassifier(random_state=42)),
                             ('XGB',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric='logloss',
                                            feature_types=None, gamma=None,
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=

In [25]:
from sklearn.metrics import accuracy_score
y_pred_ensemble = ensemble_model.predict(X_test_vec)
accuracy_score(y_test, y_pred_ensemble)


0.967713004484305