In [1]:
import pandas as pd
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')

df=pd.read_csv('spam.csv', encoding='latin-1')
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace=True)
df.columns=['label', 'text']
df = df.drop_duplicates()
df.head()


[nltk_data] Downloading package stopwords to C:\Users\Ryan
[nltk_data]     Matthew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)                
    text = re.sub(r'[^\w\s]', '', text)            
    text = re.sub(r'\s+', ' ', text).strip()      
    
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)


df['clean_text'] = df['text'].apply(clean_text)
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u å pound prize claim ...
5568,0,Will Ì_ b going to esplanade fr home?,ì_ b going esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestions
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [None]:

datasets = {
    'original': df['text'],
    'cleaned': df['clean_text']
}

models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42),
    'XGBoost': XGBClassifier( eval_metric='logloss')
}

feature_extractors = {
    'BOW': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}

results = []

for data_name, text_data in datasets.items():
    for feat_name, vectorizer in feature_extractors.items():
        X = vectorizer.fit_transform(text_data).toarray()
        y = df['label']
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        for model_name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            results.append({
                'Data': data_name,
                'Features': feat_name,
                'Model': model_name,
                'Accuracy': round(acc, 4)
            })

In [None]:

for data_name, text_data in datasets.items():
    for feat_name, vectorizer in feature_extractors.items():
        X = vectorizer.fit_transform(text_data).toarray()
        y = df['label']
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        ensemble = VotingClassifier(
            estimators=[
                ('nb', MultinomialNB()),
                ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
                ('xgb', XGBClassifier( eval_metric='logloss'))
            ],
            voting='soft'
        )
        
        ensemble.fit(X_train, y_train)
        y_pred = ensemble.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.append({
            'Data': data_name,
            'Features': feat_name,
            'Model': 'Voting Ensemble',
            'Accuracy': round(acc, 4)
        })

results_df = pd.DataFrame(results)
print(results_df.pivot_table(index=['Data', 'Features'], columns='Model', values='Accuracy'))

Model              Naive Bayes  Random Forest  Voting Ensemble  XGBoost
Data     Features                                                      
cleaned  BOW            0.9613         0.9555           0.9797   0.9642
         TF-IDF         0.9603         0.9555           0.9739   0.9729
original BOW            0.9826         0.9671           0.9894   0.9836
         TF-IDF         0.9555         0.9700           0.9797   0.9787
