In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk ./input recursively and print the joined path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('./input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the spam CSV (latin-1 encoded), keep only columns v1 and v2, and
# rename them to 'label' and 'text' respectively.
df = (
    pd.read_csv('./input/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
print(df.head())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes (ham=0, spam=1).
# The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(df['label'])
df['label'] = le.transform(df['label'])


In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

# Lazily-built defaults shared by every clean_text call, so the stop-word set
# and the stemmer are created once instead of once per row.
_CLEAN_TEXT_DEFAULTS = {}

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _default_stop_words():
    """Return the NLTK English stop-word set, building it on first use only."""
    if 'stop_words' not in _CLEAN_TEXT_DEFAULTS:
        _CLEAN_TEXT_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
    return _CLEAN_TEXT_DEFAULTS['stop_words']


def _default_stemmer():
    """Return a shared PorterStemmer, creating it on first use only."""
    if 'stemmer' not in _CLEAN_TEXT_DEFAULTS:
        _CLEAN_TEXT_DEFAULTS['stemmer'] = PorterStemmer()
    return _CLEAN_TEXT_DEFAULTS['stemmer']


def clean_text(text, stop_words=None, stemmer=None):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, delete punctuation, delete digit runs, split
    on whitespace, drop stop words, stem each remaining word.

    Args:
        text: The message to clean. ``None`` and float NaN yield ``''``; any
            other non-string value is converted with ``str()`` first.
        stop_words: Optional container of words to drop. Defaults to the NLTK
            English stop-word list (built once and cached).
        stemmer: Optional object exposing ``stem(word) -> str``. Defaults to a
            shared ``PorterStemmer``.

    Returns:
        The cleaned text as a single string; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        # Missing values carry no tokens; without this guard .lower() raises.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    if stop_words is None:
        stop_words = _default_stop_words()
    if stemmer is None:
        stemmer = _default_stemmer()

    # Punctuation is removed before digits, matching the original ordering.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text)

    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# Store the cleaned form of every message in a new 'clean_text' column.
df['clean_text'] = df['text'].map(clean_text)


In [None]:
from sklearn.model_selection import train_test_split

X_raw = df['text']
X_clean = df['clean_text']
y = df['label']

# Split raw text, cleaned text and labels in ONE call so all three share the
# same row partition by construction. The previous version made two separate
# calls that stayed aligned only because they reused the same seed and row
# count, and it rebound `_` (IPython's last-output variable) to throw away the
# duplicate labels.
# NOTE(review): the split is not stratified; consider `stratify=y` if the
# label distribution turns out to be imbalanced.
(
    X_train_raw, X_test_raw,
    X_train_clean, X_test_clean,
    y_train, y_test,
) = train_test_split(X_raw, X_clean, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# One vectorizer per (representation, text variant) pair. Re-fitting a single
# vectorizer on the clean text would overwrite the vocabulary learned from the
# raw text, leaving no fitted object that matches the *_raw matrices.
bow_vectorizer_raw = CountVectorizer()
bow_vectorizer_clean = CountVectorizer()
tfidf_vectorizer_raw = TfidfVectorizer()
tfidf_vectorizer_clean = TfidfVectorizer()

# Keep the original names bound to the clean-text vectorizers, which is what
# they held after the previous version of this cell finished running.
bow_vectorizer = bow_vectorizer_clean
tfidf_vectorizer = tfidf_vectorizer_clean

# --- Bag of Words ---
# Fit on the training split only; the test split is transformed with the
# vocabulary learned from training data.
X_train_bow_raw = bow_vectorizer_raw.fit_transform(X_train_raw)
X_test_bow_raw = bow_vectorizer_raw.transform(X_test_raw)

X_train_bow_clean = bow_vectorizer_clean.fit_transform(X_train_clean)
X_test_bow_clean = bow_vectorizer_clean.transform(X_test_clean)

# --- TF-IDF ---
X_train_tfidf_raw = tfidf_vectorizer_raw.fit_transform(X_train_raw)
X_test_tfidf_raw = tfidf_vectorizer_raw.transform(X_test_raw)

X_train_tfidf_clean = tfidf_vectorizer_clean.fit_transform(X_train_clean)
X_test_tfidf_clean = tfidf_vectorizer_clean.transform(X_test_clean)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

def train_models(X_train, X_test, y_train, y_test, label=""):
    """Fit four classifiers on one feature set and print a report for each.

    The models are Multinomial Naive Bayes, Random Forest, XGBoost and
    K-Nearest Neighbors, fitted and reported in that order.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        label: Name of the feature set, printed as a section header.

    Returns:
        dict mapping each model's display name to its accuracy on the test
        split. Earlier versions returned None; callers that ignore the
        return value are unaffected.
    """
    print(f"\n--- {label} ---")

    # Fresh estimators on every call so no fitted state leaks between
    # feature sets.
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases - confirm against the installed version before removing it.
    candidates = [
        ("Naive Bayes", MultinomialNB()),
        ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ]

    scores = {}
    for display_name, estimator in candidates:
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        print(f"\n{display_name}:")
        print(classification_report(y_test, predictions))
        scores[display_name] = accuracy_score(y_test, predictions)
    return scores


In [None]:
# Evaluate every vectorizer / text-variant combination with the same models,
# in the order: BoW raw, BoW clean, TF-IDF raw, TF-IDF clean.
feature_sets = [
    ("BoW + Raw", X_train_bow_raw, X_test_bow_raw),
    ("BoW + Clean", X_train_bow_clean, X_test_bow_clean),
    ("TF-IDF + Raw", X_train_tfidf_raw, X_test_tfidf_raw),
    ("TF-IDF + Clean", X_train_tfidf_clean, X_test_tfidf_clean),
]
for run_label, train_matrix, test_matrix in feature_sets:
    train_models(train_matrix, test_matrix, y_train, y_test, label=run_label)


In [None]:
from sklearn.ensemble import VotingClassifier

def train_ensemble(X_train, X_test, y_train, y_test, label=""):
    """Fit a soft-voting ensemble and print its classification report.

    The ensemble combines Multinomial Naive Bayes, a 100-tree Random Forest
    (random_state=42) and XGBoost (logloss eval metric) via VotingClassifier
    with voting='soft'.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        label: Name of the feature set, printed in the section header.
    """
    print(f"\n--- Ensemble: {label} ---")

    members = [
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ]
    voter = VotingClassifier(estimators=members, voting='soft')
    voter.fit(X_train, y_train)

    test_predictions = voter.predict(X_test)
    print(classification_report(y_test, test_predictions))

# Run the ensemble on the TF-IDF features built from the cleaned text.
train_ensemble(
    X_train=X_train_tfidf_clean,
    X_test=X_test_tfidf_clean,
    y_train=y_train,
    y_test=y_test,
    label="TF-IDF + Clean",
)


In [None]:
import matplotlib.pyplot as plt

# Dummy example: the model names and accuracies below are hard-coded literals,
# not values computed by the training cells.
models = ['NB-BOW', 'RF-BOW', 'XGB-BOW', 'NB-TFIDF', 'RF-TFIDF', 'XGB-TFIDF']
accuracies = [0.97, 0.98, 0.99, 0.985, 0.989, 0.993]

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(models, accuracies)
ax.set_ylabel('Accuracy')
ax.set_title('Model Performance Comparison')
# The y-axis starts at 0.9, so bar heights exaggerate the differences.
ax.set_ylim(0.9, 1.0)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()
