In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the local input folder and print the full path of every file found under it
for folder, _subfolders, file_names in os.walk('./input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the spam CSV, keep only the two columns used below (v1, v2)
# and give them descriptive names in the same expression
df = (
    pd.read_csv('./input/spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
print(df.head())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integers; the fitted encoder stays available as `le`
le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])  # ham=0, spam=1
df['label'] = encoded_labels


In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

# Built once instead of on every call: the translation table that deletes
# punctuation and the compiled pattern that deletes runs of digits.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')

# Lazily filled cache for the default NLTK resources, so the stop-word list is
# loaded and the stemmer constructed a single time rather than once per row.
_CLEAN_TEXT_DEFAULTS = {}


def clean_text(text, stop_words=None, stemmer=None):
    """Normalise one message: lowercase, strip punctuation/digits, drop stop words, stem.

    Parameters
    ----------
    text : str or None
        The message to clean. ``None`` and float NaN (missing values) are
        treated as empty text and yield ``''``.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, loaded
        once and cached.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every remaining word. Defaults to a cached
        ``PorterStemmer`` instance.

    Returns
    -------
    str
        The remaining stemmed words joined by single spaces.
    """
    # Missing values would otherwise crash on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stop_words is None:
        if 'stop_words' not in _CLEAN_TEXT_DEFAULTS:
            _CLEAN_TEXT_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _CLEAN_TEXT_DEFAULTS['stop_words']
    if stemmer is None:
        if 'stemmer' not in _CLEAN_TEXT_DEFAULTS:
            _CLEAN_TEXT_DEFAULTS['stemmer'] = PorterStemmer()
        stemmer = _CLEAN_TEXT_DEFAULTS['stemmer']

    # Lowercase, then remove punctuation, then remove numbers
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_RUN.sub('', text)

    # NOTE(review): punctuation is stripped before the stop-word filter, so a
    # contraction such as "don't" is compared as "dont" and is only removed if
    # the stop-word collection contains that apostrophe-free form.
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

# Store a preprocessed copy of every message next to the original text
df['clean_text'] = df['text'].map(clean_text)


In [None]:
from sklearn.model_selection import train_test_split

X_raw = df['text']
X_clean = df['clean_text']
y = df['label']

# Split raw text, cleaned text and labels in ONE call. The three arrays are
# then partitioned with the same row indices by construction, instead of
# relying on two independent calls happening to agree because they share a
# seed and a length. The resulting split is the same as before.
(
    X_train_raw, X_test_raw,
    X_train_clean, X_test_clean,
    y_train, y_test,
) = train_test_split(X_raw, X_clean, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

# One vectorizer per (representation, text variant). Previously a single
# instance was fitted on the raw text and then re-fitted on the clean text,
# which left it holding the clean vocabulary while the raw matrices had been
# built with a vocabulary that no longer existed anywhere.
bow_vectorizer_raw = CountVectorizer()
bow_vectorizer_clean = CountVectorizer()
tfidf_vectorizer_raw = TfidfVectorizer()
tfidf_vectorizer_clean = TfidfVectorizer()

# --- Bag of Words ---
# Vocabulary is learned from the training split only; the test split is
# merely transformed with it.
X_train_bow_raw = bow_vectorizer_raw.fit_transform(X_train_raw)
X_test_bow_raw = bow_vectorizer_raw.transform(X_test_raw)

# Check number of features (each value is the full (n_samples, n_features) shape)
n_features_train = X_train_bow_raw.shape
n_features_test = X_test_bow_raw.shape
print("Number of samples/features:", n_features_train)
print("Number of samples/features:", n_features_test)

X_train_bow_clean = bow_vectorizer_clean.fit_transform(X_train_clean)
X_test_bow_clean = bow_vectorizer_clean.transform(X_test_clean)

# --- TF-IDF ---
X_train_tfidf_raw = tfidf_vectorizer_raw.fit_transform(X_train_raw)
X_test_tfidf_raw = tfidf_vectorizer_raw.transform(X_test_raw)

X_train_tfidf_clean = tfidf_vectorizer_clean.fit_transform(X_train_clean)
X_test_tfidf_clean = tfidf_vectorizer_clean.transform(X_test_clean)

# Backward-compatible names: the original shared instances ended up fitted on
# the clean text, so keep the old names bound to the clean vectorizers.
bow_vectorizer = bow_vectorizer_clean
tfidf_vectorizer = tfidf_vectorizer_clean


def _select_and_reduce(X_train_tfidf, X_test_tfidf, y_train_labels, k, n_components, random_state=42):
    """Chi-squared feature selection followed by PCA, both fitted on the training split.

    Parameters
    ----------
    X_train_tfidf, X_test_tfidf : sparse TF-IDF matrices for the two splits.
    y_train_labels : labels of the training rows, used by the chi-squared test.
    k : number of highest-scoring features kept by ``SelectKBest``.
    n_components : number of principal components kept by ``PCA``.
    random_state : seed passed to ``PCA`` so a randomized solver, if PCA
        selects one, gives the same projection on every run.

    Returns
    -------
    tuple
        ``(selector, pca, X_train_selected, X_test_selected,
        X_train_reduced, X_test_reduced)``.
    """
    # Step 1: keep the k features with the highest chi-squared score
    selector = SelectKBest(chi2, k=k)
    X_train_selected = selector.fit_transform(X_train_tfidf, y_train_labels)
    X_test_selected = selector.transform(X_test_tfidf)
    print("\nAfter Chi-Squared Selection:")
    print("X_train_selected shape:", X_train_selected.shape)
    print("X_test_selected shape:", X_test_selected.shape)

    # Step 2: project onto the leading principal components.
    # The selected matrices are converted to dense arrays before PCA.
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_reduced = pca.fit_transform(X_train_selected.toarray())
    X_test_reduced = pca.transform(X_test_selected.toarray())
    print("\nAfter PCA:")
    print("X_train_reduced shape:", X_train_reduced.shape)
    print("X_test_reduced shape:", X_test_reduced.shape)

    return selector, pca, X_train_selected, X_test_selected, X_train_reduced, X_test_reduced


k = 1500  # Number of top chi-squared features to keep
n_components = 20  # Reduce to 20 dimensions

# Raw text pipeline
(
    selector_raw, pca_raw,
    X_train_selected_raw, X_test_selected_raw,
    X_train_reduced_raw, X_test_reduced_raw,
) = _select_and_reduce(X_train_tfidf_raw, X_test_tfidf_raw, y_train, k, n_components)

# Cleaned text pipeline
(
    selector_clean, pca_clean,
    X_train_selected_clean, X_test_selected_clean,
    X_train_reduced_clean, X_test_reduced_clean,
) = _select_and_reduce(X_train_tfidf_clean, X_test_tfidf_clean, y_train, k, n_components)

# Backward-compatible name: the original reused `selector` and left it fitted
# on the clean features.
selector = selector_clean


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KDTree

import time

from sklearn.metrics import classification_report, accuracy_score

def train_models(X_train, X_test, y_train, y_test, label=""):
    """Fit Naive Bayes, brute-force kNN and k-d tree kNN on one feature set and report on them.

    For each model a classification report on the test split is printed,
    followed by the wall-clock time of fit + predict for every model and the
    pairwise differences between those times.

    Parameters
    ----------
    X_train, X_test : feature matrices for the two splits. Inputs exposing
        ``.toarray()`` are densified for the k-d tree run; inputs without it
        are passed through unchanged.
    y_train, y_test : labels matching the rows of ``X_train`` / ``X_test``.
    label : str
        Heading printed above the reports.

    Returns
    -------
    None
    """
    def _timed_fit_predict(model, train_features, test_features):
        # perf_counter is the clock intended for measuring short intervals
        started = time.perf_counter()
        model.fit(train_features, y_train)
        predictions = model.predict(test_features)
        return predictions, time.perf_counter() - started

    print(f"\n--- {label} ---")

    # Naive Bayes
    pred_nb, execution_time_bayes = _timed_fit_predict(MultinomialNB(), X_train, X_test)
    print("\nNaive Bayes:")
    print(classification_report(y_test, pred_nb))

    # KNN, brute force. The algorithm is named explicitly so the timing printed
    # as "Brute Force" does not depend on what algorithm='auto' would select.
    pred_knn, execution_time_brute = _timed_fit_predict(
        KNeighborsClassifier(n_neighbors=5, algorithm='brute'), X_train, X_test
    )
    print("\nK-Nearest Neighbors:")
    print(classification_report(y_test, pred_knn))

    # KNN with k-d tree. Densify outside the timed region so the figure
    # reported for the k-d tree covers fit + predict only, like the other two.
    X_train_dense = X_train.toarray() if hasattr(X_train, "toarray") else X_train
    X_test_dense = X_test.toarray() if hasattr(X_test, "toarray") else X_test
    pred_kd, execution_time_kd = _timed_fit_predict(
        KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree'), X_train_dense, X_test_dense
    )
    print("\nK-Nearest Neighbors with k-d Tree:")
    print(classification_report(y_test, pred_kd))

    # Calculate the difference in execution times
    time_difference = execution_time_kd - execution_time_brute
    time_difference_bd = execution_time_kd - execution_time_bayes
    time_difference_bb = execution_time_brute - execution_time_bayes

    # Print the execution times and their difference
    print(f"Execution Time (Brute Force): {execution_time_brute:.6f} seconds")
    print(f"Execution Time (KD Tree): {execution_time_kd:.6f} seconds")
    print(f"Execution Time (Bayes): {execution_time_bayes:.6f} seconds")
    print(f"Difference (KD Tree - Brute Force): {time_difference:.6f} seconds")
    print(f"Difference (KD Tree - Bayes): {time_difference_bd:.6f} seconds")
    # The value is brute - bayes; the label used to claim the opposite order
    print(f"Difference (Brute Force - Bayes): {time_difference_bb:.6f} seconds")
    
    

In [None]:
def train_models_reduced(X_train, X_test, y_train, y_test, label=""):
    """Compare brute-force kNN with k-d tree kNN on an already reduced feature set.

    Both classifiers are fitted on ``X_train`` and evaluated on ``X_test``
    exactly as passed in (no conversion is applied). A classification report
    is printed for each, followed by the wall-clock time of fit + predict for
    both and the difference between them.

    Parameters
    ----------
    X_train, X_test : feature matrices for the two splits.
    y_train, y_test : labels matching the rows of ``X_train`` / ``X_test``.
    label : str
        Heading printed above the reports.

    Returns
    -------
    None
    """
    def _timed_fit_predict(model):
        # perf_counter is the clock intended for measuring short intervals
        started = time.perf_counter()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        return predictions, time.perf_counter() - started

    print(f"\n--- {label} ---")

    # KNN, brute force. algorithm='auto' chooses the search structure from the
    # data passed to fit, so on dense low-dimensional input the run
    # labelled "Brute Force" is not guaranteed to be one unless it is
    # requested explicitly.
    pred_knn, execution_time_brute = _timed_fit_predict(
        KNeighborsClassifier(n_neighbors=5, algorithm='brute')
    )
    print("\nK-Nearest Neighbors:")
    print(classification_report(y_test, pred_knn))

    # KNN with k-d tree
    pred_kd, execution_time_kd = _timed_fit_predict(
        KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
    )
    print("\nK-Nearest Neighbors with k-d Tree:")
    print(classification_report(y_test, pred_kd))

    # Calculate the difference in execution times
    time_difference = execution_time_kd - execution_time_brute

    # Print the execution times and their difference
    print(f"Execution Time (Brute Force): {execution_time_brute:.6f} seconds")
    print(f"Execution Time (KD Tree): {execution_time_kd:.6f} seconds")
    print(f"Difference (KD Tree - Brute Force): {time_difference:.6f} seconds")
    
    

In [None]:
# Experiments on the full vocabulary: (heading, train matrix, test matrix)
full_feature_runs = [
    ("BoW + Raw", X_train_bow_raw, X_test_bow_raw),
    ("BoW + Clean", X_train_bow_clean, X_test_bow_clean),
    ("TF-IDF + Raw", X_train_tfidf_raw, X_test_tfidf_raw),
    ("TF-IDF + Clean", X_train_tfidf_clean, X_test_tfidf_clean),
]
for run_label, train_matrix, test_matrix in full_feature_runs:
    train_models(train_matrix, test_matrix, y_train, y_test, label=run_label)

# Experiments on the PCA-reduced TF-IDF features (raw text, then clean text)
reduced_feature_runs = [
    ("TF-IDF Reduced Raw", X_train_reduced_raw, X_test_reduced_raw),
    ("TF-IDF Reduced Clean", X_train_reduced_clean, X_test_reduced_clean),
]
for run_label, train_matrix, test_matrix in reduced_feature_runs:
    train_models_reduced(train_matrix, test_matrix, y_train, y_test, label=run_label)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

def plot_knn_accuracy(X_train, X_test, y_train, y_test, k_values, label=""):
    """Plot the test accuracy of a kNN classifier for every neighbour count in ``k_values``.

    One ``KNeighborsClassifier`` is fitted on the training split per value of
    k and scored on the test split; the scores are drawn as a line chart whose
    title includes ``label``. Nothing is returned.
    """
    def _test_accuracy(n_neighbors):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)
        return accuracy_score(y_test, model.predict(X_test))

    accuracies = [_test_accuracy(k) for k in k_values]

    plt.figure(figsize=(10, 6))
    plt.plot(k_values, accuracies, marker='o')
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Accuracy')
    plt.title(f'kNN Accuracy vs. Number of Neighbors ({label})')
    plt.grid(True)
    plt.show()

# Neighbour counts to evaluate: every k from 2 through 20
k_values = range(2, 21)

# Accuracy curve for the PCA-reduced TF-IDF features built from the clean text
plot_knn_accuracy(
    X_train=X_train_reduced_clean,
    X_test=X_test_reduced_clean,
    y_train=y_train,
    y_test=y_test,
    k_values=k_values,
    label="TF-IDF Reduced Clean",
)

In [None]:
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Define range of PCA components to test
n_components_list = range(10, 61)
times_brute = []
times_kd = []

# Densify once: the selected-feature matrices are the same in every iteration,
# so converting them inside the loop only repeated identical work.
X_train_selected_dense = X_train_selected_clean.toarray()
X_test_selected_dense = X_test_selected_clean.toarray()

# Loop over different numbers of PCA components
for n_components in n_components_list:
    print(f"Processing n_components = {n_components}")

    # Apply PCA (seeded so that a randomized solver, if PCA selects one,
    # produces the same projection on every run)
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_selected_dense)
    X_test_pca = pca.transform(X_test_selected_dense)

    # kNN with Brute Force; perf_counter is the clock meant for short intervals
    knn_brute = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
    start_time = time.perf_counter()
    knn_brute.fit(X_train_pca, y_train)
    pred_brute = knn_brute.predict(X_test_pca)
    end_time = time.perf_counter()
    time_brute = end_time - start_time
    times_brute.append(time_brute)

    # kNN with KD-Tree
    knn_kd = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
    start_time = time.perf_counter()
    knn_kd.fit(X_train_pca, y_train)
    pred_kd = knn_kd.predict(X_test_pca)
    end_time = time.perf_counter()
    time_kd = end_time - start_time
    times_kd.append(time_kd)

# Draw both timing curves on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_components_list, times_brute, label='Brute Force', marker='o')
ax.plot(n_components_list, times_kd, label='KD-Tree', marker='o')
ax.set_xlabel('Number of Features (PCA Components)')
ax.set_ylabel('Execution Time (seconds)')
ax.set_title('Execution Time vs. Number of Features for kNN')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# Densify the chi-squared-selected matrices
# (assuming X_train_selected_clean and X_test_selected_clean are sparse)
X_train_dense, X_test_dense = (
    matrix.toarray() for matrix in (X_train_selected_clean, X_test_selected_clean)
)

# Standardize: learn the scaling from the training split, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train_dense)
X_train_std = scaler.transform(X_train_dense)
X_test_std = scaler.transform(X_test_dense)

# Define the range of PCA components
n_components_list = range(2, 26)
accuracies = []

# Loop over number of PCA components
for n in n_components_list:
    # Apply PCA to the standardized features. The seed keeps the projection,
    # and therefore the accuracy curve below, identical between runs should
    # PCA select a randomized solver.
    pca = PCA(n_components=n, random_state=42)
    X_train_pca = pca.fit_transform(X_train_std)
    X_test_pca = pca.transform(X_test_std)

    # Train k-NN with KD-tree and score it on the test split
    knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
    knn.fit(X_train_pca, y_train)
    pred = knn.predict(X_test_pca)
    acc = accuracy_score(y_test, pred)
    accuracies.append(acc)
    print(f'n_components={n}, accuracy={acc:.4f}')

# Draw the accuracy curve on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_components_list, accuracies, marker='o', linestyle='-', color='b')
ax.set_xlabel('Number of PCA Components')
ax.set_ylabel('Test Accuracy')
ax.set_title('Accuracy vs. Number of PCA Components for k-NN with KD-Tree')
ax.grid(True)
plt.show()