## Import Neccessary Libraries

In [27]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from langdetect import detect, LangDetectException

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load Data

In [28]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df.rename(columns={'v1': 'label', 'v2': 'text'})
df = df[['label', 'text']]

## Preprocess Text

In [29]:
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

print(f"Original row count: {len(df)}")
df['is_en'] = df['text'].apply(is_english)
df = df[df['is_en'] == True].drop(columns=['is_en'])
print(f"Row count after removing non-English: {len(df)}")

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.strip()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text.lower())
    filtered_tokens = [w for w in tokens if w not in stop_words]

    return " ".join(filtered_tokens)

df['text2'] = df['text'].apply(preprocess_text)

df = df[df['text2'] != ""]

Original row count: 5572
Row count after removing non-English: 5043


## Feature Engineering

In [30]:
def feature_engineering(text):
    doc = nlp(text)
    total_tokens = len(doc)

    if total_tokens == 0:
        return pd.Series([0, 0, 0, 0, 0, 0])

    nouns = len([token for token in doc if token.pos_ == "NOUN"])
    verbs = len([token for token in doc if token.pos_ == "VERB"])
    adjs  = len([token for token in doc if token.pos_ == "ADJ"])

    noun_ratio = nouns / total_tokens
    verb_ratio = verbs / total_tokens
    adj_ratio  = adjs / total_tokens

    avg_token_len = sum(len(token.text) for token in doc) / total_tokens if total_tokens > 0 else 0

    unique_pos = len(set([token.pos_ for token in doc]))
    pos_diversity = unique_pos / total_tokens

    length = total_tokens

    return pd.Series([length, noun_ratio, verb_ratio, adj_ratio, avg_token_len, pos_diversity])

print("Processing calculate_text_stats...")

df = df.reset_index(drop=True)
df[['length', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'avg_token_len', 'pos_diversity']] = df['text2'].apply(feature_engineering)

Processing calculate_text_stats...


## Transformation

In [31]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

scaler = MinMaxScaler()
cols_to_norm = ['length', 'noun_ratio', 'verb_ratio', 'adj_ratio', 'avg_token_len', 'pos_diversity']
df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])

## BOW & Final Output

In [32]:
cv = CountVectorizer(max_features=300)
bow_matrix = cv.fit_transform(df['text2']).toarray()
bow_df = pd.DataFrame(bow_matrix, columns=cv.get_feature_names_out())

final_df = pd.concat([df[cols_to_norm], df[['label_encoded']], bow_df], axis=1)

print(f"Final Data Shape: {final_df.shape}")

print("\n--- Top 5 Samples ---")
display(final_df.head(5))

print("\n--- Bottom 5 Samples ---")
display(final_df.tail(5))

Final Data Shape: (5040, 307)

--- Top 5 Samples ---


Unnamed: 0,length,noun_ratio,verb_ratio,adj_ratio,avg_token_len,pos_diversity,label_encoded,account,actually,aight,...,would,xxx,ya,yeah,year,yes,yet,yo,youre,yup
0,0.182927,0.1875,0.125,0.125,0.080023,0.3125,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.231707,0.35,0.2,0.1,0.085981,0.175,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.097561,0.222222,0.222222,0.111111,0.046729,0.633333,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.097561,0.111111,0.222222,0.111111,0.074766,0.755556,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.207317,0.333333,0.111111,0.0,0.063863,0.266667,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0



--- Bottom 5 Samples ---


Unnamed: 0,length,noun_ratio,verb_ratio,adj_ratio,avg_token_len,pos_diversity,label_encoded,account,actually,aight,...,would,xxx,ya,yeah,year,yes,yet,yo,youre,yup
5035,0.170732,0.666667,0.066667,0.066667,0.08785,0.34,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5036,0.04878,0.4,0.2,0.0,0.080374,0.78,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5037,0.036585,0.75,0.0,0.25,0.130841,0.45,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5038,0.170732,0.266667,0.2,0.266667,0.091589,0.34,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5039,0.02439,0.333333,0.333333,0.333333,0.074766,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train & Test KNN Model

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

X = final_df.drop(columns=['label_encoded'])
y = final_df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

knn = KNeighborsClassifier(n_neighbors=5)

print("Training KNN model...")
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training set shape: (3528, 306)
Test set shape: (1512, 306)
Training KNN model...

--- Model Evaluation ---
Accuracy: 0.9511

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1300
           1       0.96      0.68      0.80       212

    accuracy                           0.95      1512
   macro avg       0.96      0.84      0.88      1512
weighted avg       0.95      0.95      0.95      1512

