In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.2.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.2.0 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.2.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-tracing==3.2.0 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.2.0-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0

In [None]:
import re
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from lazypredict.Supervised import LazyClassifier

In [None]:
patterns = {
    "alphanumeric": r"^(?=.*[a-zA-Z])(?=.*\d)[a-zA-Z0-9]+$",
    "numbers": r"^\d+$",
    "alpha_num_sym": r"^(?=.*[a-zA-Z])(?=.*\d)(?=.*\W).+$",
    "num_sym": r"^(?=.*\d)(?=.*\W)[^a-zA-Z]+$",
    "alpha_sym": r"^(?=.*[a-zA-Z])(?=.*\W)[^\d]+$",
    "symbols": r"^[^a-zA-Z0-9\s]+$",
}

In [None]:
def categorize_word(word):
    for category, pattern in patterns.items():
        if re.match(pattern, word):
            return category
    return "unknown"

In [None]:
def compute_tf(message):
    words = re.findall(r'\S+', message)
    counts = {key: 0 for key in patterns}
    for word in words:
        cat = categorize_word(word)
        if cat in counts:
            counts[cat] += 1
    total = sum(counts.values())
    return {key: counts[key] / total if total > 0 else 0 for key in patterns}

In [None]:
def compute_idf(tf_data, categories):
    N = len(tf_data)
    idf_scores = {}
    for cat in categories:
        doc_count = sum(1 for tf in tf_data if tf[cat] > 0)
        idf_scores[cat] = np.log((N + 1) / (doc_count + 1)) + 1
    return idf_scores

In [None]:
## To execute the Zip file
def build_tf_idf_dataframe(zip_path, internal_filename="SMSSpamCollection"):
    messages, labels, tf_data = [], [], []

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        try:
            with zip_ref.open(internal_filename) as f:
                lines = f.read().decode("utf-8").splitlines()
        except UnicodeDecodeError:
            with zip_ref.open(internal_filename) as f:
                lines = f.read().decode("latin1").splitlines()

        for line in lines:
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
            label, message = parts
            tf = compute_tf(message)
            tf_data.append(tf)
            messages.append(message)
            labels.append(label)

    categories = list(patterns.keys())
    idf_scores = compute_idf(tf_data, categories)

    data = []
    for i in range(len(messages)):
        row = {f"tf_{cat}": tf_data[i][cat] for cat in categories}
        row.update({f"idf_{cat}": idf_scores[cat] for cat in categories})
        row["label"] = labels[i]
        row["message"] = messages[i]
        data.append(row)

    df = pd.DataFrame(data)
    return df, categories

In [None]:
# To execute a CSV file
def build_tf_idf_dataframe(input_file):
    messages, labels, tf_data = [], [], []

    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
        for line in lines:
            parts = line.strip().split(',', 1)
            if len(parts) != 2:
                continue
            label, message = parts
            tf = compute_tf(message)
            tf_data.append(tf)
            messages.append(message)
            labels.append(label)

    categories = list(patterns.keys())
    idf_scores = compute_idf(tf_data, categories)

    data = []
    for i in range(len(messages)):
        row = {f"tf_{cat}": tf_data[i][cat] for cat in categories}
        row.update({f"idf_{cat}": idf_scores[cat] for cat in categories})
        row["label"] = labels[i]
        row["message"] = messages[i]
        data.append(row)

    df = pd.DataFrame(data)
    return df, categories

In [None]:
input_file = "SMSSpamCollection.zip"  # Make sure this ZIP file is in your directory
output_file = "tf_idf_combined.csv"

In [None]:
##df, categories = build_tf_idf_dataframe(input_file, "SMSSpamCollection")
df, categories = build_tf_idf_dataframe(input_file)
df.to_csv(output_file, index=False)

le = LabelEncoder()
y = le.fit_transform(df["label"])

tf_cols = [f"tf_{cat}" for cat in categories]
idf_cols = [f"idf_{cat}" for cat in categories]
tf_idf_cols = [f"tf_idf_{cat}" for cat in categories]

X_tf = df[tf_cols].values
X_idf = df[idf_cols].values


In [None]:
df.head()

Unnamed: 0,tf_alphanumeric,tf_numbers,tf_alpha_num_sym,tf_num_sym,tf_alpha_sym,tf_symbols,idf_alphanumeric,idf_numbers,idf_alpha_num_sym,idf_num_sym,idf_alpha_sym,idf_symbols,label,message
0,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,message,label
1,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...,ham
2,0.0,0.0,0.0,0.0,0.25,0.75,5.67,3.47,4.09,3.87,2.24,2.08,"""हर छोटी कोशिश का बड़ा परिणाम होता है","उसे नजरअंदाज न करें।"",ham"
3,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,क्या तुमने मेरे पिछले संदेश का जवाब दिया?,ham
4,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...,spam


In [None]:
df ['tf_idf_alphanumeric'] = df['tf_alphanumeric']*df['idf_alphanumeric']
df ['tf_idf_numbers'] = df['tf_numbers'] * df['idf_numbers']
df ['tf_idf_alpha_num_sym'] = df['tf_alpha_num_sym'] * df['idf_alpha_num_sym']
df ['tf_idf_num_sym'] = df['tf_num_sym'] * df['idf_num_sym']
df ['tf_idf_alpha_sym'] = df['tf_alpha_sym'] * df['idf_alpha_sym']
df ['tf_idf_symbols'] = df['tf_symbols'] * df['idf_symbols']

In [None]:
df

Unnamed: 0,tf_alphanumeric,tf_numbers,tf_alpha_num_sym,tf_num_sym,tf_alpha_sym,tf_symbols,idf_alphanumeric,idf_numbers,idf_alpha_num_sym,idf_num_sym,idf_alpha_sym,idf_symbols,label,message,tf_idf_alphanumeric,tf_idf_numbers,tf_idf_alpha_num_sym,tf_idf_num_sym,tf_idf_alpha_sym,tf_idf_symbols
0,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,message,label,0.00,0.00,0.00,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...,ham,0.00,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.25,0.75,5.67,3.47,4.09,3.87,2.24,2.08,"""हर छोटी कोशिश का बड़ा परिणाम होता है","उसे नजरअंदाज न करें।"",ham",0.00,0.00,0.00,0.00,0.56,1.56
3,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,क्या तुमने मेरे पिछले संदेश का जवाब दिया?,ham,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...,spam,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,0.00,0.00,0.00,0.00,0.05,0.95,5.67,3.47,4.09,3.87,2.24,2.08,"""मैं सोने जा रहा हूं... मुझे अब पढ़ाई करने का ...",धन्यवाद... बेहतर होगा कि आप प्रार्थना करें कि...,0.00,0.00,0.00,0.00,0.11,1.98
4045,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,हमने मुफ्त जियो मोबाइल + मुफ्त कैमकॉर्डर के लि...,spam,0.00,0.00,0.00,0.00,0.00,0.00
4046,0.00,0.00,0.00,0.00,0.02,0.98,5.67,3.47,4.09,3.87,2.24,2.08,"""मैं चाहता हूं कि आप मुझे बताएं कि कोलग में क्...",क्या मुझे अपना केक ऑर्डर करना चाहिए या आप सब ...,0.00,0.00,0.00,0.00,0.05,2.04
4047,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,मैं आपको प्यार करता हूं!,ham,0.00,0.00,0.00,0.00,0.00,0.00


In [None]:
df_final = df.iloc[:, 12:19]

In [None]:
df_final

Unnamed: 0,label,message,tf_idf_alphanumeric,tf_idf_numbers,tf_idf_alpha_num_sym,tf_idf_num_sym,tf_idf_alpha_sym
0,message,label,0.00,0.00,0.00,0.00,0.00
1,एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...,ham,0.00,0.00,0.00,0.00,0.00
2,"""हर छोटी कोशिश का बड़ा परिणाम होता है","उसे नजरअंदाज न करें।"",ham",0.00,0.00,0.00,0.00,0.56
3,क्या तुमने मेरे पिछले संदेश का जवाब दिया?,ham,0.00,0.00,0.00,0.00,0.00
4,प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...,spam,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...
4044,"""मैं सोने जा रहा हूं... मुझे अब पढ़ाई करने का ...",धन्यवाद... बेहतर होगा कि आप प्रार्थना करें कि...,0.00,0.00,0.00,0.00,0.11
4045,हमने मुफ्त जियो मोबाइल + मुफ्त कैमकॉर्डर के लि...,spam,0.00,0.00,0.00,0.00,0.00
4046,"""मैं चाहता हूं कि आप मुझे बताएं कि कोलग में क्...",क्या मुझे अपना केक ऑर्डर करना चाहिए या आप सब ...,0.00,0.00,0.00,0.00,0.05
4047,मैं आपको प्यार करता हूं!,ham,0.00,0.00,0.00,0.00,0.00


In [None]:
y=df_final['label']

In [None]:
X = df_final.drop(['label','message'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display results
print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)  # You can tune 'n_neighbors'
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9479820627802691
Confusion Matrix:
 [[938  16]
 [ 42 119]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      0.98      0.97       954
        spam       0.88      0.74      0.80       161

    accuracy                           0.95      1115
   macro avg       0.92      0.86      0.89      1115
weighted avg       0.95      0.95      0.95      1115



In [None]:
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")

Accuracy : 0.9480
