In [1]:
import re
import string

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model      import LogisticRegression
from sklearn.naive_bayes       import MultinomialNB
from sklearn.ensemble          import RandomForestClassifier
from sklearn.model_selection   import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics           import (accuracy_score, precision_score,
                                       recall_score, f1_score, classification_report)

# 1) LOAD & CLEAN
df = pd.read_excel(r"C:\Users\rohit\OneDrive\Documents\Projects\Capstone Project\Training Dataset.xlsx")

# Drop stray header‐copy row & unnamed columns
df = df[df["Category"] != "Category"].copy()
df.drop(columns=[c for c in df.columns if c.startswith("Unnamed")], inplace=True)

# Rename for clarity
df.rename(columns={"Category": "Label", "Messages": "Text"}, inplace=True)

# Drop any missing texts
df.dropna(subset=["Text"], inplace=True)

# Encode labels: Ham=0, Spam=1
df["Label"] = df["Label"].map({"Ham": 0, "Spam": 1}).astype(int)

# 2) PREPROCESSING
def clean_telugu(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\d+", "", text)                     # remove digits
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)                    # collapse whitespace
    return text.strip()

df["Clean_Text"] = df["Text"].apply(clean_telugu)

# 3) FEATURE EXTRACTION
vectorizer = TfidfVectorizer(
    ngram_range=(1,2),   # unigrams + bigrams
    max_features=5000    # top 5k features by TF‑IDF score
)
X = vectorizer.fit_transform(df["Clean_Text"])
y = df["Label"]

# 4) TRAIN‑TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 5) MODEL SELECTION
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes":         MultinomialNB(),
    "Random Forest":       RandomForestClassifier(n_estimators=100, random_state=42),
}

results = []
for name, model in models.items():
    # 5a) (Optional) 5‑fold CV on training data
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_f1 = cross_val_score(model, X_train, y_train, cv=cv,
                             scoring="f1", n_jobs=-1).mean()
    
    # 5b) Fit & predict on hold‑out
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # 6) EVALUATION
    acc  = accuracy_score(y_test,  y_pred)
    prec = precision_score(y_test, y_pred)
    rec  = recall_score(y_test, y_pred)
    f1   = f1_score(y_test,   y_pred)
    results.append({
        "Model": name,
        "CV F1": round(cv_f1,4),
        "Test Accuracy": round(acc,4),
        "Precision": round(prec,4),
        "Recall": round(rec,4),
        "F1 Score": round(f1,4),
    })
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, digits=4))

# 7) SUMMARY
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
print("\nModel Comparison:")
print(results_df.to_string(index=False))



=== Logistic Regression ===
              precision    recall  f1-score   support

           0     0.9261    1.0000    0.9616       263
           1     1.0000    0.9508    0.9748       427

    accuracy                         0.9696       690
   macro avg     0.9630    0.9754    0.9682       690
weighted avg     0.9718    0.9696    0.9698       690


=== Naive Bayes ===
              precision    recall  f1-score   support

           0     0.9490    0.7072    0.8105       263
           1     0.8441    0.9766    0.9055       427

    accuracy                         0.8739       690
   macro avg     0.8966    0.8419    0.8580       690
weighted avg     0.8841    0.8739    0.8693       690


=== Random Forest ===
              precision    recall  f1-score   support

           0     0.9228    1.0000    0.9599       263
           1     1.0000    0.9485    0.9736       427

    accuracy                         0.9681       690
   macro avg     0.9614    0.9742    0.9667       690
w

In [None]:
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)

# 1) LOAD & CLEAN
df = pd.read_excel(r"C:\Users\rohit\OneDrive\Documents\Projects\Capstone Project\Training Dataset.xlsx")

# Drop stray header‐copy row & unnamed columns
df = df[df["Category"] != "Category"].copy()
df.drop(columns=[c for c in df.columns if c.startswith("Unnamed")], inplace=True)

# Rename for clarity
df.rename(columns={"Category": "Label", "Messages": "Text"}, inplace=True)

# Drop any missing texts
df.dropna(subset=["Text"], inplace=True)

# Encode labels: Ham=0, Spam=1
df["Label"] = df["Label"].map({"Ham": 0, "Spam": 1}).astype(int)

# 2) EXPLORATORY DATA ANALYSIS (EDA)
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='Label')
plt.xticks([0, 1], ['Ham', 'Spam'])
plt.title("Distribution of Spam vs. Ham")
plt.xlabel("Label")
plt.ylabel("Count")
plt.show()

# Message length distribution
df['Text_Length'] = df['Text'].apply(len)
plt.figure(figsize=(8, 5))
sns.histplot(df, x='Text_Length', hue='Label', bins=50, kde=True)
plt.title("Message Length Distribution by Class")
plt.xlabel("Text Length")
plt.ylabel("Frequency")
plt.show()

# Top 20 words in spam and ham
spam_words = " ".join(df[df['Label'] == 1]['Text']).split()
ham_words = " ".join(df[df['Label'] == 0]['Text']).split()

spam_freq = Counter(spam_words)
ham_freq = Counter(ham_words)

spam_common = pd.DataFrame(spam_freq.most_common(20), columns=['Word', 'Frequency'])
ham_common = pd.DataFrame(ham_freq.most_common(20), columns=['Word', 'Frequency'])

plt.figure(figsize=(10, 5))
sns.barplot(data=spam_common, x='Frequency', y='Word', color='red')
plt.title("Top 20 Words in Spam Messages")
plt.xlabel("Frequency")
plt.ylabel("Word")
plt.show()

plt.figure(figsize=(10, 5))
sns.barplot(data=ham_common, x='Frequency', y='Word', color='green')
plt.title("Top 20 Words in Ham Messages")
plt.xlabel("Frequency")
plt.ylabel("Word")
plt.show()

# 3) TEXT PREPROCESSING
def clean_telugu(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["Clean_Text"] = df["Text"].apply(clean_telugu)

# 4) FEATURE EXTRACTION
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df["Clean_Text"])
y = df["Label"]

# 5) TRAIN‑TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 6) MODEL SELECTION
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

results = []

for name, model in models.items():
    # Cross-validation F1 on training set
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_f1 = cross_val_score(model, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1).mean()

    # Train & predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "CV F1": round(cv_f1, 4),
        "Test Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1 Score": round(f1, 4),
    })

    # Print report
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, digits=4))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Ham", "Spam"])
    disp.plot(cmap="Blues")
    plt.title(f"Confusion Matrix – {name}")
    plt.show()

# 7) COMPARISON RESULTS
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
print("\nModel Comparison:")
print(results_df.to_string(index=False))

# 8) MODEL COMPARISON VISUALIZATION
plt.figure(figsize=(10, 6))
sns.barplot(data=results_df, x="F1 Score", y="Model", palette="mako")
plt.title("Model Comparison by F1 Score")
plt.xlabel("F1 Score")
plt.ylabel("Model")
plt.xlim(0, 1)
plt.grid(axis='x')
plt.show()


In [None]:
def predict_new_telugu(text, model=models["Logistic Regression"]):
    cleaned = clean_telugu(text)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    return "Spam" if prediction == 1 else "Ham"


test_texts = [
    "మీరు ఒక లక్ష రూపాయలు గెలుచుకున్నారు! వివరాల కోసం ఈ లింక్‌ను క్లిక్ చేయండి.",
    "ఈరోజు మీ విద్యార్థి కార్డు తీసుకురండి.",
    "మీ బ్యాంకు ఖాతా సక్రియం చేయడానికి OTP పంపబడింది.",
    "పాఠశాల గురువారం ఉదయం 9:00కి ప్రారంభమవుతుంది.",
    "కాఫీకి వెళ్లుదాం నన్ను పిక్ చేయ్.",
    "కొత్త స్నేహితులు మీ కాల్ కోసం ఎదురు చూస్తున్నారు.డయల్ 5567866110.",
    "మీరు ఏమి చేస్తున్నారు.",
    "నువ్వు రేపు ఇంటికి వస్తావా?"
]

for txt in test_texts:
    result = predict_new_telugu(txt)
    print(f" '{txt}' → {result}")