In [None]:
import re
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

patterns = {
    "alphanumeric": r"^(?=.*[a-zA-Z])(?=.*\d)[a-zA-Z0-9]+$",
    "numbers": r"^\d+$",
    "alpha_num_sym": r"^(?=.*[a-zA-Z])(?=.*\d)(?=.*\W).+$",
    "num_sym": r"^(?=.*\d)(?=.*\W)[^a-zA-Z]+$",
    "alpha_sym": r"^(?=.*[a-zA-Z])(?=.*\W)[^\d]+$",
    "symbols": r"^[^a-zA-Z0-9\s]+$",
}

def categorize_word(word):
    for category, pattern in patterns.items():
        if re.match(pattern, word):
            return category
    return "unknown"

# TF
def compute_tf(message):
    words = re.findall(r'\S+', message)
    counts = {key: 0 for key in patterns}
    for word in words:
        cat = categorize_word(word)
        if cat in counts:
            counts[cat] += 1
    total = sum(counts.values())
    return {key: counts[key] / total if total > 0 else 0 for key in patterns}

# IDF
def compute_idf(tf_data, categories):
    N = len(tf_data)
    idf_scores = {}
    for cat in categories:
        doc_count = sum(1 for tf in tf_data if tf[cat] > 0)
        idf_scores[cat] = np.log((N + 1) / (doc_count + 1)) + 1
    return idf_scores

def build_tf_idf_dataframe(zip_path, internal_filename="SMSSpamCollection"):
    messages, labels, tf_data = [], [], []

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        try:
            with zip_ref.open(internal_filename) as f:
                lines = f.read().decode("utf-8").splitlines()
        except UnicodeDecodeError:
            with zip_ref.open(internal_filename) as f:
                lines = f.read().decode("latin1").splitlines()

        for line in lines:
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
            label, message = parts
            tf = compute_tf(message)
            tf_data.append(tf)
            messages.append(message)
            labels.append(label)

    categories = list(patterns.keys())
    idf_scores = compute_idf(tf_data, categories)

    data = []
    for i in range(len(messages)):
        row = {f"tf_{cat}": tf_data[i][cat] for cat in categories}
        row.update({f"idf_{cat}": idf_scores[cat] for cat in categories})
        row["label"] = labels[i]
        row["message"] = messages[i]
        data.append(row)

    df = pd.DataFrame(data)
    return df, categories

# Cosine classifier
def cosine_classifier(X_train, y_train, X_input):
    sim_matrix = cosine_similarity(X_input, X_train)
    return np.array([y_train[sim.argmax()] for sim in sim_matrix])

input_file = "SMSSpamCollection.zip"  # Must be in the same directory

df, categories = build_tf_idf_dataframe(input_file, "SMSSpamCollection")
le = LabelEncoder()
y = le.fit_transform(df["label"])

tf_cols = [f"tf_{cat}" for cat in categories]
idf_cols = [f"idf_{cat}" for cat in categories]
X_combined = np.hstack((df[tf_cols].values, df[idf_cols].values))

# Split into 70% train, 30% test
X_train, X_test, y_train, y_test, df_train, df_test = train_test_split(
    X_combined, y, df, test_size=0.3, random_state=42
)

y_pred_train = cosine_classifier(X_train, y_train, X_train)
y_pred_test = cosine_classifier(X_train, y_train, X_test)

def print_metrics(split, y_true, y_pred):
    print(f"\n=== {split} Results (Combined TF + IDF with Cosine Similarity) ===")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, zero_division=0):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred, zero_division=0):.4f}")

print_metrics("Train", y_train, y_pred_train)
print_metrics("Test", y_test, y_pred_test)

df_train["true_label"] = y_train
df_train["predicted"] = y_pred_train
df_train.to_csv("train_results.csv", index=False)

df_test["true_label"] = y_test
df_test["predicted"] = y_pred_test
df_test.to_csv("test_results.csv", index=False)



=== Train Results (Combined TF + IDF with Cosine Similarity) ===
Accuracy : 0.9613
Precision: 0.8776
Recall   : 0.8253

=== Test Results (Combined TF + IDF with Cosine Similarity) ===
Accuracy : 0.9438
Precision: 0.8333
Recall   : 0.7301


In [None]:






# Hindi dataset

In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.3.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.3.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0

In [None]:
import re
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from lazypredict.Supervised import LazyClassifier

In [None]:
patterns = {
    "alphanumeric": r"^(?=.*[a-zA-Z])(?=.*\d)[a-zA-Z0-9]+$",
    "numbers": r"^\d+$",
    "alpha_num_sym": r"^(?=.*[a-zA-Z])(?=.*\d)(?=.*\W).+$",
    "num_sym": r"^(?=.*\d)(?=.*\W)[^a-zA-Z]+$",
    "alpha_sym": r"^(?=.*[a-zA-Z])(?=.*\W)[^\d]+$",
    "symbols": r"^[^a-zA-Z0-9\s]+$",
}

In [None]:
def categorize_word(word):
    for category, pattern in patterns.items():
        if re.match(pattern, word):
            return category
    return "unknown"

In [None]:
def compute_tf(message):
    words = re.findall(r'\S+', message)
    counts = {key: 0 for key in patterns}
    for word in words:
        cat = categorize_word(word)
        if cat in counts:
            counts[cat] += 1
    total = sum(counts.values())
    return {key: counts[key] / total if total > 0 else 0 for key in patterns}

In [None]:
def compute_idf(tf_data, categories):
    N = len(tf_data)
    idf_scores = {}
    for cat in categories:
        doc_count = sum(1 for tf in tf_data if tf[cat] > 0)
        idf_scores[cat] = np.log((N + 1) / (doc_count + 1)) + 1
    return idf_scores

In [None]:
## To execute the Zip file
def build_tf_idf_dataframe(zip_path, internal_filename="Hindi.csv"):
    messages, labels, tf_data = [], [], []

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        try:
            with zip_ref.open(internal_filename) as f:
                lines = f.read().decode("utf-8").splitlines()
        except UnicodeDecodeError:
            with zip_ref.open(internal_filename) as f:
                lines = f.read().decode("latin1").splitlines()

        for line in lines:
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
            label, message = parts
            tf = compute_tf(message)
            tf_data.append(tf)
            messages.append(message)
            labels.append(label)

    categories = list(patterns.keys())
    idf_scores = compute_idf(tf_data, categories)

    data = []
    for i in range(len(messages)):
        row = {f"tf_{cat}": tf_data[i][cat] for cat in categories}
        row.update({f"idf_{cat}": idf_scores[cat] for cat in categories})
        row["label"] = labels[i]
        row["message"] = messages[i]
        data.append(row)

    df = pd.DataFrame(data)
    return df, categories

In [None]:
# To execute a CSV file
def build_tf_idf_dataframe(input_file):
    messages, labels, tf_data = [], [], []

    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
        for line in lines:
            parts = line.strip().split(',', 1)
            if len(parts) != 2:
                continue
            label, message = parts
            tf = compute_tf(message)
            tf_data.append(tf)
            messages.append(message)
            labels.append(label)

    categories = list(patterns.keys())
    idf_scores = compute_idf(tf_data, categories)

    data = []
    for i in range(len(messages)):
        row = {f"tf_{cat}": tf_data[i][cat] for cat in categories}
        row.update({f"idf_{cat}": idf_scores[cat] for cat in categories})
        row["label"] = labels[i]
        row["message"] = messages[i]
        data.append(row)

    df = pd.DataFrame(data)
    return df, categories

In [None]:
input_file = "Hindi.csv"  # Make sure this ZIP file is in your directory
output_file = "tf_idf_combined.csv"

In [None]:
##df, categories = build_tf_idf_dataframe(input_file, "/content/Hindi.csv")
df, categories = build_tf_idf_dataframe(input_file)
df.to_csv(output_file, index=False)

le = LabelEncoder()
y = le.fit_transform(df["label"])

tf_cols = [f"tf_{cat}" for cat in categories]
idf_cols = [f"idf_{cat}" for cat in categories]
tf_idf_cols = [f"tf_idf_{cat}" for cat in categories]

X_tf = df[tf_cols].values
X_idf = df[idf_cols].values




In [None]:
df.head()


Unnamed: 0,tf_alphanumeric,tf_numbers,tf_alpha_num_sym,tf_num_sym,tf_alpha_sym,tf_symbols,idf_alphanumeric,idf_numbers,idf_alpha_num_sym,idf_num_sym,idf_alpha_sym,idf_symbols,label,message
0,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,message,label
1,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...,ham
2,0.0,0.0,0.0,0.0,0.25,0.75,5.67,3.47,4.09,3.87,2.24,2.08,"""हर छोटी कोशिश का बड़ा परिणाम होता है","उसे नजरअंदाज न करें।"",ham"
3,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,क्या तुमने मेरे पिछले संदेश का जवाब दिया?,ham
4,0.0,0.0,0.0,0.0,0.0,0.0,5.67,3.47,4.09,3.87,2.24,2.08,प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...,spam


In [None]:
df ['tf_idf_alphanumeric'] = df['tf_alphanumeric']*df['idf_alphanumeric']
df ['tf_idf_numbers'] = df['tf_numbers'] * df['idf_numbers']
df ['tf_idf_alpha_num_sym'] = df['tf_alpha_num_sym'] * df['idf_alpha_num_sym']
df ['tf_idf_num_sym'] = df['tf_num_sym'] * df['idf_num_sym']
df ['tf_idf_alpha_sym'] = df['tf_alpha_sym'] * df['idf_alpha_sym']
df ['tf_idf_symbols'] = df['tf_symbols'] * df['idf_symbols']

In [None]:
df

Unnamed: 0,tf_alphanumeric,tf_numbers,tf_alpha_num_sym,tf_num_sym,tf_alpha_sym,tf_symbols,idf_alphanumeric,idf_numbers,idf_alpha_num_sym,idf_num_sym,idf_alpha_sym,idf_symbols,label,message,tf_idf_alphanumeric,tf_idf_numbers,tf_idf_alpha_num_sym,tf_idf_num_sym,tf_idf_alpha_sym,tf_idf_symbols
0,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,message,label,0.00,0.00,0.00,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...,ham,0.00,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.25,0.75,5.67,3.47,4.09,3.87,2.24,2.08,"""हर छोटी कोशिश का बड़ा परिणाम होता है","उसे नजरअंदाज न करें।"",ham",0.00,0.00,0.00,0.00,0.56,1.56
3,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,क्या तुमने मेरे पिछले संदेश का जवाब दिया?,ham,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...,spam,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,0.00,0.00,0.00,0.00,0.05,0.95,5.67,3.47,4.09,3.87,2.24,2.08,"""मैं सोने जा रहा हूं... मुझे अब पढ़ाई करने का ...",धन्यवाद... बेहतर होगा कि आप प्रार्थना करें कि...,0.00,0.00,0.00,0.00,0.11,1.98
4045,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,हमने मुफ्त जियो मोबाइल + मुफ्त कैमकॉर्डर के लि...,spam,0.00,0.00,0.00,0.00,0.00,0.00
4046,0.00,0.00,0.00,0.00,0.02,0.98,5.67,3.47,4.09,3.87,2.24,2.08,"""मैं चाहता हूं कि आप मुझे बताएं कि कोलग में क्...",क्या मुझे अपना केक ऑर्डर करना चाहिए या आप सब ...,0.00,0.00,0.00,0.00,0.05,2.04
4047,0.00,0.00,0.00,0.00,0.00,0.00,5.67,3.47,4.09,3.87,2.24,2.08,मैं आपको प्यार करता हूं!,ham,0.00,0.00,0.00,0.00,0.00,0.00


In [None]:
#df_final = df.iloc[:, 12:19]

df_final = df.iloc[:, 12:19]
y = df_final['label']
X = df_final.drop(['label','message'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)


  0%|          | 0/32 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                             Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                        
LabelSpreading                   0.01               0.01    None      0.01   
LabelPropagation                 0.01               0.01    None      0.01   
RandomForestClassifier           0.01               0.01    None      0.00   
ExtraTreesClassifier             0.01               0.01    None      0.01   
ExtraTreeClassifier              0.01               0.01    None      0.01   
BaggingClassifier                0.01               0.01    None      0.00   
LinearDiscriminantAnalysis       0.01               0.00    None      0.00   
DecisionTreeClassifier           0.01               0.00    None      0.00   
BernoulliNB                      0.01               0.00    None      0.00   
SVC                              0.01               0.00    None      0.00   

In [None]:
df_final

Unnamed: 0,label,message,tf_idf_alphanumeric,tf_idf_numbers,tf_idf_alpha_num_sym,tf_idf_num_sym,tf_idf_alpha_sym
0,message,label,0.00,0.00,0.00,0.00,0.00
1,एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...,ham,0.00,0.00,0.00,0.00,0.00
2,"""हर छोटी कोशिश का बड़ा परिणाम होता है","उसे नजरअंदाज न करें।"",ham",0.00,0.00,0.00,0.00,0.56
3,क्या तुमने मेरे पिछले संदेश का जवाब दिया?,ham,0.00,0.00,0.00,0.00,0.00
4,प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...,spam,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...
4044,"""मैं सोने जा रहा हूं... मुझे अब पढ़ाई करने का ...",धन्यवाद... बेहतर होगा कि आप प्रार्थना करें कि...,0.00,0.00,0.00,0.00,0.11
4045,हमने मुफ्त जियो मोबाइल + मुफ्त कैमकॉर्डर के लि...,spam,0.00,0.00,0.00,0.00,0.00
4046,"""मैं चाहता हूं कि आप मुझे बताएं कि कोलग में क्...",क्या मुझे अपना केक ऑर्डर करना चाहिए या आप सब ...,0.00,0.00,0.00,0.00,0.05
4047,मैं आपको प्यार करता हूं!,ham,0.00,0.00,0.00,0.00,0.00


In [None]:
y=df_final['label']

In [None]:
X = df_final.drop(['label','message'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display results
print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)  # You can tune 'n_neighbors'
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.003703703703703704
Confusion Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  precision    recall  f1-score   support

                                                                                                                                                                                                                            

In [None]:
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")

Accuracy : 0.0037


In [None]:






#english dataset




In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.3.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.3.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0

In [None]:
import re
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from lazypredict.Supervised import LazyClassifier

In [None]:
patterns = {
    "alphanumeric": r"^(?=.*[a-zA-Z])(?=.*\d)[a-zA-Z0-9]+$",
    "numbers": r"^\d+$",
    "alpha_num_sym": r"^(?=.*[a-zA-Z])(?=.*\d)(?=.*\W).+$",
    "num_sym": r"^(?=.*\d)(?=.*\W)[^a-zA-Z]+$",
    "alpha_sym": r"^(?=.*[a-zA-Z])(?=.*\W)[^\d]+$",
    "symbols": r"^[^a-zA-Z0-9\s]+$",
}


In [None]:
def categorize_word(word):
    for category, pattern in patterns.items():
        if re.match(pattern, word):
            return category
    return "unknown"

# Compute Term Frequency
def compute_tf(message):
    words = re.findall(r'\S+', message)
    counts = {key: 0 for key in patterns}
    for word in words:
        cat = categorize_word(word)
        if cat in counts:
            counts[cat] += 1
    total = sum(counts.values())
    return {key: counts[key]/total if total > 0 else 0 for key in patterns}

# Compute Inverse Document Frequency
def compute_idf(tf_data, categories):
    N = len(tf_data)
    idf_scores = {}
    for cat in categories:
        doc_count = sum(1 for tf in tf_data if tf[cat] > 0)
        idf_scores[cat] = np.log((N+1)/(doc_count+1)) + 1
    return idf_scores


In [None]:
def build_tf_idf_dataframe_text(input_file):
    messages, labels, tf_data = [], [], []
    start_time = time.time()

    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
        for line in lines:
            parts = line.strip().split('\t', 1)
            if len(parts) != 2:
                continue
            label, message = parts
            tf = compute_tf(message)
            tf_data.append(tf)
            messages.append(message)
            labels.append(label)

    categories = list(patterns.keys())
    idf_scores = compute_idf(tf_data, categories)

    data = []
    for i in range(len(messages)):
        row = {f"tf_{cat}": tf_data[i][cat] for cat in categories}
        row.update({f"idf_{cat}": idf_scores[cat] for cat in categories})
        row["label"] = labels[i]
        row["message"] = messages[i]
        data.append(row)

    df = pd.DataFrame(data)
    tfidf_time = time.time() - start_time
    print(f"TF-IDF Construction Time: {tfidf_time:.2f} seconds")

    return df, categories, tfidf_time


In [None]:
 input_file = "SMSSpamCollection"
df, categories, tfidf_time = build_tf_idf_dataframe_text(input_file)

# Compute combined TF-IDF
df['tf_idf_alphanumeric'] = df['tf_alphanumeric'] * df['idf_alphanumeric']
df['tf_idf_numbers'] = df['tf_numbers'] * df['idf_numbers']
df['tf_idf_alpha_num_sym'] = df['tf_alpha_num_sym'] * df['idf_alpha_num_sym']
df['tf_idf_num_sym'] = df['tf_num_sym'] * df['idf_num_sym']
df['tf_idf_alpha_sym'] = df['tf_alpha_sym'] * df['idf_alpha_sym']
df['tf_idf_symbols'] = df['tf_symbols'] * df['idf_symbols']


combined_columns = []
for cat in categories:
    df[f"tf_idf_check_{cat}"] = df[f"tf_{cat}"] * df[f"idf_{cat}"]
    combined_columns.extend([f"tf_{cat}", f"idf_{cat}", f"tf_idf_check_{cat}"])

combined_columns.extend(["label", "message"])
df_combined_check = df[combined_columns]

print("TF, IDF, and TF-IDF combined check (first 5 rows):")
print(df_combined_check.head())

# Save CSVs
df_combined_check.to_csv("tf_idf_combined_check.csv", index=False)
df_final = df[[f"tf_idf_{cat}" for cat in categories] + ["label", "message"]]
df_final.to_csv("tf_idf_features_final.csv", index=False)
print("Saved CSVs: 'tf_idf_combined_check.csv' and 'tf_idf_features_final.csv'")


TF-IDF Construction Time: 0.42 seconds
TF, IDF, and TF-IDF combined check (first 5 rows):
   tf_alphanumeric  idf_alphanumeric  tf_idf_check_alphanumeric  tf_numbers  \
0             0.00              3.38                       0.00        0.00   
1             0.00              3.38                       0.00        0.00   
2             0.14              3.38                       0.48        0.29   
3             0.00              3.38                       0.00        0.00   
4             0.00              3.38                       0.00        0.00   

   idf_numbers  tf_idf_check_numbers  tf_alpha_num_sym  idf_alpha_num_sym  \
0         2.63                  0.00              0.00               3.47   
1         2.63                  0.00              0.00               3.47   
2         2.63                  0.75              0.14               3.47   
3         2.63                  0.00              0.00               3.47   
4         2.63                  0.00              

In [None]:
X = df_final[[f"tf_idf_{cat}" for cat in categories]].values
y = df_final["label"].values

le = LabelEncoder()
y_encoded = le.fit_transform(y)  # ham=0, spam=1

# Train-test split 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)


In [None]:
clf_lazy = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
start_lazy = time.time()
models, predictions = clf_lazy.fit(X_train, X_test, y_train, y_test)
lazy_time = time.time() - start_lazy

print(f"LazyClassifier Time: {lazy_time:.2f} seconds")
print("LazyClassifier Model Comparison:\n")
print(models)


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 521, number of negative: 3380
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 195
[LightGBM] [Info] Number of data points in the train set: 3901, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.133555 -> initscore=-1.869881
[LightGBM] [Info] Start training from score -1.869881
LazyClassifier Time: 3.65 seconds
LazyClassifier Model Comparison:

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
QuadraticDiscriminantAnalysis      0.90               0.87     0.87      0.91   
XGBClassifier                      0.95               0.87     0.87      0.95   
LabelSpreading                     0.95               0.

In [None]:
start_train = time.time()
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
train_time = time.time() - start_train

start_test = time.time()
y_pred = rf.predict(X_test)
test_time = time.time() - start_test

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred)

print("\n--- Random Forest Performance ---")
print(f"TF-IDF Construction Time: {tfidf_time:.2f} sec")
print(f"Training Time: {train_time:.2f} sec")
print(f"Testing Time: {test_time:.2f} sec")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")
print(f"AUROC     : {auroc:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



--- Random Forest Performance ---
TF-IDF Construction Time: 0.42 sec
Training Time: 0.86 sec
Testing Time: 0.07 sec
Accuracy  : 0.9474
Precision : 0.8416
Recall    : 0.7522
F1-score  : 0.7944
AUROC     : 0.8650

Confusion Matrix:
 [[1415   32]
 [  56  170]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      1447
           1       0.84      0.75      0.79       226

    accuracy                           0.95      1673
   macro avg       0.90      0.87      0.88      1673
weighted avg       0.95      0.95      0.95      1673



In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


df = pd.read_csv("Hindi (1).csv")

print("Columns in dataset:", df.columns)
print(df.head())

X = df["message"].astype(str)
y = df["label"].astype(str)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

cv_scores = cross_val_score(clf, X_vec, y, cv=5, scoring="accuracy")
print("\nCross-validation Accuracy (5-fold):", cv_scores.mean())



Columns in dataset: Index(['message', 'label'], dtype='object')
                                             message label
0  एक डॉलर का ख़रीदारी ख़रीदारी के लिए अपने घर जा...   ham
1  हर छोटी कोशिश का बड़ा परिणाम होता है, उसे नजरअ...   ham
2          क्या तुमने मेरे पिछले संदेश का जवाब दिया?   ham
3  प्रति जोड़ा *मनाली 3एन/4डी@14 999 *शिमला 2एन3ड...  spam
4  बिबाह में समस्या?❤️ प्यार को पाने का सही तरीका...  spam
Accuracy: 0.926829268292683

Confusion Matrix:
 [[490   8]
 [ 49 232]]

Classification Report:
               precision    recall  f1-score   support

         ham     0.9091    0.9839    0.9450       498
        spam     0.9667    0.8256    0.8906       281

    accuracy                         0.9268       779
   macro avg     0.9379    0.9048    0.9178       779
weighted avg     0.9299    0.9268    0.9254       779


Cross-validation Accuracy (5-fold): 0.9234725160132132


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


df = pd.read_csv("DS2(1) (1).csv")

print("Columns in dataset:", df.columns)
print(df.head())

X = df["message"].astype(str)
y = df["label"].astype(str)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

cv_scores = cross_val_score(clf, X_vec, y, cv=5, scoring="accuracy")
print("\nCross-validation Accuracy (5-fold):", cv_scores.mean())



Columns in dataset: Index(['message', 'label'], dtype='object')
                                             message label
0  உங்கள் டீமை நண்பர்களுடன் உற்சாகப்படுத்தவும் 59...  Spam
1  போரடிக்கிறதா? களிப்பை தொடர்க! ரம்மி ஆடி பணம் வ...  Spam
2  போரடிக்கிறதா? களிப்பை தொடர்க! ரம்மி ஆடி பணம் வ...  Spam
3  உங்கள் டீமை நண்பர்களுடன் உற்சாகப்படுத்தவும் 59...  Spam
4  இது எப்படி இருக்கு?தமிழா காம்போ பேக்,ஜீ,சன்,வி...  Spam
Accuracy: 0.9619845360824743

Confusion Matrix:
 [[1261   10]
 [  49  232]]

Classification Report:
               precision    recall  f1-score   support

         Ham     0.9626    0.9921    0.9771      1271
        Spam     0.9587    0.8256    0.8872       281

    accuracy                         0.9620      1552
   macro avg     0.9606    0.9089    0.9322      1552
weighted avg     0.9619    0.9620    0.9609      1552


Cross-validation Accuracy (5-fold): 0.923427934754432


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


df = pd.read_csv("IIITD Precog (1).csv")

print("Columns in dataset:", df.columns)
print(df.head())

X = df["message"].astype(str)
y = df["label"].astype(str)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

cv_scores = cross_val_score(clf, X_vec, y, cv=5, scoring="accuracy")
print("\nCross-validation Accuracy (5-fold):", cv_scores.mean())



Columns in dataset: Index(['label', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')
  label                                            message Unnamed: 2  \
0   ham  Husband Suhag raat pe- Tum ne kabhi Blue Film ...        NaN   
1   ham  Husband n wife were opening joint bank ac Husb...        NaN   
2   ham  indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...        NaN   
3   ham  Why do Muslims hate Pigs !!? becz- Pigs produc...        NaN   
4   ham  Incase Notice koi na dekha ho,aj evry clas md ...        NaN   

  Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6  
0        NaN        NaN        NaN        NaN  
1        NaN        NaN        NaN        NaN  
2        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN  
4        NaN        NaN        NaN        NaN  
Accuracy: 0.965

Confusion Matrix:
 [[196   4]
 [ 10 190]]

Classification Report:
               precision    recall  f1-scor

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


df = pd.read_csv("/content/DS1 revisedindiandataset (1).csv")

print("Columns in dataset:", df.columns)
print(df.head())

X = df["message"].astype(str)
y = df["label"].astype(str)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

cv_scores = cross_val_score(clf, X_vec, y, cv=5, scoring="accuracy")
print("\nCross-validation Accuracy (5-fold):", cv_scores.mean())



Columns in dataset: Index(['label', 'message'], dtype='object')
  label                                            message
0   ham  Dear Customer, +916300623587 is now available ...
1   ham  Dear Customer, You have a missed call from +91...
2  spam  Join Hike to get Rs 40. Earn upto Rs. 10,000 b...
3  spam  Just sent you some money and invited you to Hi...
4  spam  Just sent you some money and invited you to Hi...
Accuracy: 0.9595185995623632

Confusion Matrix:
 [[660  12]
 [ 25 217]]

Classification Report:
               precision    recall  f1-score   support

         ham     0.9635    0.9821    0.9727       672
        spam     0.9476    0.8967    0.9214       242

    accuracy                         0.9595       914
   macro avg     0.9556    0.9394    0.9471       914
weighted avg     0.9593    0.9595    0.9592       914


Cross-validation Accuracy (5-fold): 0.9146006744303652
