In [87]:
import pandas as pd
import json
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import nltk
import random

In [88]:
# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set so membership checks during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Load Dataset from JSON
with open("dataset.json", mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())

# Build the working frame and show its columns followed by the first rows.
df = pd.DataFrame(data)
print(df.columns, df.head(), sep="\n")



Index(['Msg', 'Label'], dtype='object')
                                                 Msg     Label
0  CONGRATULATIONS! FREE 2GB data is yours! Claim...      spam
1  Hi! Thank you for being with Vi-India's FASTES...      spam
2  As part of Cyber Swachhta Pakhwada, CERT-In Go...      spam
3                  I will try to manage took tablets  not_spam
4  Study from Home with Vi!! Watch Kite Victers C...      spam


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PythonTeam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
# Ensure consistent column names: a text column called 'post' becomes 'Msg'.
# rename() ignores keys that are absent, so this is a no-op when the column
# is already named 'Msg'.
df = df.rename(columns={'post': 'Msg'})

# Encode labels as integers: spam -> 1, not_spam -> 0.
label_codes = {'spam': 1, 'not_spam': 0}

# Series.map with a dict turns every value that is not a key into NaN, which
# means re-running this cell on already-encoded labels would wipe the column.
# Passing unknown values through keeps the cell idempotent.
encoded_labels = df['Label'].map(lambda value: label_codes.get(value, value))

# Fail loudly on anything that is neither a known label nor an existing 0/1
# code instead of letting NaN/garbage reach the model as a target.
unexpected_labels = set(encoded_labels.unique()) - set(label_codes.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(map(str, unexpected_labels))}"
    )

# int64 matches the dtype the plain dict mapping produced.
df['Label'] = encoded_labels.astype('int64')

In [90]:
# Rename the text column to 'Post' once both expected columns are present.
# The frame is modified in place; 'Label' maps to itself and is left as-is.
required_columns = ('Msg', 'Label')
if all(column in df.columns for column in required_columns):
    column_renames = {'Msg': 'Post', 'Label': 'Label'}
    df.rename(columns=column_renames, inplace=True)


In [91]:
# Preview the first rows after the label encoding and the column rename.
df.head()

Unnamed: 0,Post,Label
0,CONGRATULATIONS! FREE 2GB data is yours! Claim...,1
1,Hi! Thank you for being with Vi-India's FASTES...,1
2,"As part of Cyber Swachhta Pakhwada, CERT-In Go...",1
3,I will try to manage took tablets,0
4,Study from Home with Vi!! Watch Kite Victers C...,1


In [92]:
# Data Augmentation with Evasive Techniques
def add_evasive_techniques(text, symbol_prob=0.3, rng=None):
    """Obfuscate ``text`` with look-alike substitutions and random symbol noise.

    The text is lower-cased, selected letters are replaced by look-alike
    digits/symbols (``a -> @``, ``e -> 3``, ...), and after each resulting
    character a symbol from ``* ~ # ! ?`` is appended with probability
    ``symbol_prob``.

    Args:
        text: Message to obfuscate. Non-string values (for example ``None``
            or NaN from a missing cell) are returned unchanged instead of
            raising ``AttributeError``.
        symbol_prob: Probability in ``[0, 1]`` of injecting a symbol after
            each character. The default of 0.3 reproduces the previous
            hard-coded ``random.random() > 0.7`` check.
        rng: Optional generator exposing ``random()`` and ``choice()``
            (e.g. ``random.Random(seed)``) for reproducible output. When
            omitted, the module-level ``random`` generator is used.

    Returns:
        The obfuscated string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    generator = random if rng is None else rng

    replacements = {
        'a': '@', 'e': '3', 'i': '1', 'o': '0', 's': '$', 'b': '8',
        't': '7', 'g': '6', 'z': '2'
    }
    # Character substitution
    evasive_text = text.lower().translate(str.maketrans(replacements))

    # Random symbol injection
    symbols = ['*', '~', '#', '!', '?']
    threshold = 1.0 - symbol_prob
    pieces = []
    for ch in evasive_text:
        pieces.append(ch)
        # Flip the coin first and only pick a symbol on success, so the
        # generator is consumed in the same order as before.
        if generator.random() > threshold:
            pieces.append(generator.choice(symbols))

    return ''.join(pieces)


In [93]:
# Apply augmentation to spam messages only (Label == 1) with 70% probability.
# Both the row selection below and add_evasive_techniques draw from the
# global `random` generator, so seed it first: otherwise every Restart & Run
# All produces a different augmented column. The value matches the
# random_state used for the train/test split.
AUGMENT_SEED = 42
random.seed(AUGMENT_SEED)


def _maybe_augment(row):
    """Return the obfuscated text for ~70% of spam rows, else the original text."""
    if row['Label'] == 1 and random.random() > 0.3:
        return add_evasive_techniques(row['Post'])
    return row['Post']


df['augmented_message'] = df.apply(_maybe_augment, axis=1)

In [94]:
# Preview the frame with the new 'augmented_message' column next to 'Post'.
df.head()

Unnamed: 0,Post,Label,augmented_message
0,CONGRATULATIONS! FREE 2GB data is yours! Claim...,1,c!0n6r@7ul#@710n$~!* ~fr3!3 268 ~d@?7@ ?1?$ y0...
1,Hi! Thank you for being with Vi-India's FASTES...,1,Hi! Thank you for being with Vi-India's FASTES...
2,"As part of Cyber Swachhta Pakhwada, CERT-In Go...",1,"@$ p#@r7 *0f cy*83r~ $*w@chh7@ ?p@kh~w*@d@,~ c..."
3,I will try to manage took tablets,0,I will try to manage took tablets
4,Study from Home with Vi!! Watch Kite Victers C...,1,$!7u?dy fr~0m ?h0m3~ w#17!h* ~v1!*! ~w@7ch k17...


In [95]:
def preprocess_text(text):
    """Normalize a message for vectorization.

    Every non-word character is replaced by a space, the result is
    lower-cased, and tokens found in the module-level ``stop_words`` set are
    dropped. The surviving tokens are joined with single spaces.

    Args:
        text: Raw message. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    if isinstance(text, str):
        normalized = re.sub(r'\W', ' ', text).lower()
        kept_tokens = (token for token in normalized.split() if token not in stop_words)
        return " ".join(kept_tokens)
    # Non-string input (e.g. NaN) is mapped to an empty document.
    return ""


# Clean the original 'Post' text element by element.
# NOTE(review): the 'augmented_message' column built earlier is not read here
# or in any later visible cell -- confirm whether training is meant to use the
# augmented text instead of 'Post'.
df['cleaned_message'] = df['Post'].map(preprocess_text)



## Split Data

In [96]:
# Features are the cleaned messages; targets are the encoded labels.
X, y = df['cleaned_message'], df['Label']

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

##  Tokenization and Padding for LSTM Model

In [97]:
# NOTE(review): `Tokenizer` and `pad_sequences` are not imported in the
# notebook's import cell, so unless they are imported somewhere not shown
# this cell raises NameError on a fresh kernel. They look like the Keras
# text-preprocessing helpers -- confirm and add the import to the import cell.
# NOTE(review): X_train_pad / X_test_pad are not used by the later cells
# visible here -- confirm whether this cell is still needed.
max_words = 5000  # passed to Tokenizer as num_words
max_len = 100  # passed to pad_sequences as maxlen
tokenizer = Tokenizer(num_words=max_words)
# Fit on the training split only, then apply the same tokenizer to both splits.
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Comparative Analysis of Machine Learning Models

## Naive Bayes Model & Logistic Regression

In [98]:
# TF-IDF Vectorization for Traditional Models
# The vocabulary (capped at 3000 features) is learned from the training split
# only; the test split is transformed with that fitted vectorizer.
tfidf = TfidfVectorizer(max_features=3000)
train_matrix = tfidf.fit_transform(X_train)
test_matrix = tfidf.transform(X_test)

# Densify both matrices for the downstream models.
X_train_tfidf = train_matrix.toarray()
X_test_tfidf = test_matrix.toarray()


In [99]:
# Logistic regression on the TF-IDF features (fit() returns the estimator).
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out split.
preds = model.predict(X_test_tfidf)
logreg_accuracy = accuracy_score(y_test, preds)
print("Accuracy:", logreg_accuracy)


Accuracy: 0.9706314243759178


In [100]:


# Multinomial Naive Bayes on the same TF-IDF features (fit() returns the estimator).
model2 = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out split.
preds2 = model2.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, preds2)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.9838472834067548


In [101]:
# Save both classifiers, then the fitted vectorizer they were trained with.
for estimator, path in (
    (model, 'spam_logReq.joblib'),
    (model2, 'spam_Navie.joblib'),
):
    joblib.dump(estimator, path)

# Kept as the final expression so the cell still displays the written path.
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']