In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
df = pd.read_csv("https://raw.githubusercontent.com/ShreyaSaha102/Spam_Classifier/main/spam.csv", encoding = 'latin-1')
# Keep only necessary columns
df = df[['v1', 'v2']]
df.columns = ['label','text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df['label'].value_counts()

#convert label to 0 & 1
df['label_num']=df['label'].map({'ham':0,'spam':1})

print(df.shape)
print(df.isnull().sum())
df.head()

(5572, 3)
label        0
text         0
label_num    0
dtype: int64


Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [12]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
import re
import string

def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove digits and punctuation
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join back into a single string
    return ' '.join(cleaned)


In [14]:
#Applying preprocessing to dataset
df['clean_text'] = df['text'].apply(preprocess_text)
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [16]:
#Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])

In [17]:
#Encode label
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])  # assuming 'label' column is spam/ham

In [18]:
#Train Test splits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Classify using Naive Bayes**

In [19]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [20]:
#Metric calculation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

Accuracy: 0.9704035874439462
Precision: 0.9915966386554622
Recall: 0.7866666666666666
F1 Score: 0.8773234200743495
AUC: 0.8928151986183074


In [49]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
    'fit_prior': [True, False]
}

nb = MultinomialNB()
grid_nb = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

grid_nb.fit(X_train, y_train)
print("Best Parameters:", grid_nb.best_params_)

#Best parameters
best_nb = grid_nb.best_estimator_

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = best_nb.predict(X_test)

#Calculate Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

Best Parameters: {'alpha': 0.01, 'fit_prior': True}
Accuracy: 0.9766816143497757
Precision: 0.9428571428571428
Recall: 0.88
F1 Score: 0.9103448275862069
AUC: 0.9358549222797926


In [None]:
#Improved after using GridSearchCV

# **Classify using Logistic Regression**

In [31]:
from sklearn.linear_model import LogisticRegression

log_model=LogisticRegression()
log_model.fit(X_train,y_train)

y_pred = log_model.predict(X_test)

In [32]:
#Metric calculation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = log_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

Accuracy: 0.9497757847533632
Precision: 0.9607843137254902
Recall: 0.6533333333333333
F1 Score: 0.7777777777777778
AUC: 0.824594127806563


**Tuning with GridSearchCV**

In [34]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

log_grid_model=LogisticRegression()
grid_search = GridSearchCV(
    estimator=log_grid_model,
    param_grid=param_grid,
    cv=5,                   # 5-fold cross-validation
    scoring='accuracy',     # or 'f1', 'roc_auc'
    n_jobs=-1,              # Use all cores
)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}




In [36]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

Accuracy: 0.9730941704035875
Precision: 0.96875
Recall: 0.8266666666666667
F1 Score: 0.8920863309352518
AUC: 0.9112607944732297


In [None]:
#Improved after using GridSearchCV

# **Classify using SVM**

In [42]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

#Calculation of Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

Accuracy: 0.9748878923766816
Precision: 0.9692307692307692
Recall: 0.84
F1 Score: 0.9
AUC: 0.9179274611398964


In [43]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'loss': ['hinge', 'squared_hinge']
}

grid_svm = GridSearchCV(LinearSVC(), param_grid, cv=5, scoring='accuracy', verbose=2)
grid_svm.fit(X_train, y_train)

print("Best Params:", grid_svm.best_params_)
best_svm = grid_svm.best_estimator_

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .................................C=0.01, loss=hinge; total time=   0.0s
[CV] END .................................C=0.01, loss=hinge; total time=   0.0s
[CV] END .................................C=0.01, loss=hinge; total time=   0.0s
[CV] END .................................C=0.01, loss=hinge; total time=   0.0s
[CV] END .................................C=0.01, loss=hinge; total time=   0.0s
[CV] END .........................C=0.01, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=0.01, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=0.01, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=0.01, loss=squared_hinge; total time=   0.0s
[CV] END .........................C=0.01, loss=squared_hinge; total time=   0.0s
[CV] END ..................................C=0.1, loss=hinge; total time=   0.0s
[CV] END ..................................C=0.1,

In [44]:
#Calculation of metrics after tuning

y_pred = best_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)

Accuracy: 0.9748878923766816
Precision: 0.9692307692307692
Recall: 0.84
F1 Score: 0.9
AUC: 0.9179274611398964


In [51]:
#Comparision of models

model=["Naive Bayes", "Logistic Regression", "SVM"]
Accuracy=[0.9766,0.9730, 0.9748]
Precision=[0.9428, 0.9687, 0.9692]
Recall=[0.88, 0.8266, 0.84]
F1_Score=[0.9103, 0.8920, 0.9]
AUC=[0.9358, 0.9112, 0.9179]

data={"Model":model, "Accuracy":Accuracy, "Precision":Precision, "Recall":Recall, "F1_Score":F1_Score, "AUC":AUC}
df=pd.DataFrame(data)
df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score,AUC
0,Naive Bayes,0.9766,0.9428,0.88,0.9103,0.9358
1,Logistic Regression,0.973,0.9687,0.8266,0.892,0.9112
2,SVM,0.9748,0.9692,0.84,0.9,0.9179


# **Predict for a new sample**

In [45]:
import pickle

# Save vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save model
with open('spam_model.pkl', 'wb') as f:
    pickle.dump(model, f)  # or best_model, svm_model, etc.

def predict_spam(text):
    # Load vectorizer & model
    import pickle
    with open('vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    with open('spam_model.pkl', 'rb') as f:
        model = pickle.load(f)

    # Preprocess new input
    cleaned = preprocess_text(text)
    vectorized = vectorizer.transform([cleaned])

    # Predict
    pred = model.predict(vectorized)[0]
    return "Spam" if pred == 1 else "Not Spam"


In [47]:
sample1 = "Get a free recharge by clicking this link now!"
sample2 = "Let's meet at 1pm for the preject"
print(predict_spam(sample1))
print(predict_spam(sample2))

Spam
Not Spam
