<a href="https://colab.research.google.com/github/VardaanGulati20/Spam-Detection-Model/blob/main/spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make sure the NLTK stop-word corpus is available, then keep the English list
# as a set so clean_text can do fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the raw CSV (the path points at the Colab working directory), keep only
# the two columns used below and give them descriptive names:
#   v1 -> category (the 'ham'/'spam' label), v2 -> text (the message body).
data = (
    pd.read_csv('/content/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'category', 'v2': 'text'})
)

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, stop_word_set=None):
    """Normalise one message before it is vectorised.

    The text is lower-cased, every ``string.punctuation`` character is
    removed, the result is split on whitespace, stop words are dropped and
    the remaining tokens are re-joined with single spaces.

    Args:
        text: The raw message. Any non-string value (for example ``None`` or
            the NaN pandas uses for a missing cell) is treated as an empty
            message instead of raising ``AttributeError``.
        stop_word_set: Optional collection of lower-case words to drop.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned message as a space-separated string; ``''`` when no
        tokens survive.
    """
    if not isinstance(text, str):
        return ''
    if stop_word_set is None:
        stop_word_set = stop_words

    # NOTE(review): punctuation is stripped before the stop-word lookup, so
    # any stop-word entry that itself contains punctuation (e.g. an
    # apostrophe) can never match a token -- confirm this is intended.
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(token for token in tokens if token not in stop_word_set)

# --- Feature preparation ------------------------------------------------------
# Normalise every message once; the cleaned text is what gets vectorised.
data['clean_text'] = data['text'].apply(clean_text)

# Encode the labels as integers (ham -> 0, spam -> 1). Series.map turns any
# value that is missing from the mapping into NaN, so validate the result
# before overwriting the column instead of letting NaN reach the split/model.
# This also catches re-running this fragment on labels that are already encoded.
label_codes = data['category'].map({'ham': 0, 'spam': 1})
if label_codes.isna().any():
    unexpected = sorted(data.loc[label_codes.isna(), 'category'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'category' (expected 'ham' or 'spam'): {unexpected}"
    )
data['category'] = label_codes

# Hold out 20% of the rows for testing. stratify keeps the ham/spam ratio the
# same in both splits; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data['clean_text'],
    data['category'],
    test_size=0.2,
    random_state=42,
    stratify=data['category'],
)

# TF-IDF over unigrams and bigrams, capped at 5000 features, with
# scikit-learn's built-in English stop-word list applied on top of clean_text.
# The vocabulary is fitted on the training split only and then reused,
# unchanged, to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

# --- Model training and evaluation -------------------------------------------
# class_weight='balanced' weights each class inversely to its frequency in
# y_train, so the rarer class is not drowned out by the more common one.
model = LogisticRegression(class_weight='balanced')
model.fit(x_train_tfidf, y_train)

# Score the held-out split only.
y_pred = model.predict(x_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
# sep="" stops print() from inserting its default space right after the
# heading's trailing newline; that space pushed the first line of the report
# and of the matrix one column to the right of the lines below it.
print("\nDetailed Classification Report:\n", report, sep="")
print("\nConfusion Matrix:\n", conf_matrix, sep="")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 98.21%

Detailed Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.94      0.92      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
 [[958   8]
 [ 12 137]]
