In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
import chardet

In [5]:
with open("/content/spam.csv", "rb") as f:
  rawdata = f.read(1024)
encoding = chardet.detect(rawdata)["encoding"]

In [6]:
data = pd.read_csv("/content/spam.csv", encoding=encoding)

In [7]:
# Separate labels and messages
labels = data["v1"]  # Assuming "v1" column contains labels (ham/spam)
messages = data["v2"] # Assuming "v2" column contains messages

TF-IDF Vectorization

In [8]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(messages)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

Model Building

1. Naive Bayes

In [10]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


2. Logistic Regression

In [11]:
lr_model = LogisticRegression(solver='lbfgs')
lr_model.fit(X_train, y_train)


3. Support Vector Machine

In [12]:
svm_model = SVC()
svm_model.fit(X_train, y_train)


Evaluation

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_test, y_test):
  predictions = model.predict(X_test)
  accuracy = accuracy_score(y_test, predictions)
  precision = precision_score(y_test, predictions, pos_label='spam')
  recall = recall_score(y_test, predictions, pos_label='spam')
  f1 = f1_score(y_test, predictions, pos_label='spam')
  print(f"Model: {type(model).__name__}")
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-Score: {f1:.4f}")
  print("-"*40)

evaluate_model(nb_model, X_test, y_test)
evaluate_model(lr_model, X_test, y_test)
evaluate_model(svm_model, X_test, y_test)


Model: MultinomialNB
Accuracy: 0.9623
Precision: 1.0000
Recall: 0.7200
F1-Score: 0.8372
----------------------------------------
Model: LogisticRegression
Accuracy: 0.9623
Precision: 1.0000
Recall: 0.7200
F1-Score: 0.8372
----------------------------------------
Model: SVC
Accuracy: 0.9767
Precision: 1.0000
Recall: 0.8267
F1-Score: 0.9051
----------------------------------------
