In [3]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 2: Load the dataset
# Point this path at your own copy of the dataset if it lives elsewhere.
url = '/content/spam.csv'
# NOTE(review): latin-1 is presumably needed because the file is not valid
# UTF-8 -- confirm against the actual CSV.
df = pd.read_csv(url, encoding='latin-1')

# Drop the unused 'Unnamed: N' columns and give the remaining ones descriptive
# names. errors='ignore' keeps the cell working for copies of the CSV that do
# not carry those extra columns (a plain drop would raise KeyError).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df = df.rename(columns={'v1': 'label', 'v2': 'text'})

# Show the resulting column names as a quick sanity check.
print(df.columns)

# Step 3: Data preprocessing
# Encode the string labels as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map leaves NaN for any value missing from the mapping; fail here with
# a clear message instead of letting NaN labels reach the classifiers.
if df['label'].isna().any():
    raise ValueError("'label' column contains values other than 'ham' and 'spam'")

# Step 4: Train-test split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Step 5: Feature extraction using TF-IDF
# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Train the models
# Each classifier is built with its default settings, fitted on the TF-IDF
# training matrix, and then asked to label the TF-IDF test matrix.
# fit() returns the fitted estimator, so construction and training are chained.

# Naive Bayes
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

# Logistic Regression
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

# Support Vector Machine
svm_model = SVC().fit(X_train_tfidf, y_train)
svm_pred = svm_model.predict(X_test_tfidf)

# Step 7: Evaluate the models
def evaluate_model(y_test, y_pred, model_name):
    """Print a short classification report for one model.

    Args:
        y_test: True labels of the evaluation samples.
        y_pred: Labels predicted by the model for the same samples.
        model_name: Display name printed in the report header.

    Returns:
        None. The report is written to standard output: a header line, one
        line each for accuracy, precision, recall and F1 (two decimals), and
        a closing '---' separator.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    # Compute every score before printing anything, so a failing metric does
    # not leave a half-written report behind.
    results = [(title, scorer(y_test, y_pred)) for title, scorer in scorers]

    print(f'{model_name} Model')
    for title, value in results:
        print(f'{title}: {value:.2f}')
    print('---')

# Report the metrics of every trained model against the held-out labels, in
# the order: Naive Bayes, Logistic Regression, Support Vector Machine.
for model_name, predictions in (
    ('Naive Bayes', nb_pred),
    ('Logistic Regression', lr_pred),
    ('Support Vector Machine', svm_pred),
):
    evaluate_model(y_test, predictions, model_name)


Index(['label', 'text'], dtype='object')
Naive Bayes Model
Accuracy: 0.96
Precision: 1.00
Recall: 0.72
F1 Score: 0.84
---
Logistic Regression Model
Accuracy: 0.97
Precision: 0.99
Recall: 0.75
F1 Score: 0.86
---
Support Vector Machine Model
Accuracy: 0.98
Precision: 1.00
Recall: 0.87
F1 Score: 0.93
---
