In [71]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Spam/ham mail classifier: TF-IDF text features fed to a logistic regression.

# Load the dataset; the rest of the script reads its 'Category' and 'Message' columns.
data = pd.read_csv('mail_data.csv')

# A blank message cell is read back as NaN, which TfidfVectorizer rejects as an
# invalid document. Treat such rows as empty messages instead of crashing later.
# Filling (rather than dropping) keeps the row count, so the split is unaffected.
data['Message'] = data['Message'].fillna('')

# Convert labels to numerical values: spam -> 0, ham -> 1.
label_codes = {'spam': 0, 'ham': 1}
encoded_labels = data['Category'].map(label_codes)

# Series.map turns every label missing from the mapping into NaN; fail here with
# a clear message rather than with an opaque "y contains NaN" error inside fit().
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_labels = data.loc[unmapped, 'Category'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'spam' or 'ham'): {bad_labels}"
    )
data['Category'] = encoded_labels

# Split into training and test sets (15% held out; fixed seed for reproducibility).
X_train, X_test, Y_train, Y_test = train_test_split(
    data['Message'],
    data['Category'],
    random_state=2,
    test_size=0.15,
)

# Feature extraction using TfidfVectorizer: lowercase the text and drop English stop words.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vocabulary on the training data only, then transform it.
X_train_featured = feature_extraction.fit_transform(X_train)

# Transform test data with the already-fitted vectorizer (do not fit again, so the
# feature columns match those the model is trained on).
X_test_featured = feature_extraction.transform(X_test)

# Initialize and train the Logistic Regression model.
model = LogisticRegression()
model.fit(X_train_featured, Y_train)

# Evaluate on training data.
train_pred = model.predict(X_train_featured)
train_acc = accuracy_score(Y_train, train_pred)
print('Training accuracy:', train_acc)

# Evaluate on held-out test data.
test_pred = model.predict(X_test_featured)
test_acc = accuracy_score(Y_test, test_pred)
print('Test accuracy:', test_acc)


Training accuracy: 0.9700168918918919
Test accuracy: 0.9509569377990431
