In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("/content/spam.csv", encoding='ISO-8859-1')

# Keep only necessary columns
df = df[['v1', 'v2']]
df.columns = ['label', 'message']  # Rename columns

# Convert labels to binary (spam = 1, good = 0)
df["label"] = df["label"].map({"spam": 1, "good": 0})

# Text preprocessing & feature extraction
vectorizer = CountVectorizer(binary=False)  # Remove stop_words='english'
X = vectorizer.fit_transform(df["message"])
y = df["label"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model with L1 regularization and class balancing
log_model = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')
log_model.fit(X_train, y_train)

# Function to classify a new message
def classify_with_logistic(message):
    message_vectorized = vectorizer.transform([message])
    prob = log_model.predict_proba(message_vectorized)
    pred = "Spam" if prob[0][1] > 0.5 else "Good"
    print(f"\nLogistic Regression Prediction for '{message}': {pred} (Confidence: {prob[0][1]:.4f})")

# Test Predictions
classify_with_logistic("You are amazing!")  # Expected: Good
classify_with_logistic("Free mobile update now available! Click on the Link!")  # Expected: Spam

# Evaluate model
y_pred = log_model.predict(X_test)
print("\nLogistic Regression Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Logistic Regression Prediction for 'You are amazing!': Good (Confidence: 0.0126)

Logistic Regression Prediction for 'Free mobile update now available! Click on the Link!': Spam (Confidence: 0.7805)

Logistic Regression Model Evaluation:
Accuracy: 0.9820627802690582
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.95      0.91      0.93       150

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

