In [2]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import os

# Spam classification walkthrough: load the CSV, validate its layout, split,
# standardise the features, fit a logistic regression and print test metrics.

# 1. Load the dataset
file_path = 'spam.csv'

# Stop early with an explicit message when the CSV is missing
if not os.path.exists(file_path):
    raise FileNotFoundError(f"'{file_path}' not found. Please make sure the file is in the correct directory.")

# Read the CSV
df = pd.read_csv(file_path)

# The label column is required; every other column is treated as a feature
if 'spam' not in df.columns:
    raise ValueError("The dataset must contain a 'spam' column as the target label (1 = spam, 0 = not spam).")

# Reject the dataset as soon as one feature column has a non-numeric dtype
feature_dtypes = df.drop(columns='spam').dtypes
if any(not pd.api.types.is_numeric_dtype(dtype) for dtype in feature_dtypes):
    raise ValueError("All features must be numeric. Please preprocess text or categorical data first.")

# Preview the dataset
print("Dataset Preview:")
print(df.head())

# 2. Prepare features and labels
X = df.drop(columns='spam')  # Features: everything except the label
y = df['spam']               # Target

# 3. Split the data (70% train / 30% test, fixed seed for reproducibility)
# NOTE(review): the split is not stratified; consider stratify=y if the
# class balance of the two partitions matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 4. Feature Scaling
# The scaler is fitted on the training partition only and then reused on the
# test partition, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Train Logistic Regression model
# max_iter is raised to 1000 to avoid convergence warnings
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# 6. Predictions
y_pred = model.predict(X_test_scaled)

# 7. Evaluation
# Each metric is computed right after its heading is printed.
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
    ("\nAccuracy Score:", accuracy_score),
):
    print(heading)
    print(metric(y_test, y_pred))


Dataset Preview:
   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_;   char_freq_(   \
0             0.00            0.00  ...          0.0