In [16]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib



In [18]:

# Fetch dataset
spambase = fetch_ucirepo(id=94)

# Access the data as pandas DataFrames
X = spambase.data.features
y = spambase.data.targets

# Convert features and targets to DataFrames
X_df = pd.DataFrame(X, columns=spambase.variables)
y_df = pd.DataFrame(y, columns=['label'])

# Data Preprocessing
print("\nData Information:")
print(X_df.info())

print("\nFirst few rows of features:")
print(X_df.head())

print("\nFirst few rows of targets:")
print(y_df.head())

# Check for missing values
print("\nMissing values in features:")
print(X_df.isnull().sum())

print("\nMissing values in targets:")
print(y_df.isnull().sum())

# Ensure data types are correct
print("\nFeature types:\n", X_df.dtypes)
print("\nTarget types:\n", y_df.dtypes)

# Convert y_train to a 1D array
y_df = y_df.values.ravel()

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Ensure X_train and y_train have the correct shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Initialize and train the model
try:
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    print("Model trained successfully.")
except Exception as e:
    print(f"Error during model training: {e}")

# Model Evaluation
try:
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print(f"Confusion Matrix:\n{conf_matrix}")
except Exception as e:
    print(f"Error during model evaluation: {e}")

# Save the model
try:
    joblib.dump(model, 'spam_classifier_model.pkl')
    print("Model saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")

# Prediction function
def predict_spam(features):
    try:
        prediction = model.predict([features])
        return "Spam" if prediction == 1 else "Ham"
    except Exception as e:
        return f"Error during prediction: {e}"

# Example usage
try:
    sample_features = X_test.iloc[0].values
    print("\nPrediction for sample features:")
    print(predict_spam(sample_features))
except Exception as e:
    print(f"Error during example prediction: {e}")



Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   (word_freq_make, Feature, Continuous, None, None, None, no)              0 non-null      float64
 1   (word_freq_address, Feature, Continuous, None, None, None, no)           0 non-null      float64
 2   (word_freq_all, Feature, Continuous, None, None, None, no)               0 non-null      float64
 3   (word_freq_3d, Feature, Continuous, None, None, None, no)                0 non-null      float64
 4   (word_freq_our, Feature, Continuous, None, None, None, no)               0 non-null      float64
 5   (word_freq_over, Feature, Continuous, None, None, None, no)              0 non-null      float64
 6   (word_freq_remove, Feature, Continuous, None, None, N