Select a binary classification dataset (e.g., predicting customer churn). Implement a
Logistic Regression model. Preprocess the data, including handling categorical
variables (if any). Evaluate the model's performance using metrics like accuracy,
precision, and recall.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns  # <-- 1. Import seaborn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
# We no longer need fetch_openml

# --- 1. Load and Prepare Data ---

# Pull the Titanic passenger table through seaborn's dataset loader.
df = sns.load_dataset('titanic')

# Predictor columns (seaborn's copy of the data uses lowercase names)
# and the name of the label column.
feature_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
target_col = 'survived'

# Separate the feature matrix from the label vector.
X = df.loc[:, feature_cols]
y = df.loc[:, target_col]  # per the original note: already 0/1 integers

# Quick look at the raw features: first rows, then dtypes / non-null counts.
print("--- Initial Data (X) ---", X.head(), sep="\n")
print("\n--- Data Info (Note missing values and object types) ---")
X.info()

# --- 2. Split Data ---
# Split *before* preprocessing so the test rows never influence anything that
# is later fitted on the training data (prevents leakage from the test set).
# `stratify=y` keeps the proportion of each target class the same in the
# training and test subsets; without it the class balance of the 20% hold-out
# depends on the random draw, which distorts precision and recall.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


# --- 3. Define Preprocessing Pipelines ---

# Column groups: which features go through which branch.
numeric_features = ['age', 'fare', 'sibsp', 'parch']
# pclass is stored as an integer but is deliberately routed through the
# categorical branch together with the two string-valued columns.
categorical_features = ['embarked', 'sex', 'pclass']

# Numeric branch: fill missing values with the column median, then rescale
# with StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' avoids an error at predict time if
# a category appears that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# --- 4. Combine Pipelines with ColumnTransformer ---

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


# --- 5. Create and Train the Full Model Pipeline ---

# One estimator object: preprocessing followed by logistic regression.
# Fitting it learns the preprocessing steps and the classifier together,
# from the training rows only.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])

# Fit every step of the pipeline on the training split.
print("\n--- Training the model ---")
model_pipeline.fit(X_train, y_train)
print("--- Model training complete ---")


# --- 6. Evaluate the Model ---

# Predict labels for the held-out rows; the pipeline applies the fitted
# preprocessing before the classifier.
y_pred = model_pipeline.predict(X_test)

# Headline metrics; precision and recall are computed for class 1.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)

print("\n--- Model Performance Evaluation ---")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")

# Per-class breakdown, with readable names for labels 0 and 1.
print("\n--- Full Classification Report ---")
report = classification_report(
    y_test,
    y_pred,
    target_names=['Did Not Survive (0)', 'Survived (1)'],
)
print(report)

--- Initial Data (X) ---
   pclass     sex   age  sibsp  parch     fare embarked
0       3    male  22.0      1      0   7.2500        S
1       1  female  38.0      1      0  71.2833        C
2       3  female  26.0      0      0   7.9250        S
3       1  female  35.0      1      0  53.1000        S
4       3    male  35.0      0      0   8.0500        S

--- Data Info (Note missing values and object types) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    891 non-null    int64  
 1   sex       891 non-null    object 
 2   age       714 non-null    float64
 3   sibsp     891 non-null    int64  
 4   parch     891 non-null    int64  
 5   fare      891 non-null    float64
 6   embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB

Training set size: 712
Test set size: 179

--- Training the mo