# Baseline Models: Logistic Regression & Decision Tree

This notebook trains two simple baseline models (Logistic Regression, Decision Tree) wrapped in pipelines with the preprocessing saved earlier. Metrics: Precision, Recall, F1, ROC-AUC. Focus: Recall for churners (positive class).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
import warnings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
print('imports ok')

imports ok


In [2]:
# Load raw data and prepare target

df = pd.read_csv('../data/Churn_Modelling.csv')
# drop RowNumber or unnamed first column if present
if 'RowNumber' in df.columns:
    df.drop(columns=['RowNumber'], inplace=True)
elif df.columns[0] == '' or str(df.columns[0]).startswith('Unnamed'):
    df.drop(df.columns[0], axis=1, inplace=True)

# detect churn column
possible = ['Exited','Churn','churn','is_churn','IsChurn']
churn_col = next((c for c in possible if c in df.columns), None)
if churn_col is None:
    raise KeyError('Could not find churn column; expected one of: ' + str(possible))
print('Using target column:', churn_col)

# Ensure binary numeric target 0/1
if df[churn_col].dtype == 'object':
    uniques = df[churn_col].unique()
    if set(uniques) == {'Yes','No'}:
        df[churn_col] = (df[churn_col] == 'Yes').astype(int)
    else:
        try:
            df[churn_col] = pd.to_numeric(df[churn_col], errors='raise')
        except Exception:
            mapping = {u: i for i, u in enumerate(uniques)}
            df[churn_col] = df[churn_col].map(mapping).astype(int)

y = df[churn_col]
X = df.drop(columns=[churn_col])
print('Shape:', X.shape)
print('Target distribution (normalized):\n', y.value_counts(normalize=True))

Using target column: Exited
Shape: (10000, 12)
Target distribution (normalized):
 Exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64


In [3]:
# Load preprocessing pipeline and split
preproc_path = 'preprocessing_pipeline.joblib'
try:
    preprocessor = joblib.load(preproc_path)
    print('Loaded preprocessing pipeline from', preproc_path)
except Exception as e:
    print('Could not load preprocessing pipeline:', e)
    raise

# Train/test split (stratify to preserve imbalance)
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
except Exception:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Train/test sizes:', X_train.shape, X_test.shape)

Loaded preprocessing pipeline from preprocessing_pipeline.joblib
Train/test sizes: (8000, 12) (2000, 12)


In [4]:
# Define baseline models wrapped in pipelines to avoid leakage
lr = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)

pipe_lr = Pipeline([('preprocessor', preprocessor), ('clf', lr)])
pipe_dt = Pipeline([('preprocessor', preprocessor), ('clf', dt)])

models = {'LogisticRegression': pipe_lr, 'DecisionTree': pipe_dt}
print('models ready:', list(models.keys()))

models ready: ['LogisticRegression', 'DecisionTree']


In [5]:
import sys, os
proj_root = os.path.abspath('..')
if proj_root not in sys.path:
    sys.path.insert(0, proj_root)

try:
    from src.evaluation import evaluate_model
    print('Imported evaluate_model from src/evaluation.py')
except Exception as e:
    print('Import failed:', e)
    print('sys.path:', sys.path)
    raise

Imported evaluate_model from src/evaluation.py


In [6]:
from src.results import save_results

results = {}
for name, m in models.items():
    results[name] = evaluate_model(name, m, X_train, y_train, X_test, y_test)

res_df = save_results(results, out_dir='src', filename='results.csv')
res_df

ValueError: A given column is not a column of the dataframe