# Module 4 — Evaluation Homework (2025 cohort, updated)

Lead scoring dataset (`course_lead_scoring.csv`) with target `converted`.
This notebook mirrors the new homework text exactly and prints the values you need to map to the multiple-choice options.


## 1. Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score

# Render floats in pandas output with six fixed decimal places.
pd.set_option('display.float_format', '{:.6f}'.format)


## 2. Load dataset and basic prep

- Source: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
- Target: `converted` (already numeric).
- Imputation per brief: categorical → 'NA'; numerical → 0.0


In [2]:
URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
TARGET = 'converted'

df_raw = pd.read_csv(URL)

# Every column except the target is a feature; bucket the features by dtype
# in a single pass so both lists keep the original column order.
feature_cols = [name for name in df_raw.columns if name != TARGET]
num_cols, cat_cols = [], []
for name in feature_cols:
    bucket = num_cols if pd.api.types.is_numeric_dtype(df_raw[name]) else cat_cols
    bucket.append(name)

# Imputation rules: categorical gaps become the string 'NA', numeric gaps 0.0.
# `assign` returns a new frame, so `df_raw` itself is left unmodified.
df = df_raw.assign(
    **{name: df_raw[name].astype('object').fillna('NA') for name in cat_cols},
    **{name: df_raw[name].astype('float64').fillna(0.0) for name in num_cols},
)

num_cols, cat_cols, df[TARGET].value_counts(normalize=True)


(['number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score'],
 ['lead_source', 'industry', 'employment_status', 'location'],
 converted
 1   0.619015
 0   0.380985
 Name: proportion, dtype: float64)

## 3. Split train/val/test (60/20/20, random_state=1)
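The split is not stratified (plain `train_test_split` with `random_state=1`), so class proportions may differ slightly between the parts.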


In [3]:
# Feature frame and integer target vector.
X = df.loc[:, feature_cols].copy()
y = df[TARGET].astype('int64').to_numpy()

# Plain (non-stratified) random splits, both seeded with random_state=1:
# first hold out 40% of the rows, then halve that hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

tuple(len(part) for part in (X_train, X_val, X_test))


(877, 292, 293)

## 4. Q1 — ROC AUC feature importance (numeric only)
Treat each numeric feature as a score vs `converted` on the training set; if AUC < 0.5, negate the feature.


In [4]:
def auc_for_numeric(series, y_true):
    """Return the ROC AUC obtained by using a numeric column directly as the score.

    The column values are cast to float64 and scored against ``y_true``.
    If that AUC is below 0.5, the AUC of the negated values is returned instead.
    """
    scores = np.asarray(series.values, dtype='float64')
    raw_auc = roc_auc_score(y_true, scores)
    if raw_auc < 0.5:
        # The feature ranks the classes in reverse; score its negation instead.
        return float(roc_auc_score(y_true, -scores))
    return float(raw_auc)

# AUC of every numeric feature against the training target.
aucs = {}
for col in num_cols:
    aucs[col] = auc_for_numeric(X_train[col], y_train)

# Restrict to the four features the question asks about, in that order;
# a feature that was not scored shows up as NaN.
asked = ['lead_score','number_of_courses_viewed','interaction_count','annual_income']
aucs_filtered = {name: (aucs[name] if name in aucs else np.nan) for name in asked}
aucs, aucs_filtered


({'number_of_courses_viewed': 0.7652439024390244,
  'annual_income': 0.5446354552990968,
  'interaction_count': 0.7271914132379249,
  'lead_score': 0.6111168681007025},
 {'lead_score': 0.6111168681007025,
  'number_of_courses_viewed': 0.7652439024390244,
  'interaction_count': 0.7271914132379249,
  'annual_income': 0.5446354552990968})

## 5. Q2 — Logistic Regression (DictVectorizer OHE)
Fit on train, evaluate AUC on validation.


In [5]:
# Encode the training rows; the vectorizer is fitted on train only.
dv = DictVectorizer(sparse=False)
train_dicts = X_train.to_dict(orient='records')
X_train_dv = dv.fit_transform(train_dicts)

# Fit the classifier on the encoded training matrix.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=1)
model.fit(X_train_dv, y_train)

# Encode validation rows with the already-fitted vectorizer and take
# column 1 of predict_proba as the score.
val_dicts = X_val.to_dict(orient='records')
X_val_dv = dv.transform(val_dicts)
val_scores = model.predict_proba(X_val_dv)[:, 1]

val_auc = roc_auc_score(y_val, val_scores)
val_auc, round(val_auc, 3)


(0.7944791666666666, 0.794)

## 6. Q3 — Precision and Recall vs threshold
We scan thresholds 0.00–1.00 in steps of 0.01 and find where precision and recall are equal (closest intersection).


In [6]:
# Scan thresholds 0.00..1.00 (101 points) and record precision/recall at each.
thresholds = np.linspace(0, 1, 101)
precisions, recalls = [], []
has_positive = []  # whether anything is predicted positive at this threshold
for t in thresholds:
    y_bin = (val_scores >= t).astype('int64')
    p = precision_score(y_val, y_bin, zero_division=0)
    r = recall_score(y_val, y_bin, zero_division=0)
    precisions.append(p)
    recalls.append(r)
    has_positive.append(bool(y_bin.any()))

# Find the threshold where |precision - recall| is smallest.
# Thresholds with no predicted positives are excluded: there precision is
# forced to 0 by zero_division=0 and recall is 0, which yields a spurious
# gap of exactly 0 that would otherwise beat any real, non-exact crossing.
gap = np.abs(np.array(precisions) - np.array(recalls))
gap[~np.array(has_positive)] = np.inf

idx = int(np.argmin(gap))
th_intersection = float(thresholds[idx])
th_intersection, precisions[idx], recalls[idx]


(0.59, 0.8072916666666666, 0.8072916666666666)

## 7. Q4 — F1 vs threshold
Compute F1 across thresholds and pick the max.


In [7]:
def f1(p, r):
    """Return the F1 score (harmonic mean) of precision ``p`` and recall ``r``.

    Returns 0.0 when ``p + r`` is zero, avoiding a division by zero.
    """
    if (p + r) == 0:
        return 0.0
    return 2 * p * r / (p + r)

# F1 at every scanned threshold, paired up from the precision/recall lists.
f1s = list(map(f1, precisions, recalls))

# argmax returns the first maximum, i.e. the lowest threshold among ties.
idx_f1 = int(np.argmax(f1s))
th_best_f1, max_f1 = float(thresholds[idx_f1]), float(f1s[idx_f1])
th_best_f1, max_f1


(0.47000000000000003, 0.8484848484848485)

## 8. Q5 — 5-Fold CV (AUC std)
We build `df_full_train = train ∪ val` and compute AUC on 5 folds; report standard deviation.


In [8]:
# Recombine the train and validation parts (target re-attached) for cross-validation.
df_full_train = pd.concat(
    [X_train.assign(converted=y_train), X_val.assign(converted=y_val)],
    ignore_index=True,
)

features = feature_cols
kf = KFold(n_splits=5, shuffle=True, random_state=1)


def _fold_auc(df_tr, df_va):
    """Fit on one training fold and return the ROC AUC on its validation fold."""
    # The vectorizer is refitted per fold, on that fold's training rows only.
    dv_cv = DictVectorizer(sparse=False)
    X_tr_dv = dv_cv.fit_transform(df_tr[features].to_dict(orient='records'))
    X_va_dv = dv_cv.transform(df_va[features].to_dict(orient='records'))

    m = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=1)
    m.fit(X_tr_dv, df_tr[TARGET].values)
    return roc_auc_score(df_va[TARGET].values, m.predict_proba(X_va_dv)[:, 1])


auc_scores = [
    _fold_auc(df_full_train.iloc[tr_idx], df_full_train.iloc[va_idx])
    for tr_idx, va_idx in kf.split(df_full_train)
]

# np.std uses its default here (population standard deviation, ddof=0).
auc_std = float(np.std(auc_scores))
auc_scores, auc_std, round(auc_std, 3)


([0.806680393502631,
  0.8067501795260512,
  0.8648193508879363,
  0.8334380892520429,
  0.8153846153846154],
 0.021986552473681004,
 0.022)

## 9. Q6 — Hyperparameter Tuning (5-Fold CV)
Try C in `[1e-6, 1e-3, 1]`; compute mean/std AUC and select best C per rules (best mean → smallest std → smallest C).


In [9]:
C_grid = [0.000001, 0.001, 1]


def _cv_auc_scores(C):
    """Return the per-fold validation AUCs of a logistic regression with the given C."""
    fold_aucs = []
    for tr_idx, va_idx in kf.split(df_full_train):
        df_tr, df_va = df_full_train.iloc[tr_idx], df_full_train.iloc[va_idx]

        # Refit the vectorizer on each fold's training rows only.
        dv_cv = DictVectorizer(sparse=False)
        X_tr_dv = dv_cv.fit_transform(df_tr[features].to_dict(orient='records'))
        X_va_dv = dv_cv.transform(df_va[features].to_dict(orient='records'))

        m = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=1)
        m.fit(X_tr_dv, df_tr[TARGET].values)
        fold_aucs.append(roc_auc_score(df_va[TARGET].values, m.predict_proba(X_va_dv)[:, 1]))
    return fold_aucs


# Mean and std of the fold AUCs per C, rounded to 3 decimals before comparison.
results = {}
for C in C_grid:
    scores = _cv_auc_scores(C)
    results[C] = {
        'mean': round(float(np.mean(scores)), 3),
        'std': round(float(np.std(scores)), 3),
    }

# Selection rule as one ordering: highest mean, then lowest std, then smallest C.
best_C = min(results, key=lambda c: (-results[c]['mean'], results[c]['std'], c))
results, best_C


({1e-06: {'mean': 0.543, 'std': 0.025},
  0.001: {'mean': 0.864, 'std': 0.014},
  1: {'mean': 0.825, 'std': 0.022}},
 0.001)

## 10. Summary for submission

In [10]:
# Collect the answer for each question in one place, in question order.
summary = dict(
    Q1_numeric_auc_filtered=aucs_filtered,
    Q2_val_auc=round(val_auc, 3),
    Q3_threshold_intersection=th_intersection,
    Q4_threshold_best_f1=th_best_f1,
    Q5_auc_std_5fold=round(auc_std, 3),
    Q6_results=results,
    Q6_best_C=best_C,
)
summary


{'Q1_numeric_auc_filtered': {'lead_score': 0.6111168681007025,
  'number_of_courses_viewed': 0.7652439024390244,
  'interaction_count': 0.7271914132379249,
  'annual_income': 0.5446354552990968},
 'Q2_val_auc': 0.794,
 'Q3_threshold_intersection': 0.59,
 'Q4_threshold_best_f1': 0.47000000000000003,
 'Q5_auc_std_5fold': 0.022,
 'Q6_results': {1e-06: {'mean': 0.543, 'std': 0.025},
  0.001: {'mean': 0.864, 'std': 0.014},
  1: {'mean': 0.825, 'std': 0.022}},
 'Q6_best_C': 0.001}