In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

df = pd.read_csv(data)

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# 2. Data prep: fill missing values
# For categorical features -> 'NA', for numerical -> 0.0
df = df.copy()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

for c in cat_cols:
    df[c] = df[c].fillna('NA')
for c in num_cols:
    df[c] = df[c].fillna(0.0)

In [5]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [6]:
# Prepare X, y and splits for Q3-Q6 (ensure converted is removed from features)
df_model = df.copy()
y = df_model['converted'].astype(int).values
X = df_model.drop(columns=['converted'])

from sklearn.model_selection import train_test_split

# Split using sklearn train_test_split seed=42 (60/20/20)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
print("\nSplit sizes -> train:", df_train.shape[0], "val:", df_val.shape[0], "test:", df_test.shape[0])



Split sizes -> train: 876 val: 293 test: 293


In [7]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [8]:
print(f"cat cols: ${cat_cols} ----- num cols: ${num_cols}")

cat cols: $['lead_source', 'industry', 'employment_status', 'location'] ----- num cols: $['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


In [9]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

dv = DictVectorizer(sparse=False)

print(type(num_cols))
num_cols.remove('converted')

train_dict = df_train[cat_cols + num_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

model = LogisticRegression()
model.fit(X_train, y_train)

<class 'list'>


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [11]:
val_dict = df_val[cat_cols + num_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).mean()

np.float64(0.8054607508532423)

In [12]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_val, y_pred)

0.8890806250599175

In [None]:
from sklearn.metrics import roc_auc_score

# ---------- Question 1: ROC AUC for numeric features ----------
# For each numeric variable, use its values as 'score' and compute AUC on training set.
numeric_features = df_train.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric features:", numeric_features)

auc_scores = {}
for f in numeric_features:
    # Use column as score
    scores = X_train[f].values
    auc_val = roc_auc_score(y_train, scores)
    # If AUC < 0.5, invert the score per instructions
    if auc_val < 0.5:
        auc_val = roc_auc_score(y_train, -scores)
    auc_scores[f] = auc_val

# Print AUCs for the candidate 4 features
candidates_q1 = ['lead_score','number_of_courses_viewed','interaction_count','annual_income']
for c in candidates_q1:
    print(f"AUC({c}) = {auc_scores.get(c):.6f}")

best_q1 = max(candidates_q1, key=lambda c: auc_scores.get(c, -1))
print("Q1 best feature (highest AUC):", best_q1, "AUC=", auc_scores[best_q1])

In [13]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

# Load and prep dataset (same as before)
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")
df = df.fillna({'industry': 'NA', 'location': 'NA', 'lead_source': 'NA', 'employment_status': 'NA'})
for c in df.select_dtypes(include=[np.number]).columns:
    df[c] = df[c].fillna(0.0)

# Split train/test/val
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
y_train = df_train.converted.values
y_val = df_val.converted.values

# DictVectorizer + LogisticRegression
dv = DictVectorizer(sparse=False)
train_dicts = df_train.drop('converted', axis=1).to_dict(orient='records')
val_dicts = df_val.drop('converted', axis=1).to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]

# AUC
auc = roc_auc_score(y_val, y_pred)
print("AUC:", round(auc, 3))

# Precision/Recall/F1 vs threshold
thresholds = np.linspace(0, 1, 101)
precisions, recalls, f1s = [], [], []
for t in thresholds:
    y_pred_bin = (y_pred >= t)
    precisions.append(precision_score(y_val, y_pred_bin))
    recalls.append(recall_score(y_val, y_pred_bin))
    f1s.append(f1_score(y_val, y_pred_bin))

best_threshold_f1 = thresholds[np.argmax(f1s)]
print("Best F1 threshold:", round(best_threshold_f1, 3))


AUC: 0.817
Best F1 threshold: 0.57


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
