In [189]:
import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [190]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [191]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
value = ['converted']
df[categorical] = df[categorical].fillna('NA')
df[numerical] = df[numerical].fillna(0.0)

In [192]:
from sklearn.model_selection import train_test_split

X = df[categorical + numerical]
y = df[value]

X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, random_state=1)

In [193]:
X_full_train.reset_index(drop=True)
X_test.reset_index(drop=True)
y_full_train.reset_index(drop=True)
y_test.reset_index(drop=True)
X_train.reset_index(drop=True)
X_val.reset_index(drop=True)
y_train.reset_index(drop=True)
y_val.reset_index(drop=True)

Unnamed: 0,converted
0,1
1,1
2,1
3,0
4,1
...,...
229,1
230,1
231,0
232,1


In [194]:
from sklearn.metrics import roc_auc_score

results = []

for col in numerical:
    scores = X_full_train_df[col].values
    auc = roc_auc_score(y_full_train, scores)
    
    if auc < 0.5:
        scores = -scores
        auc = roc_auc_score(y_full_train, scores)
    
    results.append((col, auc))

results

[('number_of_courses_viewed', 0.7578540402328948),
 ('annual_income', 0.5533332306460059),
 ('interaction_count', 0.7276793690890606),
 ('lead_score', 0.6163611718677798)]

In [195]:
from sklearn.feature_extraction import DictVectorizer

X_train_df = X_train
X_val_df   = X_val
X_test_df  = X_test

train_dicts = X_train_df[categorical + numerical].to_dict(orient='records')
val_dicts   = X_val_df[categorical + numerical].to_dict(orient='records')
test_dicts  = X_test_df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train_enc = dv.fit_transform(train_dicts)
X_val_enc   = dv.transform(val_dicts)
X_test_enc  = dv.transform(test_dicts)

In [196]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

y_train = y_train.squeeze()
y_val = y_val.squeeze()

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train_enc, y_train)
y_pred = model.predict(X_val_enc)

In [197]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_val, y_pred)

0.6568525273579989

In [198]:
from sklearn.metrics import recall_score, precision_score

thresholds = np.arange(0, 1.01, 0.01)
y_proba = model.predict_proba(X_val_enc)[:, 1]

for t in thresholds:
    y_pred = (y_proba >= t).astype(int)
    recall = recall_score(y_val, y_pred, zero_division=0)
    precision = precision_score(y_val, y_pred, zero_division=0)
    if recall - precision < 0.01:
        print(t)
        break

0.64


In [199]:
max = 0
max_t = 0

for t in thresholds:
    y_pred = (y_proba >= t).astype(int)
    recall = recall_score(y_val, y_pred, zero_division=0)
    precision = precision_score(y_val, y_pred, zero_division=0)
    if recall != 0 or precision != 0:
        f1 = 2 * recall * precision / (recall + precision)
        if f1 > max:
            max = f1
            max_t = t
max_t

np.float64(0.58)

In [200]:
from sklearn.model_selection import KFold

X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_full_train.reset_index(drop=True)
X_test.reset_index(drop=True)
y_full_train.reset_index(drop=True)
y_test.reset_index(drop=True)

X_full_train_df = X_full_train
X_test_df = X_test

full_train_dicts = X_full_train_df[categorical + numerical].to_dict(orient='records')
test_dicts  = X_test_df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_full_train_enc = dv.fit_transform(full_train_dicts)
X_test_enc  = dv.transform(test_dicts)

kf = KFold(n_splits=5, shuffle=True, random_state=1)
roc_scores = []

for train_index, test_index in kf.split(X_full_train_enc):
    X_fold_train = X_full_train_enc[train_index]
    X_fold_test = X_full_train_enc[test_index]
    y_fold_train = y_full_train.iloc[train_index].squeeze()
    y_fold_test  = y_full_train.iloc[test_index].squeeze()
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model.fit(X_fold_train, y_fold_train)
    y_proba = model.predict_proba(X_fold_test)[:, 1]
    roc_scores.append(roc_auc_score(y_fold_test, y_proba))
roc_scores = np.asarray(roc_scores, dtype=float)
roc_scores.std()

np.float64(0.03580711942905165)

In [201]:
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=1)
results = []

for c in [0.000001, 0.001, 1]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000)
    scores = cross_val_score(model, X_full_train_enc, y_full_train.squeeze(), cv=kf, scoring='roc_auc')
    
    mean_score = round(np.mean(scores), 3)
    std_score = round(np.std(scores), 3)
    results.append((c, mean_score, std_score))
results

[(1e-06, np.float64(0.56), np.float64(0.024)),
 (0.001, np.float64(0.867), np.float64(0.029)),
 (1, np.float64(0.822), np.float64(0.036))]