In [73]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier

%matplotlib inline

In [75]:
df = pd.read_csv('course_lead_scoring.csv')

## Initial EDA

In [76]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [77]:
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [78]:
# np.array(df.columns)
for col in df.columns.values:
    print(col)
    print(df[col].unique()[:5])
    print(df[col].nunique())
    print(df[col].dtype)
    print()

lead_source
['paid_ads' 'social_media' 'events' 'referral' 'organic_search']
5
object

industry
[nan 'retail' 'healthcare' 'education' 'manufacturing']
7
object

number_of_courses_viewed
[1 5 2 3 0]
10
int64

annual_income
[79450. 46992. 78796. 83843. 85012.]
1267
float64

employment_status
['unemployed' 'employed' nan 'self_employed' 'student']
4
object

location
['south_america' 'australia' 'europe' 'africa' 'middle_east']
7
object

interaction_count
[4 1 3 6 2]
12
int64

lead_score
[0.94 0.8  0.69 0.87 0.62]
101
float64

converted
[1 0]
2
int64



In [79]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [80]:
df.nunique()

lead_source                    5
industry                       7
number_of_courses_viewed      10
annual_income               1267
employment_status              4
location                       7
interaction_count             12
lead_score                   101
converted                      2
dtype: int64

In [81]:
round(df.converted.mean(), 2)

np.float64(0.62)

## Data Preparation

In [82]:
# categorical_columns = ['lead_source', 'industry', 'employment_status', 'location']
# numerical_columns = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

categorical_columns = ['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'interaction_count']
numerical_columns = ['annual_income', 'lead_score']

for c in df.columns:
    if c in categorical_columns:
        df.fillna({c: 'NA'}, inplace=True)
    elif c in numerical_columns:
        df.fillna({c: 0.0}, inplace=True)

In [83]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [84]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


## Question 1

In [85]:
df.industry.value_counts(normalize=True)

industry
retail           0.138851
finance          0.136799
other            0.135431
healthcare       0.127907
education        0.127907
technology       0.122435
manufacturing    0.119015
NA               0.091655
Name: proportion, dtype: float64

In [86]:
# Answer is 'retail'

## Question 2

In [87]:
df_feat = df.copy()
del df_feat['converted']

n_cols = len(numerical_columns)
corr = np.zeros((n_cols, n_cols))

for i, c in enumerate(numerical_columns):
    mutual = df_feat[numerical_columns].corrwith(df_feat[c])
    corr[:, i] = mutual

In [88]:
corr
# numerical_columns -> ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
# interaction_count and lead_score               ->  0.00988818
# number_of_courses_viewed and lead_score        -> -0.004879
# number_of_courses_viewed and interaction_count -> -0.02356522
# annual_income and interaction_count            ->  0.02703647 (this is the Answer, in absolute value is the greater correlation)

array([[1.        , 0.01560955],
       [0.01560955, 1.        ]])

In [89]:
mutual_info_score(df_feat['number_of_courses_viewed'], df_feat['number_of_courses_viewed'])

1.7145213380877795

In [90]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
print(len(df_train), len(df_val), len(df_test))

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

876 293 293


## Question 3

In [91]:
# We are going to use the mutual_info_score from sklearn.metrics

In [92]:
for c in categorical_columns:
    print(c)
    mutual_info = mutual_info_score(df_train[c], y_train)
    print(round(mutual_info,2))
    print()

lead_source
0.04

industry
0.01

employment_status
0.01

location
0.0

number_of_courses_viewed
0.11

interaction_count
0.08



In [93]:
# Answer is lead_source

## Question 4

In [94]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
1,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
2,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
3,,technology,1,74956.0,employed,europe,3,0.34
4,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
871,organic_search,other,1,43907.0,employed,australia,4,0.33
872,social_media,retail,3,64969.0,employed,north_america,1,0.18
873,,education,3,89042.0,employed,asia,4,0.75
874,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


In [95]:
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')

print(train_dict[:2])
print(val_dict[:2])

[{'lead_source': 'paid_ads', 'industry': 'retail', 'employment_status': 'student', 'location': 'middle_east', 'number_of_courses_viewed': 0, 'interaction_count': 5, 'annual_income': 58472.0, 'lead_score': 0.03}, {'lead_source': 'organic_search', 'industry': 'manufacturing', 'employment_status': 'student', 'location': 'middle_east', 'number_of_courses_viewed': 3, 'interaction_count': 6, 'annual_income': 71738.0, 'lead_score': 0.77}]
[{'lead_source': 'paid_ads', 'industry': 'healthcare', 'employment_status': 'unemployed', 'location': 'europe', 'number_of_courses_viewed': 3, 'interaction_count': 1, 'annual_income': 52220.0, 'lead_score': 0.07}, {'lead_source': 'organic_search', 'industry': 'technology', 'employment_status': 'unemployed', 'location': 'middle_east', 'number_of_courses_viewed': 3, 'interaction_count': 4, 'annual_income': 59656.0, 'lead_score': 0.65}]


In [96]:
dv = DictVectorizer(sparse=False) # We don't want to compress the zeros typical on Sparse Matrix

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [97]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [98]:
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [99]:
X_train.shape

(876, 31)

In [100]:
X_val.shape

(293, 31)

In [101]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [102]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [103]:
model.coef_[0] # This gives us the W (Weight vector)

array([-1.77843867e-05, -1.47154423e-02,  3.39095225e-02,  2.66248432e-03,
        1.15238518e-02, -1.02527697e-01, -2.48510995e-02,  4.93604222e-02,
       -2.01258344e-02, -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
       -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,  5.12012528e-02,
        2.01511698e-02, -1.20346284e-02, -1.16021521e-02, -1.15251880e-01,
        7.95303436e-02, -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
       -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,  5.58598769e-03,
       -3.33967159e-02, -2.52837052e-02,  4.53752887e-01])

In [104]:
model.predict(X_val) # Hard prediction (0/1)

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0])

In [105]:
model.predict_proba(X_val)[:, 1] # Soft prediction (0/1). Column 0 has negative prediction and 1 has positive prediction, how probable is going to be 1

array([0.61192163, 0.79982617, 0.53021344, 0.47131479, 0.57066132,
       0.44227169, 0.87127669, 0.84883115, 0.83290037, 0.61497801,
       0.54968027, 0.78153088, 0.69039786, 0.77017122, 0.5265944 ,
       0.91706425, 0.53170635, 0.42123049, 0.30146455, 0.84881583,
       0.79488653, 0.73670375, 0.44527211, 0.64838383, 0.4176882 ,
       0.75393418, 0.90166116, 0.33903049, 0.43181431, 0.9680681 ,
       0.92018714, 0.37487988, 0.652301  , 0.90650057, 0.75164117,
       0.64202121, 0.82250075, 0.83375553, 0.659116  , 0.30978853,
       0.78942264, 0.35546366, 0.96517758, 0.63389304, 0.51274195,
       0.53230533, 0.82287785, 0.744074  , 0.73452313, 0.68955217,
       0.46964443, 0.84539252, 0.55635243, 0.92637871, 0.65258021,
       0.61526273, 0.63816995, 0.28304018, 0.48049824, 0.57890618,
       0.35497342, 0.62175051, 0.38960778, 0.61156056, 0.85304278,
       0.75430136, 0.89185954, 0.71946459, 0.95387623, 0.89209517,
       0.75277088, 0.33850139, 0.61376593, 0.51622275, 0.64088

In [106]:
y_pred = model.predict_proba(X_val)[:,1]
prediction_converted = (y_pred >= 0.5)
prediction_converted = prediction_converted.astype('int')

In [107]:
# What accuracy did you get?

accuracy = round((prediction_converted == y_val).mean(), 4)
accuracy

# Answer ~= 0.7

np.float64(0.6997)

## Question 5

In [108]:
total = categorical_columns + numerical_columns

In [109]:
total

['lead_source',
 'industry',
 'employment_status',
 'location',
 'number_of_courses_viewed',
 'interaction_count',
 'annual_income',
 'lead_score']

In [110]:
def calc_log_regr_feature(df_train, df_val, y_train, y_val, cols):
    d = {}
    for c in cols:

        # Removing column
        df_train_aux = df_train[cols].copy()
        df_val_aux = df_val[cols].copy()
        del df_train_aux[c]
        del df_val_aux[c]
    
        # One-Hot Encoding
        train_dict = df_train_aux.to_dict(orient='records')
        val_dict = df_val_aux.to_dict(orient='records')        
        dv = DictVectorizer(sparse=False)
        X_train = dv.fit_transform(train_dict)
        X_val = dv.transform(val_dict)

        # Model
        model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict_proba(X_val)[:,1]
        prediction_converted = (y_pred >= 0.5)
        prediction_converted = prediction_converted.astype('int')
        pred = round((prediction_converted == y_val).mean(), 4)
        d[c] = pred

    return d
    
ans = calc_log_regr_feature(df_train, df_val, y_train, y_val, categorical_columns + numerical_columns)

In [111]:
for feat, accuracy_removing_feat in ans.items():
    print(f"Different accuracy removing column {feat}", round(accuracy - accuracy_removing_feat, 4))

Different accuracy removing column lead_source -0.0034
Different accuracy removing column industry 0.0
Different accuracy removing column employment_status 0.0035
Different accuracy removing column location -0.0102
Different accuracy removing column number_of_courses_viewed 0.1434
Different accuracy removing column interaction_count 0.1434
Different accuracy removing column annual_income -0.1535
Different accuracy removing column lead_score -0.0068


In [112]:
# Answer is industry, the model accuracy seems to be unmodified if we remove this feature.

## Question 6

In [113]:
def calc_log_regr_reg(df_train, df_val, y_train, y_val, cols, regs):

    # One-Hot Encoding
    train_dict = df_train.to_dict(orient='records')
    val_dict = df_val.to_dict(orient='records')        
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    d = {}
    for r in regs:

        # Model
        model = RidgeClassifier(max_iter=1000, random_state=42, alpha=r)
        model.fit(X_train, y_train)
    
        # Predict
        y_pred = model.predict(X_val)
        prediction_converted = (y_pred >= 0.5)
        prediction_converted = prediction_converted.astype('int')
        pred = round((prediction_converted == y_val).mean(), 3)
        d[r] = pred

    return d

In [114]:
ans = calc_log_regr_reg(
    df_train, 
    df_val, 
    y_train, 
    y_val, 
    categorical_columns + numerical_columns, 
    [0.01, 0.1, 1, 10, 100]
)

In [115]:
for c, accuracy_regularized in ans.items():
    print(f"Different accuracy removing column {c}", accuracy_regularized)

Different accuracy removing column 0.01 0.85
Different accuracy removing column 0.1 0.85
Different accuracy removing column 1 0.846
Different accuracy removing column 10 0.857
Different accuracy removing column 100 0.836


In [116]:
# Answer: C = 10