In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [5]:
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
for x in list(df.dtypes.index):
    if x in categorical_columns:
        df[x] = df[x].fillna('NA')
    else:
        df[x] = df[x].fillna(0.0)

In [8]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

# Question 1

In [14]:
df.industry.mode()

0    retail
Name: industry, dtype: object

# Question 2

In [15]:
numeric_df = df.select_dtypes(include='number')

In [18]:
correlation_matrix = numeric_df.corr()
print(correlation_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


# Question 3

In [9]:
from sklearn.model_selection import train_test_split

In [39]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [21]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [40]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [35]:
from sklearn.metrics import mutual_info_score

In [44]:
mi = []
for c in categorical_columns:
    m = mutual_info_score(df_train[c], y_train)
    print(c, m)
    mi.append(m)

lead_source 0.03539624379726594
industry 0.011574521435657112
employment_status 0.012937677269442782
location 0.004464157884038034


In [43]:
mi.sort(reverse=True)
print(mi)

[0.03539624379726594, 0.012937677269442782, 0.011574521435657112, 0.004464157884038034]


# Question 4

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [57]:
encoder.fit(df_train)
encoder.categories_[:3]

[array(['NA', 'events', 'organic_search', 'paid_ads', 'referral',
        'social_media'], dtype=object),
 array(['NA', 'education', 'finance', 'healthcare', 'manufacturing',
        'other', 'retail', 'technology'], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8])]

In [43]:
X_train = encoder.transform(df_train)
X_valid = encoder.transform(df_val)

In [18]:
from sklearn.linear_model import LogisticRegression

In [44]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [45]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [61]:
model.intercept_[0]

np.float64(1.234334273116103)

In [54]:
w = model.coef_[0]
w.round(3)[:10]

array([ 0.581,  0.157,  0.127, -1.109,  1.641, -0.161, -0.271,  1.29 ,
        0.04 ,  0.048])

In [46]:
y_pred = model.predict(X_valid)
y_pred

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0])

In [49]:
round((y_val == y_pred).mean(), 2)

np.float64(0.82)

In [52]:
full_acc = (y_val == y_pred).mean()
full_acc

np.float64(0.8191126279863481)

# Question 5
Feature elimination technique

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [75]:
dv = DictVectorizer(sparse=False)

In [76]:
train_dict = df_train.to_dict(orient='records')
train_dv = dv.fit(train_dict)

In [85]:
dict(zip(dv.get_feature_names_out(), w.round(3)))

{'annual_income': np.float64(0.581),
 'employment_status=NA': np.float64(0.157),
 'employment_status=employed': np.float64(0.127),
 'employment_status=self_employed': np.float64(-1.109),
 'employment_status=student': np.float64(1.641),
 'employment_status=unemployed': np.float64(-0.161),
 'industry=NA': np.float64(-0.271),
 'industry=education': np.float64(1.29),
 'industry=finance': np.float64(0.04),
 'industry=healthcare': np.float64(0.048),
 'industry=manufacturing': np.float64(0.248),
 'industry=other': np.float64(-0.026),
 'industry=retail': np.float64(-0.228),
 'industry=technology': np.float64(0.133),
 'interaction_count': np.float64(-2.969),
 'lead_score': np.float64(-1.77),
 'lead_source=NA': np.float64(-0.851),
 'lead_source=events': np.float64(0.553),
 'lead_source=organic_search': np.float64(1.466),
 'lead_source=paid_ads': np.float64(2.24),
 'lead_source=referral': np.float64(1.563),
 'lead_source=social_media': np.float64(0.652),
 'location=NA': np.float64(0.351),
 'locat

## Industry

In [10]:
df_ind = df.drop('industry', axis=1)

In [11]:
df_ind_full_train, df_ind_test = train_test_split(df_ind, test_size=0.2, random_state=42)
df_ind_train, df_ind_val = train_test_split(df_ind_full_train, test_size=0.25, random_state=42)

In [13]:
y_ind_train = df_ind_train.converted.values
y_ind_val = df_ind_val.converted.values
y_ind_test = df_ind_test.converted.values

del df_ind_train['converted']
del df_ind_val['converted']
del df_ind_test['converted']

In [16]:
encoder.fit(df_ind_train)

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [17]:
X_ind_train = encoder.transform(df_ind_train)
X_ind_valid = encoder.transform(df_ind_val)

In [22]:
model_ind = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [23]:
model_ind.fit(X_ind_train, y_ind_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [32]:
y_ind_pred = model_ind.predict(X_ind_valid)

In [33]:
(y_ind_val == y_ind_pred).mean()

np.float64(0.7986348122866894)

## Employment

In [35]:
df_em = df.drop('employment_status', axis=1)

df_em_full_train, df_em_test = train_test_split(df_em, test_size=0.2, random_state=42)
df_em_train, df_em_val = train_test_split(df_em_full_train, test_size=0.25, random_state=42)

y_em_train = df_em_train.converted.values
y_em_val = df_em_val.converted.values
y_em_test = df_em_test.converted.values

del df_em_train['converted']
del df_em_val['converted']
del df_em_test['converted']

encoder.fit(df_em_train)

X_em_train = encoder.transform(df_em_train)
X_em_valid = encoder.transform(df_em_val)

model_em = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model_em.fit(X_em_train, y_em_train)

y_em_pred = model_em.predict(X_em_valid)

(y_em_val == y_em_pred).mean()

np.float64(0.8088737201365188)

## lead_score

In [53]:
df_ls = df.drop('lead_score', axis=1)

df_ls_full_train, df_ls_test = train_test_split(df_ls, test_size=0.2, random_state=42)
df_ls_train, df_ls_val = train_test_split(df_ls_full_train, test_size=0.25, random_state=42)

y_ls_train = df_ls_train.converted.values
y_ls_val = df_ls_val.converted.values
y_ls_test = df_ls_test.converted.values

del df_ls_train['converted']
del df_ls_val['converted']
del df_ls_test['converted']

encoder.fit(df_ls_train)

X_ls_train = encoder.transform(df_ls_train)
X_ls_valid = encoder.transform(df_ls_val)

model_ls = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model_ls.fit(X_ls_train, y_ls_train)

y_ls_pred = model_ls.predict(X_ls_valid)

(y_ls_val == y_ls_pred).mean()

np.float64(0.8225255972696246)

# Question 6
Regularization

In [59]:
for r in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C = r, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    print((y_val == y_pred).mean())

0.6348122866894198
0.8020477815699659
0.8191126279863481
0.7986348122866894
0.8088737201365188
