In [39]:
import pandas as pd
import numpy as np
import sklearn as sl
import seaborn as sns
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
print(pd.__version__)

2.3.1


In [59]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
print(df.shape)
print(df.describe)

(1462, 9)
<bound method NDFrame.describe of          lead_source       industry  number_of_courses_viewed  annual_income  \
0           paid_ads            NaN                         1        79450.0   
1       social_media         retail                         1        46992.0   
2             events     healthcare                         5        78796.0   
3           paid_ads         retail                         2        83843.0   
4           referral      education                         3        85012.0   
...              ...            ...                       ...            ...   
1457        referral  manufacturing                         1            NaN   
1458        referral     technology                         3        65259.0   
1459        paid_ads     technology                         1        45688.0   
1460        referral            NaN                         5        71016.0   
1461  organic_search        finance                         3        92855.0

In [5]:
for i in df:
    print([i])
    print(df[i].head().unique())
    print(df[i].nunique())

['lead_source']
['paid_ads' 'social_media' 'events' 'referral']
5
['industry']
[nan 'retail' 'healthcare' 'education']
7
['number_of_courses_viewed']
[1 5 2 3]
10
['annual_income']
[79450. 46992. 78796. 83843. 85012.]
1267
['employment_status']
['unemployed' 'employed' nan 'self_employed']
4
['location']
['south_america' 'australia' 'europe']
7
['interaction_count']
[4 1 3]
12
['lead_score']
[0.94 0.8  0.69 0.87 0.62]
101
['converted']
[1 0]
2


In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
df.annual_income = df.annual_income.fillna(0)

In [8]:
#Q1
df.industry.mode()[0]

'retail'

In [9]:
df.converted.value_counts(normalize=True) # conversion rate

converted
1    0.619015
0    0.380985
Name: proportion, dtype: float64

In [10]:
global_conversion_rate = df.converted.mean()
round(global_conversion_rate,2)

np.float64(0.62)

In [52]:
print(df.dtypes)
df_treated = df.fillna(0)
categorical_df = df_treated.select_dtypes(include = 'object')
for c in categorical_df:
    categorical_df[c] = categorical_df[c].astype(str)
numerical_df = df_treated.select_dtypes(exclude = 'object')
categorical_df.nunique()

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object


lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [53]:
#Q2
print(numerical_df.corrwith(df.interaction_count).abs())
print(numerical_df.corrwith(df.number_of_courses_viewed).abs())

number_of_courses_viewed    0.023565
annual_income               0.027036
interaction_count           1.000000
lead_score                  0.009888
converted                   0.374573
dtype: float64
number_of_courses_viewed    1.000000
annual_income               0.009770
interaction_count           0.023565
lead_score                  0.004879
converted                   0.435914
dtype: float64


In [54]:
print(numerical_df.nunique())
numerical_df = numerical_df.drop(columns = ["converted","annual_income","lead_score"])

number_of_courses_viewed      10
annual_income               1268
interaction_count             12
lead_score                   101
converted                      2
dtype: int64


In [55]:
#Q3
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_treated.converted)
mi = categorical_df.apply(mutual_info_churn_score)
print(mi)
mi_numerical = numerical_df.apply(mutual_info_churn_score)
print(mi_numerical)

lead_source          0.026574
industry             0.007267
employment_status    0.011070
location             0.001427
dtype: float64
number_of_courses_viewed    0.117433
interaction_count           0.081480
dtype: float64


In [75]:
for i in df:
    #print(i)
    #print(df[i].mode()[0])
    df[i] = df[i].fillna(df[i].mode()[0])
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [76]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 1)

In [77]:
df_train, df_val = train_test_split( df_full_train, test_size = 0.25, random_state = 1)

In [78]:
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)

In [79]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [80]:
del df_train["converted"]
del df_val["converted"]
del df_test["converted"]

In [81]:
dv = DictVectorizer(sparse = False)

In [82]:
train_dicts = df_train[['lead_source','industry','employment_status','location','number_of_courses_viewed','interaction_count']].to_dict(orient = 'records')
train_dicts[0]

{'lead_source': 'events',
 'industry': 'manufacturing',
 'employment_status': 'unemployed',
 'location': 'europe',
 'number_of_courses_viewed': 2,
 'interaction_count': 3}

In [85]:
dv.fit(train_dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [86]:
dv.get_feature_names_out(train_dicts)

array(['employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=education', 'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=other', 'industry=retail',
       'industry=technology', 'interaction_count', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [87]:
dv.transform(train_dicts)

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 1., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 2.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(876, 25))

In [88]:
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val[['lead_source','industry','employment_status','location','number_of_courses_viewed','interaction_count']].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [89]:
model = LogisticRegression()
model.fit(X_train, y_train)
model.coef_[0].round(3)

array([ 0.73 ,  0.011,  0.166, -0.905,  0.834,  0.139, -0.382, -0.056,
       -0.267, -0.251, -0.015,  0.982, -0.078, -0.045, -1.117,  1.337,
       -0.096, -0.145, -0.175, -0.106,  0.222,  0.01 , -0.206,  0.403,
        1.345])

In [90]:
# hard prediction
model.predict(X_train)
# soft prediction
model.predict_proba(X_train)

array([[0.43221491, 0.56778509],
       [0.01279966, 0.98720034],
       [0.73161614, 0.26838386],
       ...,
       [0.90505457, 0.09494543],
       [0.30800412, 0.69199588],
       [0.93138509, 0.06861491]], shape=(876, 2))

In [91]:
#Q4
y_pred = model.predict_proba(X_val)[:,1]
converted_decision = (y_pred >= 0.5)
(y_val == converted_decision).mean()

np.float64(0.8327645051194539)

In [92]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = converted_decision.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [93]:
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.030252,0,1,False
1,0.529553,1,1,True
2,0.742774,1,1,True
3,0.720121,1,0,False
4,0.863824,1,1,True
...,...,...,...,...
288,0.790102,1,1,True
289,0.379924,0,1,False
290,0.998458,1,1,True
291,0.671458,1,0,False


In [94]:
dict(zip(dv.get_feature_names_out(),model.coef_[0].round(3)))

{'employment_status=employed': np.float64(0.73),
 'employment_status=self_employed': np.float64(0.011),
 'employment_status=student': np.float64(0.166),
 'employment_status=unemployed': np.float64(-0.905),
 'industry=education': np.float64(0.834),
 'industry=finance': np.float64(0.139),
 'industry=healthcare': np.float64(-0.382),
 'industry=manufacturing': np.float64(-0.056),
 'industry=other': np.float64(-0.267),
 'industry=retail': np.float64(-0.251),
 'industry=technology': np.float64(-0.015),
 'interaction_count': np.float64(0.982),
 'lead_source=events': np.float64(-0.078),
 'lead_source=organic_search': np.float64(-0.045),
 'lead_source=paid_ads': np.float64(-1.117),
 'lead_source=referral': np.float64(1.337),
 'lead_source=social_media': np.float64(-0.096),
 'location=africa': np.float64(-0.145),
 'location=asia': np.float64(-0.175),
 'location=australia': np.float64(-0.106),
 'location=europe': np.float64(0.222),
 'location=middle_east': np.float64(0.01),
 'location=north_ameri

In [33]:
small = ['employment_status']
df_train_small = df_train[small].to_dict(orient= 'records')
dv_small = DictVectorizer(sparse = False)
dv_small.fit(df_train_small)
X_train_small = dv_small.transform(df_train_small)

In [34]:
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [35]:
w0 = model_small.intercept_[0]
w = model_small.coef_[0]
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'employment_status': np.float64(0.0),
 'employment_status=employed': np.float64(0.743),
 'employment_status=self_employed': np.float64(0.504),
 'employment_status=student': np.float64(0.697),
 'employment_status=unemployed': np.float64(-0.136)}

In [36]:
def sigmoid(z):
    return 1/ (1 + np.exp(-z))

In [40]:
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Normalize features
    ('logreg', LogisticRegression(max_iter=1000))  # Base model
])

In [41]:
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['liblinear'],  # Supports both l1 and l2
    'logreg__class_weight': [None, 'balanced']
}

In [42]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=cv,
    scoring='roc_auc',  # Use ROC-AUC for binary classification
    n_jobs=-1
)

In [43]:
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)

Best Parameters: {'logreg__C': 0.1, 'logreg__class_weight': None, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}
Best ROC-AUC Score: 0.9010443335172897


In [None]:
penalty='l1', solver='liblinear'