In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [96]:
df.dtypes
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
df[categorical] = df[categorical].fillna('NA')
df[numerical] = df[numerical].fillna(0)
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [97]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [98]:
correlation_matrix = df[numerical].corr()
correlation_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [99]:
X = df[categorical + numerical]
y = df['converted']
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

In [100]:
X_full_train.reset_index(drop=True)
X_test.reset_index(drop=True)
y_full_train.reset_index(drop=True)
y_test.reset_index(drop=True)
X_train.reset_index(drop=True)
X_val.reset_index(drop=True)
y_train.reset_index(drop=True)
y_val.reset_index(drop=True)

0      0
1      1
2      0
3      0
4      0
      ..
288    0
289    1
290    0
291    1
292    0
Name: converted, Length: 293, dtype: int64

In [123]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

X_cat = X_train[categorical].copy()

for col in categorical:
    le = LabelEncoder()
    X_cat[col] = le.fit_transform(X_cat[col])

mi = mutual_info_classif(X_cat, y_train, random_state=42)
mi_scores = pd.Series(mi, index=categorical).sort_values(ascending=False)
print(mi_scores)

lead_source          0.036976
industry             0.009633
employment_status    0.000000
location             0.000000
dtype: float64


In [101]:
from sklearn.feature_extraction import DictVectorizer

X_train_df = X_train
X_val_df   = X_val
X_test_df  = X_test

train_dicts = X_train_df[categorical + numerical].to_dict(orient='records')
val_dicts   = X_val_df[categorical + numerical].to_dict(orient='records')
test_dicts  = X_test_df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train_enc = dv.fit_transform(train_dicts)
X_val_enc   = dv.transform(val_dicts)
X_test_enc  = dv.transform(test_dicts)

In [127]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)
y_pred = model.predict(X_val_enc)
baseline_acc = accuracy_score(y_val, y_pred)
baseline_acc

0.6996587030716723

In [132]:
feature_names = dv.get_feature_names_out()
results = {}

for feat in ["industry", "employment_status", "lead_score"]:
    drop_idx = [i for i, f in enumerate(feature_names) if f.startswith(feat)]

    X_train_red = np.delete(X_train_enc, drop_idx, axis=1)
    X_val_red   = np.delete(X_val_enc, drop_idx, axis=1)

    model = LogisticRegression(max_iter=5000, random_state=42)
    model.fit(X_train_red, y_train)
    
    y_pred_val = model.predict(X_val_red)
    acc = accuracy_score(y_val, y_pred_val)

    results[feat] = baseline_acc - acc

results

{'industry': -0.13993174061433444,
 'employment_status': -0.13310580204778155,
 'lead_score': -0.11945392491467577}

In [162]:
accuracies = []

for c in [0.01, 0.1, 1, 10, 100]:
    regularized_model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)

    regularized_model.fit(X_train_enc, y_train)
    y_pred_val = regularized_model.predict(X_val_enc)
    accuracy = (y_val == y_pred_val).mean()
    accuracies.append((c, accuracy))
    
best = max(accuracies, key=lambda x: x[1])
best

(0.01, np.float64(0.6996587030716723))