In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Lead-scoring dataset; the `converted` column is used later as the target.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
# Show dtypes and non-null counts per column to see which ones need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [4]:
# Number of missing values in each column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
# Columns stored with object dtype are treated as the categorical features.
categorical_columns = list(df.select_dtypes(include="object").columns)
categorical_columns

['lead_source', 'industry', 'employment_status', 'location']

In [6]:
# Numerical features: every non-categorical column except the target.
# The order is sorted on purpose: iterating a set of strings follows hash
# order, which can change between kernel restarts and would silently
# reorder the columns of the feature matrix built from this list.
numerical_columns = sorted(
    col
    for col in df.columns
    if col not in categorical_columns and col != "converted"
)
numerical_columns

['annual_income',
 'interaction_count',
 'lead_score',
 'number_of_courses_viewed']

In [7]:
# Impute missing values: 0.0 for numerical features, the literal "NA" for categorical ones.
for column in numerical_columns:
    df[column] = df[column].fillna(0.0)
for column in categorical_columns:
    df[column] = df[column].fillna("NA")

In [8]:
# Most frequent value in the `industry` column.
df["industry"].mode()

0    retail
Name: industry, dtype: object

In [9]:
# Pairwise correlation between the numerical features.
corr_matrix = df.loc[:, numerical_columns].corr()
corr_matrix
# Optional heatmap view: sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="Blues")

Unnamed: 0,annual_income,interaction_count,lead_score,number_of_courses_viewed
annual_income,1.0,0.027036,0.01561,0.00977
interaction_count,0.027036,1.0,0.009888,-0.023565
lead_score,0.01561,0.009888,1.0,-0.004879
number_of_courses_viewed,0.00977,-0.023565,-0.004879,1.0


In [10]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (20% of the full data) for validation. Both splits share one seed.
split_seed = 42
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=split_seed)
df_train, df_valid = train_test_split(df_full_train, test_size=0.25, random_state=split_seed)

In [11]:
# Detach the target from each split: `pop` returns the column and removes it
# from the frame in one step, so the feature frames no longer contain the label.
y_train = df_train.pop("converted").values
y_valid = df_valid.pop("converted").values
y_test = df_test.pop("converted").values

In [12]:
def mutual_info_convert_score(series):
    """Return the mutual information between `series` and the training target `y_train`."""
    score = mutual_info_score(series, y_train)
    return score

In [13]:
# Mutual information of each categorical feature with the target.
mi = df_train[categorical_columns].apply(mutual_info_convert_score)
# `sort_values` returns a new Series rather than sorting in place, so it has
# to be chained into the displayed expression for the ordering to be visible.
mi.sort_values(ascending=False).round(2)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [14]:
# One-hot encoder that returns a dense array; with handle_unknown="ignore",
# categories not seen during fit are encoded as all zeros instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [15]:
# Learn the category vocabulary from the training split and encode that same split.
train_categoricals = df_train[categorical_columns]
X_train_cat = ohe.fit(train_categoricals).transform(train_categoricals)

In [16]:
# Names of the generated one-hot columns (one per category of each encoded feature).
ohe.get_feature_names_out()

array(['lead_source_NA', 'lead_source_events',
       'lead_source_organic_search', 'lead_source_paid_ads',
       'lead_source_referral', 'lead_source_social_media', 'industry_NA',
       'industry_education', 'industry_finance', 'industry_healthcare',
       'industry_manufacturing', 'industry_other', 'industry_retail',
       'industry_technology', 'employment_status_NA',
       'employment_status_employed', 'employment_status_self_employed',
       'employment_status_student', 'employment_status_unemployed',
       'location_NA', 'location_africa', 'location_asia',
       'location_australia', 'location_europe', 'location_middle_east',
       'location_north_america', 'location_south_america'], dtype=object)

In [17]:
# Numerical features of the training split as a plain NumPy array.
X_train_num = df_train[numerical_columns].to_numpy()

In [18]:
# Final training matrix: numerical columns first, then the one-hot columns.
X_train = np.hstack((X_train_num, X_train_cat))

In [19]:
# L2-regularised logistic regression (liblinear solver) with a fixed seed.
# NOTE(review): C=100 here, while the leave-one-feature-out loop further down
# trains its models with C=1.0 — confirm which value is intended for the baseline.
model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)

In [20]:
# Fit on the combined numerical + one-hot training matrix.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [21]:
# Build the validation matrix with the encoder fitted on the training split
# (transform only, no refit), in the same column layout as X_train, then predict.
X_valid_num = df_valid[numerical_columns].to_numpy()
X_valid_cat = ohe.transform(df_valid[categorical_columns])
X_valid = np.hstack((X_valid_num, X_valid_cat))
y_pred = model.predict(X_valid)

In [22]:
# Share of validation rows whose predicted class matches the true label.
accuracy = np.mean(y_valid == y_pred)
accuracy

np.float64(0.6996587030716723)

In [23]:

# Leave-one-feature-out check: retrain without each feature and compare the
# validation accuracy against a baseline trained with the same settings on
# all features. The baseline is computed here rather than read from the
# notebook-level `accuracy`, which comes from a model with a different C and
# is reassigned by other cells.


def validation_accuracy(feature_names, C=1.0):
    """Fit a logistic regression on `feature_names` and return its validation accuracy.

    Parameters
    ----------
    feature_names : list of str
        Columns of `df_train` / `df_valid` to use as features.
    C : float, default 1.0
        Inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    float
        Accuracy of the fitted model on the validation split.

    The encoder and model are local, so the notebook-level `ohe` and `model`
    are not replaced by running this cell.
    """
    train_frame = df_train[feature_names]
    valid_frame = df_valid[feature_names]

    # Keep DataFrame column order for both groups; a set difference would
    # iterate in hash order, which is not stable across kernel restarts.
    num_cols = train_frame.select_dtypes(exclude="object").columns.to_list()
    cat_cols = [name for name in feature_names if name not in num_cols]

    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    train_matrix = np.column_stack(
        [train_frame[num_cols], encoder.fit_transform(train_frame[cat_cols])]
    )
    valid_matrix = np.column_stack(
        [valid_frame[num_cols], encoder.transform(valid_frame[cat_cols])]
    )

    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    classifier.fit(train_matrix, y_train)
    return accuracy_score(y_valid, classifier.predict(valid_matrix))


all_features = df_train.columns.to_list()
baseline_accuracy = validation_accuracy(all_features)

for feature in all_features:
    remaining = [name for name in all_features if name != feature]
    score = validation_accuracy(remaining)
    print(f"score without {feature}")
    print(score)
    print(f"difference {baseline_accuracy - score}\n")

score without lead_source
0.7030716723549488
difference -0.0034129692832765013

score without industry
0.6996587030716723
difference 0.0

score without number_of_courses_viewed
0.5563139931740614
difference 0.14334470989761094

score without annual_income
0.8532423208191127
difference -0.15358361774744034

score without employment_status
0.6962457337883959
difference 0.0034129692832763903

score without location
0.7098976109215017
difference -0.010238907849829393

score without interaction_count
0.5563139931740614
difference 0.14334470989761094

score without lead_score
0.7064846416382252
difference -0.0068259385665528916



In [24]:
# Sweep the inverse regularisation strength C and report validation accuracy
# for each value, reusing the X_train / X_valid matrices built earlier.
# NOTE(review): the recorded accuracy is identical for every C — worth
# confirming whether the unscaled numerical features are the reason.
c_grid = (0.01, 0.1, 1, 10, 100)
for c in c_grid:
    model = LogisticRegression(C=c, solver='liblinear', random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    accuracy = np.mean(y_valid == y_pred)
    print(f"value of c : {c}, accuracy : {accuracy}")

value of c : 0.01, accuracy : 0.6996587030716723
value of c : 0.1, accuracy : 0.6996587030716723
value of c : 1, accuracy : 0.6996587030716723
value of c : 10, accuracy : 0.6996587030716723
value of c : 100, accuracy : 0.6996587030716723
