In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from math import sqrt
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [29]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [31]:
#Check for missing values
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [32]:
# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Replace missing values
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

In [33]:
mode_industry = df['industry'].mode()[0]
mode_industry

'retail'

In [34]:
corr = df.corr(numeric_only=True)
corr

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [35]:
corr_pairs = {
    "interaction_count and lead_score": corr.loc["interaction_count", "lead_score"],
    "number_of_courses_viewed and lead_score": corr.loc["number_of_courses_viewed", "lead_score"],
    "number_of_courses_viewed and interaction_count": corr.loc["number_of_courses_viewed", "interaction_count"],
    "annual_income and interaction_count": corr.loc["annual_income", "interaction_count"]
}

corr_pairs


{'interaction_count and lead_score': 0.009888182496913131,
 'number_of_courses_viewed and lead_score': -0.004878998354681276,
 'number_of_courses_viewed and interaction_count': -0.023565222882888037,
 'annual_income and interaction_count': 0.02703647240481443}

In [36]:
max_pair = max(corr_pairs, key=corr_pairs.get)
max_pair


'annual_income and interaction_count'

In [37]:
X = df.drop(columns=['converted'])
y = df['converted']


In [38]:
# First split (train = 60%, temp = 40%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Split temp into validation and test (20%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train.shape, X_val.shape, X_test.shape

((877, 8), (292, 8), (293, 8))

In [39]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Categorical columns from training set
cat_cols = X_train.select_dtypes(include='object').columns

# 2. One-hot encoding for categorical columns
X_train_cat = pd.get_dummies(X_train[cat_cols], drop_first=False)

# 3. Compute mutual information between each categorical feature and target
mi_scores = mutual_info_classif(
    X_train_cat, y_train, discrete_features=True, random_state=42
)

mi_df = pd.DataFrame({
    'feature': X_train_cat.columns,
    'mutual_info': mi_scores
})

mi_df['mutual_info'] = mi_df['mutual_info'].round(2)
mi_df = mi_df.sort_values(by='mutual_info', ascending=False)

mi_df.head(10)

Unnamed: 0,feature,mutual_info
3,lead_source_paid_ads,0.02
4,lead_source_referral,0.02
18,employment_status_unemployed,0.02
7,industry_education,0.01
0,lead_source_NA,0.0
15,employment_status_employed,0.0
25,location_north_america,0.0
24,location_middle_east,0.0
23,location_europe,0.0
22,location_australia,0.0


In [40]:
mi_df['original_feature'] = mi_df['feature'].str.split('_').str[0]
mi_grouped = mi_df.groupby('original_feature')['mutual_info'].mean().sort_values(ascending=False)
mi_grouped

original_feature
lead          0.006667
employment    0.004000
industry      0.001250
location      0.000000
Name: mutual_info, dtype: float64

In [41]:
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns

# 2. Preprocessing: One-hot encode categorical variables, keeping numeric as-is
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

# Using linear regression
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42
)

clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
from sklearn.metrics import accuracy_score
val_accuracy = round(accuracy_score(y_val, y_pred), 2)

val_accuracy

0.74

In [42]:
def train_model(X_tr, y_tr, X_va, y_va):
    cat_cols = X_tr.select_dtypes(include='object').columns
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ], remainder='passthrough')

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_va)
    
    return accuracy_score(y_va, y_pred)


base_acc = train_model(X_train, y_train, X_val, y_val)
print("Base accuracy:", base_acc)

acc_diff = {}

for col in X_train.columns:
    X_train_drop = X_train.drop(columns=[col])
    X_val_drop = X_val.drop(columns=[col])
    acc_drop = train_model(X_train_drop, y_train, X_val_drop, y_val)
    acc_diff[col] = base_acc - acc_drop

# Result
diff_df = pd.DataFrame(list(acc_diff.items()), columns=['feature', 'accuracy_diff'])
diff_df['accuracy_diff'] = diff_df['accuracy_diff'].round(6)
diff_df = diff_df.sort_values(by='accuracy_diff', ascending=True)

diff_df


Base accuracy: 0.7431506849315068


Unnamed: 0,feature,accuracy_diff
3,annual_income,-0.113014
4,employment_status,-0.003425
1,industry,0.0
5,location,0.0
7,lead_score,0.0
0,lead_source,0.013699
2,number_of_courses_viewed,0.065068
6,interaction_count,0.068493


In [43]:
C_values = [0.01, 0.1, 1, 10, 100]
cat_cols = X_train.select_dtypes(include='object').columns

val_acc_dict = {}

for C_val in C_values:
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ], remainder='passthrough')
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(solver='liblinear', C=C_val, max_iter=1000, random_state=42))
    ])
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    val_acc = round(accuracy_score(y_val, y_pred), 3)
    val_acc_dict[C_val] = val_acc

# Display results
val_acc_dict


{0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}