In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv


--2025-10-11 01:28:13--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-11 01:28:14 (9.87 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np





DATA PREP

In [18]:


# Load the dataset
df = pd.read_csv('course_lead_scoring.csv')

# Separate numerical and categorical features
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
cat_cols = ['lead_source', 'industry', 'employment_status', 'location']

# Handle missing values
df[cat_cols] = df[cat_cols].fillna('NA')
df[num_cols] = df[num_cols].fillna(0.0)


In [19]:
df=pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


QUESTION 1

In [22]:
df.industry.mode()

0    retail
Name: industry, dtype: object

QUESTION 2

In [24]:

# Numerical columns
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Compute correlation matrix
corr = df[num_cols].corr().abs()  # abs() → to sort by strength of correlation

# Unstack to get pairwise correlations
corr_unstacked = corr.unstack().reset_index()
corr_unstacked.columns = ['Feature_1', 'Feature_2', 'Correlation']

# Remove self correlations and duplicate pairs (A,B vs B,A)
corr_table = corr_unstacked[corr_unstacked['Feature_1'] != corr_unstacked['Feature_2']]
corr_table = corr_table.drop_duplicates(subset=['Correlation']).sort_values(by='Correlation', ascending=False)

# Reset index for clarity
corr_table = corr_table.reset_index(drop=True)

# Show result
corr_table


Unnamed: 0,Feature_1,Feature_2,Correlation
0,annual_income,interaction_count,0.048618
1,number_of_courses_viewed,annual_income,0.031551
2,number_of_courses_viewed,interaction_count,0.023565
3,interaction_count,lead_score,0.009888
4,annual_income,lead_score,0.005334
5,number_of_courses_viewed,lead_score,0.004879


DATA SPLIT

In [25]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Reset indexes
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted
y_val = df_val.converted
y_test = df_test.converted

# Drop target column from features
X_train = df_train.drop('converted', axis=1)
X_val = df_val.drop('converted', axis=1)
X_test = df_test.drop('converted', axis=1)


QUESTION 3

In [29]:


def calc_mutual_info(df, y):
    mi = {}
    for c in cat_cols:
        # Convert categorical column to numeric labels
        enc = LabelEncoder()
        encoded = enc.fit_transform(df[c])
        mi[c] = mutual_info_classif(encoded.reshape(-1, 1), y, discrete_features=True)[0]
    return mi

mi_scores = calc_mutual_info(X_train, y_train)
mi_scores = {k: round(v, 2) for k, v in mi_scores.items()}
mi_scores


{'lead_source': np.float64(0.04),
 'industry': np.float64(0.01),
 'employment_status': np.float64(0.01),
 'location': np.float64(0.0)}

QUESTION 4

In [None]:
cat_cols = ['lead_source', 'industry', 'employment_status', 'location']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

X_train[cat_cols] = X_train[cat_cols].fillna('NA')
X_val[cat_cols] = X_val[cat_cols].fillna('NA')
X_test[cat_cols] = X_test[cat_cols].fillna('NA')

X_train[num_cols] = X_train[num_cols].fillna(0.0)
X_val[num_cols] = X_val[num_cols].fillna(0.0)
X_test[num_cols] = X_test[num_cols].fillna(0.0)


In [None]:


ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[cat_cols])
X_val_ohe = ohe.transform(X_val[cat_cols])

X_train_final = np.hstack([X_train[num_cols].values, X_train_ohe])
X_val_final = np.hstack([X_val[num_cols].values, X_val_ohe])

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_final, y_train)

y_pred = model.predict(X_val_final)
acc = round(accuracy_score(y_val, y_pred), 2)
print("Validation Accuracy:", acc)


Validation Accuracy: 0.7


QUESTION 5

In [35]:
base_acc = accuracy_score(y_val, model.predict(X_val_final))
print("Base accuracy:", base_acc)

differences = {}

for feature in num_cols + cat_cols:
    cols_to_use = [c for c in num_cols + cat_cols if c != feature]
    
    X_train_sub = X_train[cols_to_use]
    X_val_sub = X_val[cols_to_use]
    
    # One-hot encode categorical subset
    ohe_sub = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_train_ohe_sub = ohe_sub.fit_transform(X_train_sub.select_dtypes('object'))
    X_val_ohe_sub = ohe_sub.transform(X_val_sub.select_dtypes('object'))
    
    X_train_final_sub = np.hstack([X_train_sub.select_dtypes('number').values, X_train_ohe_sub])
    X_val_final_sub = np.hstack([X_val_sub.select_dtypes('number').values, X_val_ohe_sub])
    
    model_sub = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_sub.fit(X_train_final_sub, y_train)
    
    acc_sub = accuracy_score(y_val, model_sub.predict(X_val_final_sub))
    differences[feature] = base_acc - acc_sub

differences


Base accuracy: 0.6996587030716723


{'number_of_courses_viewed': 0.14334470989761094,
 'annual_income': -0.15358361774744034,
 'interaction_count': 0.14334470989761094,
 'lead_score': -0.0068259385665528916,
 'lead_source': -0.0034129692832765013,
 'industry': 0.0,
 'employment_status': 0.0034129692832763903,
 'location': -0.010238907849829393}

QUESTION 6

In [38]:
C_values = [0.01, 0.1, 1, 10, 100]
acc_scores = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_final, y_train)
    acc=accuracy_score(y_val, model.predict(X_val_final))
    acc_scores[C] = acc

acc_scores


{0.01: 0.6996587030716723,
 0.1: 0.6996587030716723,
 1: 0.6996587030716723,
 10: 0.6996587030716723,
 100: 0.6996587030716723}