In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-10 16:47:50--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-10 16:47:50 (10.4 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [3]:
# Read the downloaded CSV into the working DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='course_lead_scoring.csv')
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
# Count the missing entries in every column (isna is the canonical name of isnull).
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Show the dtype pandas inferred for each column.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [8]:
# Structural summary under a banner: df.info() prints entry count, non-null counts and dtypes.
print("\n--- df.info() ---")
df.info()

# Column names as a plain Python list.
column_names = list(df.columns)
print("\ncolumns:", column_names)


--- df.info() ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB

columns: ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']


In [9]:
# Partition the column labels by dtype: int64/float64 count as numerical,
# object (string) columns as categorical.
numeric_dtypes = ['int64', 'float64']
num_cols = df.select_dtypes(include=numeric_dtypes).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Print both groups as plain lists.
for label, cols in (("Numerical:", num_cols), ("Categorical:", cat_cols)):
    print(label, list(cols))

Numerical: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']
Categorical: ['lead_source', 'industry', 'employment_status', 'location']


In [10]:
# Impute missing values with one constant per column group:
# 0.0 for the numerical columns, the literal string 'NA' for the categorical ones.
# Assigning back into df keeps the update in place; re-running the cell is a
# no-op because nothing is left to fill.
for cols, filler in ((num_cols, 0.0), (cat_cols, 'NA')):
    df[cols] = df[cols].fillna(filler)

In [11]:
# Sanity check after imputation: the grand total of missing cells should be zero.
df.isna().sum().sum()

np.int64(0)

In [12]:
# Distribution of the target column `converted` (how many rows hold each value).
df.converted.value_counts()

converted
1    905
0    557
Name: count, dtype: int64

In [15]:
# Question 1 — most frequent value (mode) of `industry`.
# mode() returns a Series (several rows if there is a tie); take its first entry.
# Missing industries were filled with 'NA' earlier, so 'NA' competes as a regular value.
df['industry'].mode().iloc[0]

'retail'

In [28]:
# Question 2 — Correlation matrix
# Pearson correlation for every pair of numeric columns:
#   +1 → the two columns move together perfectly,
#   -1 → they move in exactly opposite directions,
#    0 → no linear relationship.
# numeric_only=True leaves the text columns out of the computation.

corr = df.corr(method='pearson', numeric_only=True)
corr

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [17]:
# Look up the coefficients for the candidate pairs listed in the question.
# Reading the numbers:
#   close to +1 → the columns rise together,
#   close to -1 → one rises while the other falls,
#   close to  0 → almost no relationship.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

for first, second in pairs:
    # Scalar lookup in the correlation matrix computed earlier.
    coefficient = corr.at[first, second]
    print(f"{first} vs {second}: {coefficient:.3f}")

interaction_count vs lead_score: 0.010
number_of_courses_viewed vs lead_score: -0.005
number_of_courses_viewed vs interaction_count: -0.024
annual_income vs interaction_count: 0.027


In [18]:
# Data split — train/val/test with a 60%/20%/20% distribution.
# Step 1: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
# Step 2: 0.25 of the remaining 80% is 20% of the full dataset → validation set.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

# Show the (rows, columns) shape of each split.
tuple(part.shape for part in (df_train, df_val, df_test))

((876, 9), (293, 9), (293, 9))

In [19]:
# Separate the target from the features so `converted` cannot leak into the model inputs.
splits = (df_train, df_val, df_test)

# Targets as NumPy arrays, one per split.
y_train, y_val, y_test = (part['converted'].values for part in splits)

# Feature frames: every column except the target.
X_train, X_val, X_test = (part.drop(columns=['converted']) for part in splits)

In [29]:
# Question 3 — mutual information between y and each categorical variable,
# computed on the training set only and rounded to 2 decimals.
# Which variable has the biggest score?
#
# Mutual information (MI) measures how much knowing a feature tells us about
# the target (y = has the client signed up or not):
#   high MI → the feature is useful for predicting y,
#   low MI  → the feature does not help much.

from sklearn.feature_selection import mutual_info_classif

cat = ['industry', 'location', 'lead_source', 'employment_status']

# Integer-encode every categorical column in one pass (.cat.codes);
# the estimator needs numbers, not strings.
X_train_cat = X_train[cat].apply(lambda column: column.astype('category').cat.codes)

# discrete_features=True: treat the integer codes as categories, not as continuous values.
mi = mutual_info_classif(X_train_cat, y_train, discrete_features=True)

# Label the scores, order them from most to least informative, then round.
mi_scores = pd.Series(mi, index=cat).sort_values(ascending=False).round(2)
mi_scores

lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

In [24]:
# Question 4 — logistic regression on all features.
# Categorical variables are included through one-hot encoding, the model is fit on
# the training set, and accuracy is reported on the validation set (2 decimals).
# The parameters are fixed so results are reproducible across scikit-learn versions:
#   LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

# One dict per row, holding categorical and numerical features together.
dicts_train = X_train.to_dict(orient='records')
dicts_val = X_val.to_dict(orient='records')

# DictVectorizer one-hot encodes string values (e.g. industry=tech becomes a
# column "industry=tech" holding 1) and passes numeric values through.
# The vocabulary is learned from the training rows only and reused for validation.
dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)
X_train_encoded = dv.transform(dicts_train)
X_val_encoded = dv.transform(dicts_val)

model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train_encoded, y_train)

# Share of validation rows whose predicted class matches the true label.
y_pred = model.predict(X_val_encoded)
acc = accuracy_score(y_val, y_pred)
round(acc, 2)

0.7

In [25]:
# Question 5 — find the least useful feature by feature elimination.
# Start from the Q4 model (same features and parameters, accuracy not rounded).
# Then drop one feature at a time, retrain, and record
#   difference = original accuracy - accuracy without the feature.
# Which of the listed features has the smallest difference?
#
# Reading the result:
#   difference near zero → removing the feature barely changed accuracy → least useful,
#   large difference (positive or negative) → removing it mattered → more important.
# In this case Industry is least useful.

# Baseline: validation accuracy of the Q4 predictions.
base_acc = accuracy_score(y_val, y_pred)
print("Base accuracy:", base_acc)

feature_to_test = ['industry', 'employment_status', 'lead_score']
diffs = {}

for f in feature_to_test:
    # Remove the candidate feature from both splits.
    X_train_sub, X_val_sub = (part.drop(columns=[f]) for part in (X_train, X_val))

    # Re-encode from scratch so the vocabulary matches the reduced feature set.
    records_train = X_train_sub.to_dict(orient='records')
    records_val = X_val_sub.to_dict(orient='records')
    dv_sub = DictVectorizer(sparse=False)
    X_train_enc_sub = dv_sub.fit_transform(records_train)
    X_val_enc_sub = dv_sub.transform(records_val)

    # Same hyper-parameters as the baseline model.
    model_sub = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000,
        random_state=42,
    ).fit(X_train_enc_sub, y_train)

    acc_sub = accuracy_score(y_val, model_sub.predict(X_val_enc_sub))
    diffs[f] = base_acc - acc_sub

diffs

Base accuracy: 0.6996587030716723


{'industry': 0.0,
 'employment_status': 0.0034129692832763903,
 'lead_score': -0.0068259385665528916}

In [26]:
# Question 6 — regularized logistic regression.
# Try C in [0.01, 0.1, 1, 10, 100] with all the features as in Q4, compute the
# validation accuracy for each (3 decimals), and report which C does best.
#
# C is the inverse of the regularization strength: a smaller C regularizes more.
# Each C is fit on the same encoded training matrix and scored on the same
# validation matrix, so the accuracies are directly comparable.
#
# NOTE: the loop rebinds `model` and `acc`; after this cell `model` is the last
# model fitted (C=100), not the Q4 model.

c_values = [0.01, 0.1, 1, 10, 100]
accuracy_by_c = {}

for C in c_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    acc = accuracy_score(y_val, model.predict(X_val_encoded))
    # Keep the rounded score so the best C is chosen on the reported precision.
    accuracy_by_c[C] = round(acc, 3)
    print(f"C={C}: {acc:.3f}")

# max() returns the first maximal key in insertion order; c_values is ascending,
# so a tie resolves to the smallest C.
best_c = max(accuracy_by_c, key=accuracy_by_c.get)
best_c

C=0.01: 0.700
C=0.1: 0.700
C=1: 0.700
C=10: 0.700
C=100: 0.700
