<a href="https://colab.research.google.com/github/Thibault13320/machine-learning-zoomcamp/blob/main/03-classification/homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, mutual_info_score

In [2]:
# Download the dataset
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-11 11:11:58--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-11 11:11:59 (1.65 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [3]:
# Load the dataset from the CSV saved in the working directory
df = pd.read_csv('course_lead_scoring.csv')

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

Dataset shape: (1462, 9)

First few rows:
    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  

Dataset info:
<class 'pandas.core.frame.DataFrame'

In [4]:
# -----------------------------------------------------------------------------
# Section: data preparation
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "DATA PREPARATION", "=" * 80, sep="\n")



DATA PREPARATION


In [5]:
# Count the missing entries in every column
print("\nMissing values per column:", df.isnull().sum(), sep="\n")


Missing values per column:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [6]:
# Split the column names by dtype: object-typed columns go into the
# categorical list, numeric-typed columns into the numerical list
categorical_cols = list(df.select_dtypes(include='object').columns)
numerical_cols = list(df.select_dtypes(include='number').columns)

In [7]:
# The target column 'converted' must not appear in either feature list
for feature_list in (categorical_cols, numerical_cols):
    if 'converted' in feature_list:
        feature_list.remove('converted')

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")


Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [8]:
# Handle missing values
# Categorical features: every missing entry becomes the literal string 'NA'
df[categorical_cols] = df[categorical_cols].fillna('NA')

In [9]:
# Numerical features: every missing entry becomes 0.0
df[numerical_cols] = df[numerical_cols].fillna(0.0)

# Total number of missing cells left in the whole frame
remaining_missing = df.isnull().sum().sum()
print("\nMissing values after imputation:")
print(remaining_missing)



Missing values after imputation:
0


In [10]:
# -----------------------------------------------------------------------------
# Question 1 - most frequent observation (mode) of 'industry'
# -----------------------------------------------------------------------------

print("\n" + "=" * 80, "QUESTION 1", "=" * 80, sep="\n")

# mode() returns the modal value(s) as a Series; keep the first entry
industry_modes = df['industry'].mode()
mode_industry = industry_modes.iloc[0]
print(f"\nMost frequent observation (mode) for 'industry': {mode_industry}")


QUESTION 1

Most frequent observation (mode) for 'industry': retail


In [11]:
# Frequency table of 'industry', printed to cross-check the mode
print("\nIndustry value counts:", df['industry'].value_counts(), sep="\n")


Industry value counts:
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64


In [12]:
# -----------------------------------------------------------------------------
# Question 2 - correlation matrix of the numerical features
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "QUESTION 2", "=" * 80, sep="\n")



QUESTION 2


In [13]:
# Pairwise correlation between the numerical feature columns
numerical_frame = df[numerical_cols]
corr_matrix = numerical_frame.corr()
print("\nCorrelation matrix:", corr_matrix, sep="\n")


Correlation matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  


In [14]:
# Compare the correlation of a fixed set of feature pairs and report the pair
# whose correlation has the largest absolute value.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

print("\nCorrelation coefficients for specific pairs:")
# Start below any attainable |r| so that a pair is still selected when every
# correlation is exactly 0 (a start value of 0 with a strict '>' would leave
# max_pair unset in that case).
max_corr = -1.0
max_pair = None

for feat1, feat2 in pairs:
    if feat1 not in numerical_cols or feat2 not in numerical_cols:
        # Make the skip visible instead of dropping the pair silently.
        print(f"Skipping {feat1} and {feat2}: not both numerical columns")
        continue

    corr_val = corr_matrix.loc[feat1, feat2]
    print(f"{feat1} and {feat2}: {corr_val:.4f}")
    # Strict '>' keeps the first pair on ties; a NaN correlation never
    # compares greater and is therefore never selected.
    if abs(corr_val) > max_corr:
        max_corr = abs(corr_val)
        max_pair = (feat1, feat2)

if max_pair is None:
    # Fail with an explicit message rather than a TypeError on max_pair[0].
    raise ValueError("None of the requested feature pairs has a usable correlation value")

print(f"\nPair with biggest correlation: {max_pair[0]} and {max_pair[1]} ({max_corr:.4f})")


Correlation coefficients for specific pairs:
interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: -0.0049
number_of_courses_viewed and interaction_count: -0.0236
annual_income and interaction_count: 0.0270

Pair with biggest correlation: annual_income and interaction_count (0.0270)


In [15]:
# -----------------------------------------------------------------------------
# Section: split the data
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "DATA SPLITTING", "=" * 80, sep="\n")


DATA SPLITTING


In [16]:
# First split: hold out 40% of the rows; the other 60% becomes the training set
df_train_full, df_temp = train_test_split(
    df,
    test_size=0.4,
    random_state=42,
)


In [17]:
# Second split: halve the 40% hold-out into validation and test parts
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
)

# Report each part's row count and its share of the full dataset
n_total = len(df)
n_train, n_val, n_test = len(df_train_full), len(df_val), len(df_test)
print(f"\nTrain size: {n_train} ({n_train/n_total*100:.1f}%)")
print(f"Validation size: {n_val} ({n_val/n_total*100:.1f}%)")
print(f"Test size: {n_test} ({n_test/n_total*100:.1f}%)")


Train size: 877 (60.0%)
Validation size: 292 (20.0%)
Test size: 293 (20.0%)


In [18]:
# Replace the row labels carried over from df with a fresh 0..n-1 index
df_train_full, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train_full, df_val, df_test)
)

In [19]:
# Pull the 'converted' target column out of each split as an array
y_train, y_val, y_test = (
    part['converted'].values for part in (df_train_full, df_val, df_test)
)

In [20]:
# Feature frames: every column of each split except the target
df_train = df_train_full.drop('converted', axis=1)
df_val_features = df_val.drop('converted', axis=1)
df_test_features = df_test.drop('converted', axis=1)

# Class counts of the training target, as a {label: count} dict
train_target_counts = pd.Series(y_train).value_counts().to_dict()
print(f"\nTarget distribution in train: {train_target_counts}")


Target distribution in train: {1: 535, 0: 342}


In [21]:
# -----------------------------------------------------------------------------
# Question 3 - mutual information score
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "QUESTION 3", "=" * 80, sep="\n")


QUESTION 3


In [22]:
# Mutual information between the training target and each categorical feature.
# Ranking uses the unrounded scores: rounding to 2 decimals first can make
# distinct scores tie, and the "highest" feature would then be decided by
# column order instead of by the actual score.
mi_scores_raw = {
    col: mutual_info_score(y_train, df_train[col]) for col in categorical_cols
}

# Rounded copy, used for display only
mi_scores = {col: round(score, 2) for col, score in mi_scores_raw.items()}

print("\nMutual information scores:")
for col in sorted(mi_scores_raw, key=mi_scores_raw.get, reverse=True):
    print(f"{col}: {mi_scores[col]}")

max_mi_feature = max(mi_scores_raw, key=mi_scores_raw.get)
print(f"\nFeature with highest mutual information: {max_mi_feature} ({mi_scores[max_mi_feature]})")


Mutual information scores:
lead_source: 0.03
industry: 0.02
employment_status: 0.02
location: 0.0

Feature with highest mutual information: lead_source (0.03)


In [23]:
# -----------------------------------------------------------------------------
# Question 4 - logistic regression on one-hot encoded features
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "QUESTION 4", "=" * 80, sep="\n")


QUESTION 4


In [24]:
# Encode the features with DictVectorizer: each row becomes a dict, the
# vectorizer is fitted on the training rows and reused for the validation rows
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (df_train, df_val_features)
)

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

print(f"\nFeature matrix shape - Train: {X_train.shape}, Val: {X_val.shape}")


Feature matrix shape - Train: (877, 31), Val: (292, 31)


In [25]:
# Fit the logistic-regression classifier on the encoded training matrix
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [26]:
# Score the fitted model on the validation split
y_pred_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
# Two-decimal version reported as the Question 4 answer
accuracy_val_rounded = round(accuracy_val, 2)

print(
    f"\nValidation accuracy: {accuracy_val:.4f}",
    f"Validation accuracy (rounded to 2 decimals): {accuracy_val_rounded}",
    sep="\n",
)



Validation accuracy: 0.7432
Validation accuracy (rounded to 2 decimals): 0.74


In [27]:
# -----------------------------------------------------------------------------
# Question 5 - feature elimination
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "QUESTION 5", "=" * 80, sep="\n")


QUESTION 5


In [32]:
# Baseline for the elimination study: the unrounded validation accuracy of the
# model trained on all features
baseline_accuracy = accuracy_val

print(
    f"\nBaseline accuracy: {baseline_accuracy:.6f}"
)


Baseline accuracy: 0.743151


In [33]:
# Leave-one-feature-out: retrain without each column in turn and record how far
# the validation accuracy moves from the baseline
feature_importance = {}
all_features = df_train.columns.tolist()

print("\nTesting all features:")
for feature in all_features:
    # Columns that stay in the model for this round
    kept_columns = [name for name in all_features if name != feature]

    # Re-encode both splits from the remaining columns only
    encoder = DictVectorizer(sparse=False)
    X_train_subset = encoder.fit_transform(
        df_train[kept_columns].to_dict(orient='records')
    )
    X_val_subset = encoder.transform(
        df_val_features[kept_columns].to_dict(orient='records')
    )

    # Same estimator settings as the model trained on all features
    reduced_model = LogisticRegression(
        solver='liblinear', C=1.0, max_iter=1000, random_state=42
    )
    reduced_model.fit(X_train_subset, y_train)

    # Validation accuracy without the feature
    accuracy_subset = accuracy_score(y_val, reduced_model.predict(X_val_subset))

    # Positive diff: accuracy fell without the feature; negative: it rose
    diff = baseline_accuracy - accuracy_subset
    feature_importance[feature] = diff

    print(f"{feature}: accuracy={accuracy_subset:.6f}, diff={diff:.6f}")


Testing all features:
lead_source: accuracy=0.729452, diff=0.013699
industry: accuracy=0.743151, diff=0.000000
number_of_courses_viewed: accuracy=0.678082, diff=0.065068
annual_income: accuracy=0.856164, diff=-0.113014
employment_status: accuracy=0.746575, diff=-0.003425
location: accuracy=0.743151, diff=0.000000
interaction_count: accuracy=0.674658, diff=0.068493
lead_score: accuracy=0.743151, diff=0.000000


In [34]:
# Feature with the lowest signed difference across all features
# NOTE(review): min() on signed values selects the most negative diff, not the
# diff closest to zero - confirm which meaning of "smallest" is intended
min_diff_feature_all = min(
    feature_importance, key=lambda name: feature_importance[name]
)
print(f"\nFeature with smallest difference (all features): {min_diff_feature_all} ({feature_importance[min_diff_feature_all]:.6f})")



Feature with smallest difference (all features): annual_income (-0.113014)


In [35]:
# Restrict the comparison to the candidate features listed for Question 5
print("\n" + "-" * 80, "Checking only the features mentioned in Question 5:", sep="\n")
features_to_check = ['industry', 'employment_status', 'lead_score']

for feature in features_to_check:
    # Candidates without a recorded difference are left out of the listing
    if feature not in feature_importance:
        continue
    print(f"{feature}: diff={feature_importance[feature]:.6f}")


--------------------------------------------------------------------------------
Checking only the features mentioned in Question 5:
industry: diff=0.000000
employment_status: diff=-0.003425
lead_score: diff=0.000000


In [36]:
# Among the candidates, take the lowest signed difference; a candidate with no
# recorded difference is ranked as +inf so it can only win if all are missing
# NOTE(review): this compares signed values, not magnitudes - confirm that is
# the intended reading of "smallest difference"
candidate_diffs = {
    name: feature_importance.get(name, float('inf')) for name in features_to_check
}
min_diff_feature_q5 = min(features_to_check, key=candidate_diffs.get)
print(f"\nAnswer to Question 5: {min_diff_feature_q5} ({feature_importance[min_diff_feature_q5]:.6f})")



Answer to Question 5: employment_status (-0.003425)


In [37]:
# -----------------------------------------------------------------------------
# Question 6 - regularized logistic regression
# -----------------------------------------------------------------------------

# Emit a blank line, a rule, the section title and a closing rule.
print("\n" + "=" * 80, "QUESTION 6", "=" * 80, sep="\n")


QUESTION 6


In [42]:
# Sweep the regularization parameter C and record the validation accuracy
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

for C in C_values:
    # Refit on the full encoded training matrix with this C
    model_c = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    # Validation accuracy, rounded to 3 decimals before it is stored
    results[C] = round(accuracy_score(y_val, model_c.predict(X_val)), 3)
    print(f"C={C}: accuracy={results[C]}")

C=0.01: accuracy=0.743
C=0.1: accuracy=0.743
C=1: accuracy=0.743
C=10: accuracy=0.743
C=100: accuracy=0.743


In [43]:
# Highest rounded accuracy, every C that reaches it, and the smallest such C
best_accuracy = max(results.values())
best_c_values = [c for c in results if results[c] == best_accuracy]
best_c_final = min(best_c_values)

print(
    f"\nBest accuracy: {best_accuracy}",
    f"C values with best accuracy: {best_c_values}",
    f"Best C value (smallest if tie): {best_c_final}",
    sep="\n",
)


Best accuracy: 0.743
C values with best accuracy: [0.01, 0.1, 1, 10, 100]
Best C value (smallest if tie): 0.01


In [44]:
# -----------------------------------------------------------------------------
# Summary of answers
# -----------------------------------------------------------------------------

print("\n" + "=" * 80, "SUMMARY OF ANSWERS", "=" * 80, sep="\n")

# One line per question, built from the values computed in the earlier cells
summary_lines = [
    f"\nQuestion 1: Most frequent observation for 'industry': {mode_industry}",
    f"Question 2: Pair with biggest correlation: {max_pair[0]} and {max_pair[1]}",
    f"Question 3: Feature with highest mutual information: {max_mi_feature}",
    f"Question 4: Validation accuracy: {accuracy_val_rounded}",
    f"Question 5: Feature with smallest difference: {min_diff_feature_q5}",
    f"Question 6: Best C value: {best_c_final}",
]
print(*summary_lines, sep="\n")


SUMMARY OF ANSWERS

Question 1: Most frequent observation for 'industry': retail
Question 2: Pair with biggest correlation: annual_income and interaction_count
Question 3: Feature with highest mutual information: lead_source
Question 4: Validation accuracy: 0.74
Question 5: Feature with smallest difference: employment_status
Question 6: Best C value: 0.01
