In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


import urllib.request
import os

# Download the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
filename = "course_lead_scoring.csv"

print(f"Downloading {filename}...")
# urllib.request.urlretrieve(url, filename)

# Verify the download
if os.path.exists(filename):
    file_size = os.path.getsize(filename)
    print(f"✅ Download successful!")
    print(f"File: {filename}")
    print(f"Size: {file_size} bytes")
else:
    print("❌ Download failed!")
#  desired target for classification task will be converted variable - has the client signed up to the platform or not.

Downloading course_lead_scoring.csv...
✅ Download successful!
File: course_lead_scoring.csv
Size: 80876 bytes


In [12]:
df = pd.read_csv('course_lead_scoring.csv')
len(df)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [13]:
# Check if the missing values are presented in the features.
# If there are missing values:
# For caterogiral features, replace them with 'NA'
# For numerical features, replace with with 0.0
df.isnull().sum()[df.isnull().sum() > 0]

lead_source          128
industry             134
annual_income        181
employment_status    100
location              63
dtype: int64

In [14]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('NA')
    else:
        df[col] = df[col].fillna(0.0)
df.isnull().sum()[df.isnull().sum() > 0]

Series([], dtype: int64)

# Question 1

What is the most frequent observation (mode) for the column industry?

Answer: 'retail'

In [15]:
df['industry'].mode()[0]

'retail'

# Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

```
interaction_count and lead_score
number_of_courses_viewed and lead_score
number_of_courses_viewed and interaction_count
annual_income and interaction_count
```

Only consider the pairs above when answering this question.

Answer: annual_income and interaction_count

In [None]:
cols = [
    'interaction_count', 
    'lead_score', 
    'number_of_courses_viewed', 
    'annual_income'
]
correlation_matrix = df[cols].corr()
print(correlation_matrix)
corr_no_diag = correlation_matrix.copy()
np.fill_diagonal(corr_no_diag.values, np.nan)
corr_abs = corr_no_diag.abs()
max_corr = corr_no_diag.max().max()
print(f"Maximum correlation: {max_corr}")
# annual_income-interaction_count = 0.027036
# annual_income-lead_score = 0.015610
# 

                          interaction_count  lead_score  \
interaction_count                  1.000000    0.009888   
lead_score                         0.009888    1.000000   
number_of_courses_viewed          -0.023565   -0.004879   
annual_income                      0.027036    0.015610   

                          number_of_courses_viewed  annual_income  
interaction_count                        -0.023565       0.027036  
lead_score                               -0.004879       0.015610  
number_of_courses_viewed                  1.000000       0.009770  
annual_income                             0.009770       1.000000  
Maximum correlation: 0.027036472404814403


# Question 3

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

```
industry
location
lead_source
employment_status
```

Answer: lead_source

In [None]:
X = df.drop('converted', axis=1)
y = df['converted']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, X_val.shape, X_test.shape

((877, 8), (292, 8), (293, 8))

In [34]:
# X_train['industry'].astype('category').cat.codes.values.reshape(-1, 1)

In [None]:
scores = {}
for col in ['industry', 'location', 'lead_source', 'employment_status']:
    X_encoded = X_train[col].astype('category').cat.codes.values.reshape(-1, 1)
    # print(X_encoded)
    score = mutual_info_classif(X_encoded, y_train, discrete_features=True, random_state=42)
    scores[col] = round(score[0], 2)
scores

{'industry': np.float64(0.02),
 'location': np.float64(0.0),
 'lead_source': np.float64(0.03),
 'employment_status': np.float64(0.02)}

In [36]:
max(scores, key=scores.get)

'lead_source'

# Question 4

Now let's train a logistic regression.

Remember that we have several categorical variables in the dataset. 
Include them using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

`model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?
```
0.64
0.74
0.84
0.94
```

Answer: 0.74


In [None]:
cat_cols = ['industry', 'location', 'lead_source', 'employment_status']

X_train_encoded = pd.get_dummies(X_train, columns=cat_cols)
X_val_encoded = pd.get_dummies(X_val, columns=cat_cols)

X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_val_encoded

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,industry_NA,industry_education,industry_finance,industry_healthcare,industry_manufacturing,industry_other,...,lead_source_events,lead_source_organic_search,lead_source_paid_ads,lead_source_referral,lead_source_social_media,employment_status_NA,employment_status_employed,employment_status_self_employed,employment_status_student,employment_status_unemployed
886,1,63127.0,6,0.70,False,False,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False
619,6,75389.0,2,0.04,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,True,False
548,2,66519.0,4,0.33,False,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,True
1046,3,60910.0,3,0.32,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
618,0,63425.0,2,0.40,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,3,63032.0,3,0.79,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
754,2,55442.0,5,0.10,False,True,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
1244,4,26839.0,1,0.65,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
650,3,0.0,4,0.31,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False


In [61]:
lr_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [67]:
lr_model.fit(X_train_encoded, y_train)
val_preds = lr_model.predict(X_val_encoded)
print(val_preds.shape, y_val.shape)
accuracy_score_val = accuracy_score(y_val, val_preds)
print("accuracy_score:", round(accuracy_score_val , 2))

(292,) (292,)
accuracy_score: 0.74


# Question 5

Let's find the least useful feature using the feature elimination technique.

Train a model using the same features and parameters as in Q4 (without rounding).

Now exclude each feature from this set and train a model without it. 
Record the accuracy for each model.

For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?
```
'industry'
'employment_status'
'lead_score'
```

Answer: 'industry'

In [72]:
# cat_cols = ['industry', 'location', 'lead_source', 'employment_status']

features_to_test = ["industry", "employment_status", 'location', "lead_source"]
diffs = {}

for feature in features_to_test:
    cols_to_drop = [col for col in X_train_encoded.columns if col.startswith(feature + "_")]
    print(cols_to_drop)
    X_train_wo_f = X_train_encoded.drop(columns=cols_to_drop)
    X_val_wo_f = X_val_encoded.drop(columns=cols_to_drop)
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_wo_f, y_train)
    val_pred = model.predict(X_val_wo_f)
    # print(val_pred.shape, y_val.shape)
    acc_subset = accuracy_score(y_val, val_pred)
    print(acc_subset)
    diffs[feature] = abs(accuracy_score_val - acc_subset)
print(diffs)
min(diffs, key=diffs.get)

['industry_NA', 'industry_education', 'industry_finance', 'industry_healthcare', 'industry_manufacturing', 'industry_other', 'industry_retail', 'industry_technology']
0.7431506849315068
['employment_status_NA', 'employment_status_employed', 'employment_status_self_employed', 'employment_status_student', 'employment_status_unemployed']
0.7465753424657534
['location_NA', 'location_africa', 'location_asia', 'location_australia', 'location_europe', 'location_middle_east', 'location_north_america', 'location_south_america']
0.7431506849315068
['lead_source_NA', 'lead_source_events', 'lead_source_organic_search', 'lead_source_paid_ads', 'lead_source_referral', 'lead_source_social_media']
0.7294520547945206
{'industry': 0.0, 'employment_status': 0.003424657534246589, 'location': 0.0, 'lead_source': 0.013698630136986245}


'industry'

# Question 6

Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: `[0.01, 0.1, 1, 10, 100]`

Train models using all the features as in Q4.

Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?
```
0.01
0.1
1
10
100
```

Answer: 0.01 as all equal

In [75]:
c_params = [0.01, 0.1, 1, 10, 100]
results = {}

for cp in c_params:
    model = LogisticRegression(solver='liblinear', C=cp, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    val_preds = model.predict(X_val_encoded)
    acc = accuracy_score(y_val, val_preds)
    print(acc)
    results[cp] = round(acc, 3)

results

0.7431506849315068
0.7431506849315068
0.7431506849315068
0.7431506849315068
0.7431506849315068


{0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}

0.01