In [3]:
import pandas as pd
import numpy as np

In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-15 17:58:59--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-15 17:58:59 (54.9 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [4]:
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Data Preparation

Check if missing values are present in the features.

**If there are missing values:**
- For **categorical features**, replace them with `'NA'`
- For **numerical features**, replace them with `0.0`


In [5]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
df.dtypes


lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [7]:
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)
numerical_features = list(df.dtypes[df.dtypes != 'object'].index)

# From the output below, numerical_fetures includes converted variable which is our target therefore the tagert variable has to be removed
features = (categorical_features + numerical_features)
features.remove('converted')

print('Categorical Columns')
print(categorical_features)
print()
print('Numerical Columns')
print(numerical_features)
print()
print('Features')
print(features)

Categorical Columns
['lead_source', 'industry', 'employment_status', 'location']

Numerical Columns
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']

Features
['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [8]:
# To see only columns that have missing values
df.isnull().sum()[df.isnull().sum() != 0]


lead_source          128
industry             134
annual_income        181
employment_status    100
location              63
dtype: int64

In [9]:
# for categorical variables, missing values are replaced with 'NA'
df.lead_source = df.lead_source.fillna('NA')
df.industry = df.industry.fillna('NA')
df.employment_status = df.employment_status.fillna('NA')
df.location = df.location.fillna('NA')

# for numerical variables, missing values are replaced with 'NA'
df.annual_income = df.annual_income.fillna(0)

# Cleaner way
# df[categorical_features] = df[categorical_features].fillna('NA')
# df[numerical_features] = df[numerical_features].fillna(0.0)


In [10]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

### Question 1

What is the most frequent observation (**mode**) for the column `industry`?

- NA  
- technology  
- healthcare  
- retail


In [11]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [12]:
print(f'The most frequent observation (mode) for the column industry is {df.industry.mode().values[0]}.')

The most frequent observation (mode) for the column industry is retail.


### Question 2

Create the **correlation matrix** for the numerical features of your dataset.  
In a correlation matrix, you compute the correlation coefficient between every pair of features.

**What are the two features that have the biggest correlation?**

- interaction_count and lead_score  
- number_of_courses_viewed and lead_score  
- number_of_courses_viewed and interaction_count  
- annual_income and interaction_count  

*Only consider the pairs above when answering this question.*


In [13]:
### Question 2

# Create the correlation matrix for numerical features
corr_matrix = df[numerical_features].corr()
#exclude the traget

# Display the correlation matrix
print("Correlation Matrix:")
print(corr_matrix)

# Extract correlation values for the specific pairs mentioned in the question
pairs = {
    'interaction_count and lead_score': corr_matrix.loc['interaction_count', 'lead_score'],
    'number_of_courses_viewed and lead_score': corr_matrix.loc['number_of_courses_viewed', 'lead_score'],
    'number_of_courses_viewed and interaction_count': corr_matrix.loc['number_of_courses_viewed', 'interaction_count'],
    'annual_income and interaction_count': corr_matrix.loc['annual_income', 'interaction_count']
}


# Print correlation values for these pairs
print("\nCorrelation values for selected pairs:")
for pair, corr_value in pairs.items():
    print(f"{pair}: {corr_value:.4f}")

# Alternatively Extract and print their correlations (The previous 2 steps)
# pairs = [
#     ('interaction_count', 'lead_score'),
#     ('number_of_courses_viewed', 'lead_score'),
#     ('number_of_courses_viewed', 'interaction_count'),
#     ('annual_income', 'interaction_count')
# ]

# print("\nSelected feature pair correlations:")
# for f1, f2 in pairs:
#     corr_value = corr_matrix.loc[f1, f2]
#     print(f"{f1} and {f2}: {corr_value:.3f}")

# Find the pair with the highest correlation
max_pair = max(pairs, key=pairs.get)

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  

Correlation values for selected pairs:
interaction_count and lead_score: 0.0099
number_of_courses_viewed and lead_score: -0.0049
number_of_courses_vie

In [14]:
print(f"The two features with the highest correlation are: {max_pair}")


The two features with the highest correlation are: annual_income and interaction_count


Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value y is not in your dataframe.

In [15]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [17]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
X_train = df_train[features]
X_val = df_val[features]
X_test = df_val[features]

In [19]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

### Question 3
 
- Calculate the **mutual information score** between the target variable `y` and other **categorical variables** in the dataset. Use the **training set only** for this calculation.
- Round the scores to **2 decimal places** using `round(score, 2)`.

**Which of these variable has the biggest mutual information score?**

- industry  
- location  
- lead_source  
- employment_status


In [20]:
from sklearn.metrics import mutual_info_score

In [21]:
# Funtion to get mutual information between a feature and the target variable
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [22]:
# Applying the function to get the mutual information for each feature
mi = df_full_train[categorical_features].apply(mutual_info_churn_score)
mi.sort_values(ascending=False).apply(lambda x: round(x, 2))
# You can also round by
# mi = mi.sort_values(ascending=False).round(2)

lead_source          0.02
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

In [23]:
print(f"The biggest mutual information score is {mi.idxmax()}")

The biggest mutual information score is lead_source


### Question 4
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

**What accuracy did you get?**

- 0.64
- 0.74
- 0.84
- 0.94

In [32]:
# One hot encoding
from sklearn.feature_extraction import DictVectorizer

In [33]:
dv = DictVectorizer(sparse=False)

In [34]:
X_train_dict = X_train.to_dict(orient='records')
X_train_Q4 = dv.fit_transform(X_train_dict)

X_val_dict = X_val.to_dict(orient='records')
X_val_Q4 = dv.transform(X_val_dict)


In [35]:
# Training logistic regression with Scikit-Learn
from sklearn.linear_model import LogisticRegression

In [36]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model.fit(X_train_Q4, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [37]:
model.intercept_[0]

np.float64(-0.08556616376410939)

In [38]:
model.coef_[0].round(3)


array([-0.   , -0.024,  0.044, -0.01 ,  0.001, -0.096, -0.028,  0.038,
       -0.005, -0.034,  0.001, -0.018, -0.033, -0.006,  0.297,  0.048,
        0.008, -0.012, -0.012, -0.111,  0.073, -0.031, -0.001, -0.009,
       -0.011, -0.02 , -0.006, -0.009, -0.027, -0.003,  0.45 ])

In [40]:
# Hard Prediction on training data
model.predict(X_train_Q4)

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,

In [41]:
# Hard Prediction on validation data
model.predict(X_val_Q4)

array([1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1])

In [42]:
# Soft Prediction
# Returns probabiliy of negative class (not converted in this case) and probability of positive class (Converted in this case)
model.predict_proba(X_val_Q4)

array([[0.42772713, 0.57227287],
       [0.05254591, 0.94745409],
       [0.294524  , 0.705476  ],
       [0.6329651 , 0.3670349 ],
       [0.58156317, 0.41843683],
       [0.14783571, 0.85216429],
       [0.25284517, 0.74715483],
       [0.51709348, 0.48290652],
       [0.24475747, 0.75524253],
       [0.28720036, 0.71279964],
       [0.40677406, 0.59322594],
       [0.6624048 , 0.3375952 ],
       [0.4658819 , 0.5341181 ],
       [0.35191745, 0.64808255],
       [0.37917035, 0.62082965],
       [0.48376376, 0.51623624],
       [0.19337052, 0.80662948],
       [0.22722103, 0.77277897],
       [0.34958558, 0.65041442],
       [0.45010573, 0.54989427],
       [0.12352981, 0.87647019],
       [0.41500454, 0.58499546],
       [0.59501465, 0.40498535],
       [0.36408221, 0.63591779],
       [0.36030834, 0.63969166],
       [0.558458  , 0.441542  ],
       [0.55469964, 0.44530036],
       [0.65486698, 0.34513302],
       [0.27454181, 0.72545819],
       [0.27289076, 0.72710924],
       [0.

In [43]:
# Selecting the probability of positive class
y_pred = model.predict_proba(X_val_Q4)[:, 1]

In [44]:
converted_decision = (y_pred >= 0.5)

In [45]:
accuracy = (y_val == converted_decision).mean().round(2)
accuracy

np.float64(0.73)

In [46]:
print(f"The Model Accuracy: {accuracy}")

The Model Accuracy: 0.73


### Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model using the same features and parameters as in Q4 (without rounding).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

**Which of following feature has the smallest difference?**

- 'industry'
- 'employment_status'
- 'lead_score'

   **Note: The difference doesn't have to be positive.**

In [47]:
# Each time a feature is excluded, the features data is modified and one-hot encoding has to be done again
accuracy_differences = {}

features_Q5 = [f for f in features if f not in ['lead_source', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count']]

for feature in features_Q5:
    # feature exclusion
    features_left = [f for f in features_Q5 if f != feature]

    X_train_i = df_train[features_left].reset_index(drop=True)
    X_val_i = df_val[features_left].reset_index(drop=True)
    X_test_i = df_test[features_left].reset_index(drop=True)

    # One-hot Encoding
    dv_i = DictVectorizer(sparse=False)
    X_train_i = dv_i.fit_transform(X_train_i.to_dict(orient='records'))
    X_val_i = dv_i.transform(X_val_i.to_dict(orient='records'))

    # Train the model using the modified features each time a feature is excluded
    model_i = LogisticRegression(solver="liblinear", C=1.0, max_iter=1_000, random_state=42)
    model_i.fit(X_train_i, y_train)
    y_pred_i = model_i.predict_proba(X_val_i)[:, 1]
    converted_decision_i = (y_pred_i >= 0.5)
    accuracy_i = (y_val == converted_decision_i).mean()

    # Calculate and store the difference in accuracy
    diff = (accuracy - accuracy_i)
    accuracy_differences[feature] = diff

    print(f"Model without {feature:<26} Accuracy = {accuracy_i:.4f}   Difference = {diff:.4f}")

Model without industry                   Accuracy = 0.6314   Difference = 0.0986
Model without employment_status          Accuracy = 0.5836   Difference = 0.1464
Model without lead_score                 Accuracy = 0.6143   Difference = 0.1157


In [49]:
smallest_diff = min(accuracy_differences, key=accuracy_differences.get)

print(f'From the given options, the feature with the smallest difference is {smallest_diff}')


From the given options, the feature with the smallest difference is industry


### Question 6
- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

**Which of these C leads to the best accuracy on the validation set?**

- 0.01
- 0.1
- 1
- 10
- 100

In [50]:
accuracies = []

C = [0.01, 0.1, 1, 10, 100]

for C_i in C:
    model = LogisticRegression(solver='liblinear', C=C_i, max_iter=1000, random_state=42)

    model.fit(X_train_Q4, y_train)

    model.predict_proba(X_val_Q4)

    y_pred_i = model.predict_proba(X_val_Q4)[:, 1]

    converted_decision_i = (y_pred_i >= 0.5)

    accuracy_i = (y_val == converted_decision_i).mean()

    accuracies.append({'C': C_i, 'accuracy': accuracy_i})

accuracies

[{'C': 0.01, 'accuracy': np.float64(0.7303754266211604)},
 {'C': 0.1, 'accuracy': np.float64(0.7303754266211604)},
 {'C': 1, 'accuracy': np.float64(0.7303754266211604)},
 {'C': 10, 'accuracy': np.float64(0.7303754266211604)},
 {'C': 100, 'accuracy': np.float64(0.7303754266211604)}]

In [51]:
# Best regularization parameter
best_accuracy = max(accuracies, key=lambda x: x['accuracy'])

best_C = best_accuracy['C']


print(f'Best Regularization parameter is {best_C}')

Best Regularization parameter is 0.01
