## 03 Classification Homework

Build a classification model to predict whether a client has signed up for the platform or not.

### 3.1 Project: has the client signed up?

- Dataset: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

### 3.2 Data Preparation

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# URL of the raw lead-scoring CSV used throughout this notebook.
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

In [3]:
# Shell escape: download the CSV into the working directory; IPython expands `$data` to the URL string.
!wget $data -O course_lead_scoring.csv

--2025-10-15 19:30:22--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-15 19:30:22 (2.84 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [4]:
# Load the downloaded file and preview the first rows to check that it parsed correctly.
df = pd.read_csv("course_lead_scoring.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Column dtypes: `object` columns are treated as categorical below, the rest as numerical.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [6]:
# Count missing values per column before any imputation.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Partition the column labels by dtype: object columns are categorical, everything else is numerical.
# Note that the numerical group therefore also contains the `converted` target.
is_text = df.dtypes == 'object'
categorical = df.columns[is_text]
numerical = df.columns[~is_text]

print(f"Categorical columns: {categorical}")
print(f"Numerical columns: {numerical}")

Categorical columns: Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')
Numerical columns: Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')


In [8]:
# Impute missing values with constants: the string 'NA' for categorical columns, 0.0 for numerical ones.
for column_group, fill_value in ((categorical, 'NA'), (numerical, 0.0)):
    df[column_group] = df[column_group].fillna(fill_value)

In [9]:
# Re-check after imputation: every column should now report zero missing values.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

### 3.3: Q1 Unique values and the most frequent value in column 'industry'

In [10]:
# Distinct industries; the list includes the 'NA' placeholder written by the imputation step.
df['industry'].unique()

array(['NA', 'retail', 'healthcare', 'education', 'manufacturing',
       'technology', 'other', 'finance'], dtype=object)

In [11]:
# Number of distinct industry values (the 'NA' placeholder counts as one of them).
df['industry'].nunique()

8

In [12]:
# Frequency of each industry value, most frequent first.
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [13]:
# `mode()` returns a Series of the most frequent value(s); take the first entry.
df['industry'].mode()[0]

'retail'

### 3.4: Q2 Correlation Matrix for Numerical Features

In [14]:
# Pairwise correlation of the numeric columns only; the `converted` target is numeric, so it appears too.
df.corr(numeric_only=True)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


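Ans: annual_income and interaction_count (correlation ≈ 0.027, the largest among pairs of numerical features when the `converted` target is excluded)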

### 3.5 Setting up the validation framework

- Split the data into `train/val/test` sets with a `60% / 20% / 20%` distribution using scikit-learn's `train_test_split`

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# First hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 1)
# Then take 25% of the remaining 80% as validation: 0.25 * 0.8 = 0.2 of the full data,
# which leaves 60% for training.
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [17]:
# Row counts of the three partitions, in train / validation / test order.
tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [18]:
# Discard the shuffled original row labels so each partition is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [19]:
# Preview the training partition after the index reset.
df_train.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,events,manufacturing,2,95543.0,unemployed,europe,3,0.78,0
1,referral,,1,54924.0,student,south_america,6,0.39,1
2,organic_search,healthcare,2,77352.0,unemployed,europe,2,0.22,0
3,paid_ads,other,2,34600.0,employed,south_america,2,0.31,0
4,paid_ads,education,0,43615.0,unemployed,south_america,2,0.01,0


In [20]:
# Preview the validation partition after the index reset.
df_val.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,organic_search,manufacturing,1,0.0,,asia,0,0.73,1
1,referral,education,2,58777.0,,north_america,0,0.94,1
2,paid_ads,technology,3,78148.0,employed,middle_east,2,0.8,1
3,social_media,technology,3,63854.0,employed,africa,1,0.1,0
4,referral,education,1,69099.0,unemployed,africa,4,0.98,1


In [21]:
# Preview the test partition after the index reset.
df_test.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,organic_search,technology,1,0.0,employed,middle_east,2,0.9,0
1,paid_ads,finance,1,47129.0,,south_america,1,0.93,0
2,,education,0,0.0,unemployed,asia,5,0.97,1
3,events,retail,0,64775.0,self_employed,south_america,3,0.7,1
4,social_media,finance,3,46934.0,,australia,3,0.26,1


### 3.6 Q3: Calculate the mutual information score between `converted` and the other categorical variables

In [22]:
from sklearn.metrics import mutual_info_score

In [23]:
def mutual_info_score_series(series):
    """Return the mutual information between `series` and the training target.

    The target is read from the notebook-level `df_train['converted']`, so
    `series` is expected to be a column of `df_train` (same rows, same order).
    """
    converted = df_train['converted']
    return mutual_info_score(series, converted)

In [24]:
# Score every categorical training column against the target, highest mutual information first.
mi = (
    df_train[categorical]
    .apply(mutual_info_score_series)
    .sort_values(ascending=False)
)
mi

lead_source          0.024803
employment_status    0.016345
industry             0.006161
location             0.001453
dtype: float64

Ans: lead_source

### 3.7: One-hot encoding for logistic regression

In [25]:
from sklearn.feature_extraction import DictVectorizer

In [26]:
# Name of the label column; every other column of `df` is used as a model feature.
target = 'converted'
feature_cols = df.columns.drop(target).tolist()

In [27]:
# Split the feature columns (target excluded) by dtype, keeping plain Python lists of names.
feature_categorical = (
    df_train[feature_cols]
    .select_dtypes(include=['object'])
    .columns
    .tolist()
)
feature_numerical = (
    df_train[feature_cols]
    .select_dtypes(exclude=['object'])
    .columns
    .tolist()
)

print(f"Feature Categorical columns: {feature_categorical}")
print(f"Feature Numerical columns: {feature_numerical}")

Feature Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Feature Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [28]:
# Dense output so the encoded matrix can be inspected directly as a NumPy array.
dv = DictVectorizer(sparse=False)

# Labels for the training partition.
y_train = df_train[target].values

# One record (dict) per row; the vectorizer learns its feature mapping from the training rows only.
train_dicts = df_train[feature_categorical + feature_numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_train

array([[9.5543e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [5.4924e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [7.7352e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       ...,
       [7.3702e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        1.0000e+00],
       [9.3341e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]], shape=(876, 31))

In [29]:
# Labels for the validation partition.
y_val = df_val[target].values

# Encode validation rows with the mapping already fitted on the training data (transform only, no refit).
val_dicts = df_val[feature_categorical + feature_numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
X_val

array([[0.0000e+00, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.8777e+04, 1.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        2.0000e+00],
       [7.8148e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [9.2215e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00],
       [5.3087e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [6.0375e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00]], shape=(293, 31))

In [30]:
# Sanity check: both matrices must have the same number of encoded columns.
print(X_train.shape, X_val.shape)  

(876, 31) (293, 31)


In [31]:
# Debug check: both feature-name containers are plain lists, so they can be concatenated with `+`.
print(type(feature_categorical), type(feature_numerical))

<class 'list'> <class 'list'>


### 3.9 Q4: Logistic Regression using sklearn

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [33]:
# Baseline classifier; the fixed random_state keeps the liblinear fit reproducible.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [34]:
# Fit on the training matrix, then score hard class predictions on the validation set.
y_pred = model.fit(X_train, y_train).predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.70


### 3.10 Q5: Finding Least useful feature using Feature Elimination Technique

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [36]:
def _validation_accuracy(feature_names):
    """Fit the baseline logistic regression on `feature_names` and return its validation accuracy.

    Parameters
    ----------
    feature_names : list of str
        Columns of `df_train` / `df_val` to use as model inputs.

    Returns
    -------
    float
        Accuracy of the fitted model on the validation set (`y_val`).

    A fresh DictVectorizer is fitted on the training rows for each call, so the
    encoding always matches exactly the feature subset being evaluated.
    """
    encoder = DictVectorizer(sparse=False)
    X_tr = encoder.fit_transform(df_train[feature_names].to_dict(orient='records'))
    X_va = encoder.transform(df_val[feature_names].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_tr, y_train)
    return accuracy_score(y_val, clf.predict(X_va))


# Every column except the target, in the frame's own column order.
all_features = [c for c in df_train.columns if c != 'converted']

# Recompute the baseline here instead of reading the notebook-level `accuracy`:
# that name is reassigned by the C-tuning loop further down, so re-running this
# cell afterwards would otherwise compare against the wrong model.
base_accuracy = _validation_accuracy(all_features)

# Difference = baseline accuracy minus accuracy with one feature removed.
# A negative value means the model scored higher without that feature.
results = {
    col: base_accuracy - _validation_accuracy([c for c in all_features if c != col])
    for col in ['industry', 'employment_status', 'lead_score']
}

results

{'industry': 0.0,
 'employment_status': -0.0034129692832765013,
 'lead_score': 0.0}

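Ans: employment_status (smallest signed difference, about -0.0034; `industry` and `lead_score` both show a difference of 0.0)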

### 3.11 Q6: Tuning the model with different C values to find the one with the best accuracy

In [37]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Candidate regularisation values; in scikit-learn a smaller C means stronger regularisation.
C_values = [0.01, 0.1, 1, 10, 100]

# Maps each C to its validation accuracy rounded to 3 decimals.
scores = {}

# NOTE: this loop rebinds the notebook-level `model`, `y_pred` and `accuracy`;
# after the cell finishes they refer to the last run (C=100).
for c in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=c,
        max_iter=1000,
        random_state=42,
    )
    y_pred = model.fit(X_train, y_train).predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    scores[c] = round(accuracy, 3)

scores

{0.01: 0.7, 0.1: 0.7, 1: 0.7, 10: 0.7, 100: 0.7}

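Ans: 0.01 (every tested C gives a validation accuracy of 0.7 when rounded to 3 decimals, so the smallest C is selected)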