In [39]:
#importing relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [5]:
#displaying the application data set as a dataframe
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")
pd.set_option('display.max_columns',50)
display(df.head())

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [7]:
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [8]:
# How many columns in the dataset have missing values?
print(df.isnull().sum()[df.isnull().sum() > 0])

lead_source          128
industry             134
annual_income        181
employment_status    100
location              63
dtype: int64


In [9]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [16]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [11]:
# Replace missing values correctly
for col in df.columns:
    if df[col].dtype == 'object':      # Categorical
        df[col] = df[col].fillna('NA')
    else:                              # Numerical
        df[col] = df[col].fillna(0.0)

In [14]:
#What is the most frequent observation (mode) for the column industry?
df['industry'].mode()[0]

'retail'

In [21]:
#numerical_features
numerical_features = ["number_of_courses_viewed", "interaction_count", "converted"]

# Categorical features
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']

In [45]:
#What are the two features that have the biggest correlation?
# Compute correlation matrix for numerical features
corr_matrix = df[numerical_features].corr()

# Find the two features with the highest correlation (excluding self-correlation)
corr_unstacked = corr_matrix.unstack().sort_values(ascending=False)
corr_unstacked = corr_unstacked[corr_unstacked < 1]  # remove self-correlation
top_pair = corr_unstacked.idxmax()
top_value = corr_unstacked.max()

top_pair, top_value

(('number_of_courses_viewed', 'interaction_count'), -0.023565222882888037)

In [23]:
# Separate features and target
X = df.drop(columns=['converted'])
y = df['converted']

In [26]:
# First split: train (60%) and temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

In [27]:
# Second split: val (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [36]:
# Encode categorical features
X_train_encoded = X_train[categorical_features].copy()
for col in categorical_features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col])

In [46]:
# Which of these variables has the biggest mutual information score?
# Copy X_train to not change the original
X_train_encoded = X_train[categorical_features].copy()

# Encode categorical features
for col in categorical_features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col])

# Compute mutual information
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True, random_state=42)

# Round and pair
mi_dict = {feature: round(score, 2) for feature, score in zip(categorical_features, mi_scores)}

# Feature with highest MI
max_feature = max(mi_dict, key=mi_dict.get)
max_feature, mi_dict


('lead_source',
 {'industry': 0.02,
  'location': 0.0,
  'lead_source': 0.03,
  'employment_status': 0.02})

In [47]:
#What accuracy did you get?
# Preprocessing: one-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # keep numerical features
)

# Define model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Pipeline: preprocessing + model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Fit model on training data
pipeline.fit(X_train, y_train)

# Predict on validation set
y_val_pred = pipeline.predict(X_val)

# Calculate accuracy
accuracy = round(accuracy_score(y_val, y_val_pred), 2)
accuracy


0.74

In [43]:
# Get full list of features after one-hot encoding
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[categorical_features])
X_val_ohe = ohe.transform(X_val[categorical_features])

In [50]:
# Which of following feature has the smallest difference?
# Preprocessing: one-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

# Original model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
pipeline.fit(X_train, y_train)

# Accuracy on validation set
y_val_pred = pipeline.predict(X_val)
original_acc = accuracy_score(y_val, y_val_pred)

# Get feature names after one-hot encoding
ohe = preprocessor.named_transformers_['cat']
ohe_features = ohe.get_feature_names_out(categorical_features)
all_features = list(ohe_features) + numerical_features

# Transform training and validation sets
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Store accuracy differences
accuracy_diff = {}

for i, feature in enumerate(all_features):
    # Remove feature column
    X_train_drop = np.delete(X_train_transformed, i, axis=1)
    X_val_drop = np.delete(X_val_transformed, i, axis=1)
    
    # Train model
    model.fit(X_train_drop, y_train)
    y_val_pred_drop = model.predict(X_val_drop)
    
    # Difference in accuracy
    accuracy_diff[feature] = original_acc - accuracy_score(y_val, y_val_pred_drop)

# Convert to DataFrame for easier inspection
accuracy_diff_df = pd.DataFrame.from_dict(accuracy_diff, orient='index', columns=['accuracy_diff']).sort_values('accuracy_diff')
accuracy_diff_df

Unnamed: 0,accuracy_diff
interaction_count,-0.113014
employment_status_unemployed,-0.003425
employment_status_self_employed,-0.003425
lead_source_paid_ads,-0.003425
employment_status_student,0.0
employment_status_employed,0.0
employment_status_NA,0.0
lead_source_social_media,0.0
lead_source_referral,0.0
lead_source_organic_search,0.0


In [49]:
# Which of these C leads to the best accuracy on the validation set?
# Preprocessing: one-hot encoding for categorical, passthrough for numerical
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

# Values of regularization parameter C
C_values = [0.01, 0.1, 1, 10, 100]

# Store accuracy results
results = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)
    results[C] = round(accuracy_score(y_val, y_val_pred), 3)

results

{0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}