In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mutual_info_score
from sklearn.preprocessing import OneHotEncoder
import warnings
# Install a blanket filter: every warning category is suppressed from here on
warnings.filterwarnings(action='ignore')

# Read the lead-scoring CSV straight from its public GitHub location
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as this cell's output
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Data preparation
- Check whether any of the features contain missing values.
- If there are missing values:
    - For categorical features, replace them with 'NA'
    - For numerical features, replace them with 0.0

In [2]:
# Count the NaNs in each column and list the most affected columns first
print("Missing values before processing:")
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

Missing values before processing:
annual_income               181
industry                    134
lead_source                 128
employment_status           100
location                     63
number_of_courses_viewed      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [3]:
# Column groups by type; each group gets its own placeholder for missing data
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Fill each group with a single call: 'NA' for categories, 0.0 for numbers
df[categorical_features] = df[categorical_features].fillna('NA')
df[numerical_features] = df[numerical_features].fillna(0.0)

# Confirm that no NaNs are left in any column
print("\nMissing values after processing:")
print(df.isnull().sum())


Missing values after processing:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


### Question 1
What is the most frequent observation (mode) for the column industry?

In [4]:
# Question 1
# mode() returns a Series (there can be several modes); keep its first entry
industry_mode = df['industry'].mode().iloc[0]
print(f"Q1 - Most frequent industry: {industry_mode}")

Q1 - Most frequent industry: retail


### Question 2: Correlation matrix

In [5]:
# Question 2
# Pairwise Pearson correlations, restricted to the numerical columns
correlation_matrix = df.loc[:, numerical_features].corr(method='pearson')
print("Correlation matrix:")
correlation_matrix

Correlation matrix:


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [6]:
# Absolute correlation for every unordered pair of distinct numerical
# features (upper triangle of the matrix, diagonal excluded)
corr_pairs = {
    (feat1, feat2): abs(correlation_matrix.loc[feat1, feat2])
    for i, feat1 in enumerate(numerical_features)
    for feat2 in numerical_features[i + 1:]
}

# Strongest pair; max() keeps the earliest entry if several pairs tie
max_pair = max(corr_pairs, key=lambda pair: corr_pairs[pair])
print(f"\nQ2 - Pair with highest correlation: {max_pair} with value {corr_pairs[max_pair]:.3f}")


Q2 - Pair with highest correlation: ('annual_income', 'interaction_count') with value 0.027


#### Split the data

In [7]:
# Separate the target column from the predictors
y = df['converted']
X = df.drop(columns='converted')

# Hold out 40% of the rows first, then cut that hold-out in half to obtain
# the validation and test sets (same seed for both splits)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Report the size of each partition
print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 877 samples
Validation set: 292 samples
Test set: 293 samples


### Question 3: Mutual Information

In [8]:
# Question 3
def calculate_mi(series):
    """Return the mutual information between ``series`` and the training target.

    The target is read from the notebook-level ``y_train``; ``series`` is
    passed to ``mutual_info_score`` together with it, unchanged.
    """
    score = mutual_info_score(series, y_train)
    return score

# Store the unrounded scores: comparing values that were already rounded to
# two decimals (e.g. when picking the best feature afterwards) can create
# artificial ties and silently select the wrong column.
mi_scores = {col: calculate_mi(X_train[col]) for col in categorical_features}

print("Q3 - Mutual Information scores:")
for col, score in mi_scores.items():
    # Rounding is applied for display only
    print(f"{col}: {round(score, 2)}")

Q3 - Mutual Information scores:
lead_source: 0.03
industry: 0.02
employment_status: 0.02
location: 0.0


In [9]:
# Feature with the largest stored score; on equal scores the earliest
# dictionary entry is the one returned
max_mi_feature = max(mi_scores.items(), key=lambda item: item[1])[0]
print(f"Feature with highest MI: {max_mi_feature}")

Feature with highest MI: lead_source


### Question 4: Logistic Regression

In [10]:
# Question 4
# Prepare features with one-hot encoding
def prepare_features(X_df):
    """One-hot encode the notebook's categorical columns of ``X_df``.

    Each column named in the notebook-level ``categorical_features`` list is
    replaced by its ``pd.get_dummies`` indicator columns, prefixed with the
    source column name. The remaining columns keep their original order and
    come first; the indicator groups follow in ``categorical_features``
    order. ``X_df`` itself is not modified.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Frame containing every column listed in ``categorical_features``.

    Returns
    -------
    pandas.DataFrame
        New frame with the categorical columns replaced by indicators.
    """
    # One indicator frame per categorical column, in list order
    dummy_frames = [pd.get_dummies(X_df[col], prefix=col) for col in categorical_features]

    # Untouched columns first, then all indicator groups, joined in one pass
    passthrough = X_df.drop(columns=categorical_features)
    return pd.concat([passthrough, *dummy_frames], axis=1)

X_train_encoded = prepare_features(X_train)
# get_dummies only creates indicator columns for the categories present in
# the frame it receives, so the two splits are encoded independently.
# Aligning the validation matrix to the training columns guarantees the model
# sees the same features in the same order: categories absent from validation
# become all-zero columns, categories absent from training are dropped.
X_val_encoded = prepare_features(X_val).reindex(columns=X_train_encoded.columns, fill_value=0)

# Train logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict on validation set
y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_val_pred)
accuracy_rounded = round(accuracy, 2)

In [11]:
# Raw validation accuracy followed by its two-decimal rounding, one per line
print(
    f"Q4 - Validation accuracy: {accuracy}",
    f"Q4 - Rounded accuracy: {accuracy_rounded}",
    sep="\n",
)

Q4 - Validation accuracy: 0.7431506849315068
Q4 - Rounded accuracy: 0.74


### Question 5: Feature Elimination

In [12]:
# Question 5: Feature Elimination - Fixed Version
def prepare_features_selected(X_df, categorical_features_to_use):
    """One-hot encode the requested categorical columns that ``X_df`` contains.

    Names in ``categorical_features_to_use`` that are not columns of ``X_df``
    are skipped. Each remaining column is replaced by its ``pd.get_dummies``
    indicator columns, prefixed with the source column name. The other
    columns keep their original order and come first; the indicator groups
    follow in the order given by ``categorical_features_to_use``. ``X_df``
    itself is not modified.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Frame to encode.
    categorical_features_to_use : list of str
        Candidate column names to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns replaced by indicators.
    """
    # Keep only the requested columns that actually exist in the frame
    present = [col for col in categorical_features_to_use if col in X_df.columns]

    # One indicator frame per selected column, in request order
    dummy_frames = [pd.get_dummies(X_df[col], prefix=col) for col in present]

    # Untouched columns first, then all indicator groups, joined in one pass
    passthrough = X_df.drop(columns=present)
    return pd.concat([passthrough, *dummy_frames], axis=1)

# Get original accuracy with all features
X_train_encoded = prepare_features_selected(X_train, categorical_features)
# The splits are encoded independently, so align the validation matrix to the
# training columns: categories absent from validation become all-zero
# columns, categories absent from training are dropped, and the column order
# matches what the model is fitted on.
X_val_encoded = prepare_features_selected(X_val, categorical_features).reindex(
    columns=X_train_encoded.columns, fill_value=0
)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)
y_val_pred = model.predict(X_val_encoded)
original_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Original accuracy with all features: {original_accuracy:.4f}")

Original accuracy with all features: 0.7432


In [13]:
# Maps each removed feature to (baseline accuracy - accuracy without it)
feature_differences = {}
features_to_test = ['industry', 'employment_status', 'lead_score']

for feature in features_to_test:
    # Remove the feature from training and validation sets
    X_train_reduced = X_train.drop(columns=feature)
    X_val_reduced = X_val.drop(columns=feature)

    # Categorical columns still left to encode once `feature` is gone; when
    # `feature` is numerical this is simply a copy of the full list
    categorical_features_reduced = [f for f in categorical_features if f != feature]

    # Encode each split, then align validation to the training columns so
    # that a category missing from one split cannot shift or drop features
    X_train_encoded_reduced = prepare_features_selected(X_train_reduced, categorical_features_reduced)
    X_val_encoded_reduced = prepare_features_selected(
        X_val_reduced, categorical_features_reduced
    ).reindex(columns=X_train_encoded_reduced.columns, fill_value=0)

    # Train model
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_encoded_reduced, y_train)

    # Predict and calculate accuracy
    y_val_pred_reduced = model_reduced.predict(X_val_encoded_reduced)
    reduced_accuracy = accuracy_score(y_val, y_val_pred_reduced)

    # Positive difference: accuracy dropped without the feature;
    # negative difference: accuracy improved without it
    difference = original_accuracy - reduced_accuracy
    feature_differences[feature] = difference
    print(f"Without '{feature}': accuracy = {reduced_accuracy:.4f}, difference = {difference:.4f}")

Without 'industry': accuracy = 0.7432, difference = 0.0000
Without 'employment_status': accuracy = 0.7466, difference = -0.0034
Without 'lead_score': accuracy = 0.7432, difference = 0.0000


In [14]:
# Feature whose removal moves accuracy the least in absolute terms; if
# several features tie, the earliest dictionary entry is the one reported
smallest_diff_feature = min(feature_differences.items(), key=lambda item: abs(item[1]))[0]
print(f"\nQ5 - Feature with smallest difference: '{smallest_diff_feature}'")


Q5 - Feature with smallest difference: 'industry'


In [15]:
# Show the full feature -> accuracy-difference mapping, unrounded
differences_report = f"Differences: {feature_differences}"
print(differences_report)

Differences: {'industry': 0.0, 'employment_status': -0.003424657534246589, 'lead_score': 0.0}


### Question 6: Regularized Logistic Regression

In [16]:
# Question 6
# Candidate values for C, listed in ascending order
C_values = [0.01, 0.1, 1, 10, 100]
best_C, best_accuracy = None, 0

for C in C_values:
    # fit() returns the estimator itself, so construction and training chain
    model_reg = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train_encoded, y_train)

    # Score the fitted model on the validation split
    y_val_pred_reg = model_reg.predict(X_val_encoded)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    accuracy_rounded_reg = round(accuracy_reg, 3)

    print(f"C = {C}: accuracy = {accuracy_reg:.5f}, rounded = {accuracy_rounded_reg}")

    # Only a strict improvement replaces the current best, so on equal
    # accuracy the earliest C in the list is kept
    if accuracy_reg <= best_accuracy:
        continue
    best_accuracy, best_C = accuracy_reg, C

C = 0.01: accuracy = 0.74315, rounded = 0.743
C = 0.1: accuracy = 0.74315, rounded = 0.743
C = 1: accuracy = 0.74315, rounded = 0.743
C = 10: accuracy = 0.74315, rounded = 0.743
C = 100: accuracy = 0.74315, rounded = 0.743


In [17]:
# Summarise the winning C and its unrounded accuracy to four decimals
best_c_report = f"Q6 - Best C: {best_C} with accuracy {best_accuracy:.4f}"
print(best_c_report)

Q6 - Best C: 0.01 with accuracy 0.7432
