In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif

# Load dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

# Handle missing values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('NA')
    else:
        df[col] = df[col].fillna(0.0)

In [2]:
mode_industry = df['industry'].mode()[0]
print(f"Mode of 'industry': {mode_industry}")

Mode of 'industry': retail


In [3]:
# Calculate correlation matrix for numerical features
numerical_features = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']
corr_matrix = df[numerical_features].corr()

# Check specified pairs
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

max_corr = -1
max_pair = None

for feat1, feat2 in pairs:
    corr = corr_matrix.loc[feat1, feat2]
    if abs(corr) > max_corr:
        max_corr = abs(corr)
        max_pair = (feat1, feat2)

print(f"Highest correlation pair: {max_pair} with correlation {max_corr:.2f}")

Highest correlation pair: ('annual_income', 'interaction_count') with correlation 0.03


In [4]:
# Split data
X = df.drop('converted', axis=1)
y = df['converted']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [5]:
# Identify categorical features
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']

# Calculate mutual information
mi_scores = {}
for feature in categorical_features:
    # Prepare data
    train_df = pd.concat([X_train[feature], y_train], axis=1)
    
    # One-hot encode
    dv = DictVectorizer(sparse=False)
    train_dict = train_df[[feature]].to_dict(orient='records')
    X_encoded = dv.fit_transform(train_dict)
    
    # Calculate MI
    mi = mutual_info_classif(X_encoded, y_train)[0]
    mi_scores[feature] = round(mi, 2)

# Find feature with highest MI
highest_mi_feature = max(mi_scores, key=mi_scores.get)
print(f"Feature with highest MI: {highest_mi_feature} (MI = {mi_scores[highest_mi_feature]})")

Feature with highest MI: location (MI = 0.03)


In [6]:
# Prepare data
train_dict = X_train.to_dict(orient='records')
val_dict = X_val.to_dict(orient='records')

# One-hot encode
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dict)
X_val_encoded = dv.transform(val_dict)

# Train model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Predict and calculate accuracy
y_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {accuracy:.2f}")

Validation accuracy: 0.74


In [7]:
# Features to test
features_to_test = ['industry', 'employment_status', 'lead_score']
original_accuracy = accuracy

# Calculate accuracy difference for each feature
accuracy_diff = {}
for feature in features_to_test:
    # Create feature set without current feature
    X_train_reduced = X_train.drop(feature, axis=1)
    X_val_reduced = X_val.drop(feature, axis=1)
    
    # Prepare data
    train_dict = X_train_reduced.to_dict(orient='records')
    val_dict = X_val_reduced.to_dict(orient='records')
    
    # One-hot encode
    dv = DictVectorizer(sparse=False)
    X_train_encoded = dv.fit_transform(train_dict)
    X_val_encoded = dv.transform(val_dict)
    
    # Train model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    
    # Calculate accuracy
    y_pred = model.predict(X_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    accuracy_diff[feature] = original_accuracy - acc

# Find feature with smallest difference
least_useful = min(accuracy_diff, key=lambda x: abs(accuracy_diff[x]))
print(f"Least useful feature: {least_useful} (diff = {accuracy_diff[least_useful]:.4f})")

Least useful feature: industry (diff = 0.0000)


In [8]:
# Test different C values
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

for C in C_values:
    # Train model
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    
    # Calculate accuracy
    y_pred = model.predict(X_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_C = C

print(f"Best C: {best_C} with accuracy: {best_accuracy:.3f}")

Best C: 1 with accuracy: 0.743
