In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the lead-scoring dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('course_lead_scoring.csv')
# Preview the first rows to sanity-check the columns and value formats.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
# Count missing values per column in the freshly loaded frame.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
# Show each column's dtype; object columns are handled as categorical below,
# numeric ones as numerical.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [5]:
# Partition the column labels by dtype: object columns are treated as
# categorical, every numeric dtype as numerical. Note that the numeric group
# also contains the target column `converted`.
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include='number').columns

print("Categorical: ", categorical_cols)
print("Numerical: ", numerical_cols)

Categorical:  Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')
Numerical:  Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')


In [6]:
# Impute missing values in place on `df`:
#   - categorical columns get the literal placeholder string 'NA'
#   - numerical columns get 0.0
df[categorical_cols] = df.loc[:, categorical_cols].fillna(value='NA')
df[numerical_cols] = df.loc[:, numerical_cols].fillna(value=0.0)

In [7]:
# Re-count missing values per column to confirm the fill left none behind.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [8]:
# Most frequent value of `industry`. `mode()` returns a Series (there may be
# ties), so take its first entry. The 'NA' placeholder used for missing values
# counts as a regular category here.
industry_modes = df['industry'].mode()
mode_value = industry_modes.iloc[0]
print(mode_value)

retail


In [9]:
# Pairwise Pearson correlation between all numeric columns of `df`.
numerical_df = df.select_dtypes(include='number')
corr_matrix = numerical_df.corr(method='pearson')

print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


In [10]:
# Candidate feature pairs to compare.
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Look up each pair in the correlation matrix; the sign is irrelevant for
# "strength", so keep the absolute value.
correlations = {}
for left, right in pairs:
    correlations[(left, right)] = abs(corr_matrix.loc[left, right])

# Pick the pair whose absolute correlation is largest (first one wins on ties).
max_pair = max(correlations, key=lambda pair: correlations[pair])
print(f"The two features with the biggest correlation are: {max_pair}")

The two features with the biggest correlation are: ('annual_income', 'interaction_count')


In [11]:
# The target is `converted`; the features are every other column.
y = df['converted']
X = df.drop(columns=['converted'])

# First cut: 60% of the rows for training, 40% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second cut: halve the held-out rows into validation and test, i.e. roughly
# 20% of the original data each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the resulting shapes to confirm the proportions.
print(X_train.shape, X_val.shape, X_test.shape)

(877, 8) (292, 8) (293, 8)


In [12]:
# Mutual information between each categorical feature and the target,
# computed on the training split only.
categorical_vars = ['industry', 'location', 'lead_source', 'employment_status']

# Replace every category with an integer code, one column at a time.
X_train_encoded = X_train[categorical_vars].apply(
    lambda column: LabelEncoder().fit_transform(column)
)

# Every encoded column is declared discrete to the estimator.
mi_scores = mutual_info_classif(
    X_train_encoded,
    y_train,
    discrete_features=True,
)

# Show each score rounded to two decimals.
for var, score in zip(categorical_vars, mi_scores):
    print(f"{var}: {round(score, 2)}")

industry: 0.02
location: 0.0
lead_source: 0.03
employment_status: 0.02


In [13]:
# One-hot encode the training features, then encode the validation features
# and force them onto the training column layout: dummy columns unseen in
# validation are added as 0, columns unseen in training are discarded.
X_train_enc = pd.get_dummies(X_train)
X_val_enc = pd.get_dummies(X_val).reindex(
    columns=X_train_enc.columns, fill_value=0
)

# Fit logistic regression on the encoded training data.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_enc, y_train)

# Score on the validation split and report accuracy to two decimals.
y_pred = model.predict(X_val_enc)
accuracy = accuracy_score(y_val, y_pred)
print(round(accuracy, 2))

0.74


In [14]:
# Feature elimination: for each candidate feature, retrain the model without
# it and report `baseline accuracy - accuracy without the feature` on the
# validation split. A value near zero means the feature contributes little.

# One-hot encode categorical variables; validation columns are aligned to the
# training layout (missing dummies filled with 0, unseen ones discarded).
X_train_enc = pd.get_dummies(X_train)
X_val_enc = pd.get_dummies(X_val).reindex(columns=X_train_enc.columns, fill_value=0)

# Baseline model trained on all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)
baseline_acc = accuracy_score(y_val, model.predict(X_val_enc))

# Features to test
features_to_test = ['industry', 'employment_status', 'lead_score']
diffs = {}

for feature in features_to_test:
    # Drop the raw column BEFORE encoding instead of matching encoded column
    # names by prefix: a prefix match would also remove any other feature
    # whose name merely starts with the same text, and would silently remove
    # nothing for a misspelt name. Dropping by exact label raises KeyError
    # for an unknown feature.
    X_train_drop = pd.get_dummies(X_train.drop(columns=[feature]))
    X_val_drop = pd.get_dummies(X_val.drop(columns=[feature])).reindex(
        columns=X_train_drop.columns, fill_value=0
    )

    # Use a fresh estimator per feature so that `model` keeps representing
    # the baseline fitted on all features after this loop finishes.
    reduced_model = LogisticRegression(
        solver='liblinear', C=1.0, max_iter=1000, random_state=42
    )
    reduced_model.fit(X_train_drop, y_train)
    acc = accuracy_score(y_val, reduced_model.predict(X_val_drop))
    diffs[feature] = baseline_acc - acc

# Print differences
for feature, diff in diffs.items():
    print(f"{feature}: {diff}")

industry: 0.0
employment_status: -0.003424657534246589
lead_score: 0.0


In [15]:
# Sweep the inverse regularisation strength C and record validation accuracy
# (rounded to three decimals) for each value. Uses the encoded train and
# validation frames built in the previous cells.
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train_enc, y_train)
    y_pred = model.predict(X_val_enc)
    acc = accuracy_score(y_val, y_pred)
    accuracies[C] = round(acc, 3)

# Among the C values that reach the top rounded accuracy, prefer the smallest.
top_accuracy = max(accuracies.values())
best_C = min(c for c, score in accuracies.items() if score == top_accuracy)

print(f"Accuracies: {accuracies}")
print(f"Best C: {best_C}")

Accuracies: {0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}
Best C: 0.01
