# Data preparation

In [1]:
import  pandas as pd, numpy as np, matplotlib.pyplot as plt

In [None]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
!wget $data

In [104]:
# Load the downloaded CSV from the working directory and preview the first rows.
df = pd.read_csv('course_lead_scoring.csv')
df.head()


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [105]:
# (rows, columns) of the raw dataset.
df.shape


(1462, 9)

In [106]:
# Normalise column names to lower snake_case. The Index.str methods return a
# new Index, so the result has to be assigned back to take effect.
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [107]:
# Most frequent value of `industry`; [0] takes the first entry of the mode result.
# NOTE(review): mode_industry is not referenced again in the cells visible here.
mode_industry = df['industry'].mode()[0]
mode_industry

'retail'

In [108]:
# Column dtypes; these drive the categorical / numerical split below.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [109]:
print("Initial Missing Values:")

# Count nulls once, then keep only the columns that actually have some.
df.isnull().sum().loc[lambda null_counts: null_counts > 0]

Initial Missing Values:


lead_source          128
industry             134
annual_income        181
employment_status    100
location              63
dtype: int64

In [110]:
# Names of the columns stored with the 'object' dtype (the string features).
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [111]:
# Names of every column whose dtype is not 'object'.
numerical = [name for name, dtype in df.dtypes.items() if dtype != 'object']
numerical


['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [112]:
# Normalise categorical values to lower snake_case.
# Both steps go through the .str accessor: Series.replace(' ', '_') would only
# replace cells whose entire value is ' ', leaving embedded spaces untouched.
# Missing values pass through as NaN and are imputed later.
for col in categorical:
    df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

In [113]:
# Number of distinct non-null values per column.
df.nunique()

lead_source                    5
industry                       7
number_of_courses_viewed      10
annual_income               1267
employment_status              4
location                       7
interaction_count             12
lead_score                   101
converted                      2
dtype: int64

In [114]:
# Impute missing categorical values with the literal string 'NA', which then
# acts as its own category in the encoded features.
df[categorical] = df[categorical].fillna('NA')

In [115]:
# Impute missing numerical values with 0.0.
# Per the missing-value summary above, annual_income is the only numerical
# column with gaps. `numerical` also contains the target `converted`, which
# has no missing values and is therefore unaffected.
df[numerical] = df[numerical].fillna(0.0)

In [116]:
# Sanity check: every column should now report zero missing values.
print("\nMissing Values After Imputation:")
df.isnull().sum()


Missing Values After Imputation:


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [117]:
# Preview the frame after normalisation and imputation.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [118]:
# NOTE(review): repeats the missing-value check from the previous cells.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [119]:
# Pairwise correlations between all numeric columns.
numerical_df = df.select_dtypes(include=['number'])
correlation_matrix = numerical_df.corr()

# Candidate feature pairs to compare.
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Track the pair with the largest absolute correlation; on ties the first
# pair in the list wins because only strictly larger values replace it.
max_correlation = -1
most_correlated_pair = None

for pair in pairs_to_check:
    row_feature, col_feature = pair
    strength = abs(correlation_matrix.loc[row_feature, col_feature])
    if not (strength > max_correlation):
        continue
    max_correlation, most_correlated_pair = strength, (row_feature, col_feature)

print(f"Max absolute correlation: {max_correlation}")
print(f"Pair: {most_correlated_pair}")

Max absolute correlation: 0.02703647240481443
Pair: ('annual_income', 'interaction_count')


The split happens as follows:

Dataset is divided into 80% full_train + 20% test

full_train is then divided into train (60% of the dataset) and validation (20% of the dataset), which together make up the 80% held in full_train

train set = 75% of full_train (60% / 80%)

validation set = 25% of full_train (20% / 80%)

(75% of full_train + 25% of full_train = 100% of full_train = 80% of dataset)

In [120]:
from sklearn.model_selection import train_test_split

n = len(df)

# NOTE(review): this global seed does not influence the split below, because
# train_test_split is given an explicit random_state. It is kept only so the
# global NumPy RNG state stays the same for any later cell that uses it.
np.random.seed(42)

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole dataset) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# The three parts must partition the dataset exactly.
assert len(df_train) + len(df_val) + len(df_test) == n

len(df_train), len(df_val), len(df_test)


[   0    1    2 ... 1459 1460 1461]


(876, 293, 293)

In [121]:
# NOTE(review): checks the full frame `df` again, not the individual splits.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [122]:
# Give each split a fresh 0..n-1 index; the shuffled original index is dropped.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [123]:
# Pull the target out of each split as a NumPy array and remove the column
# from the feature frame in the same step, so it cannot leak into the features.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [132]:
df_full_train = df_full_train.reset_index(drop=True)

# Class balance of the target on full_train (train + validation): the share of
# each class, then the overall conversion rate (mean of the 0/1 target).
print(df_full_train.converted.value_counts(normalize=True))
print(df_full_train.converted.mean())




converted
1    0.611634
0    0.388366
Name: proportion, dtype: float64
0.611633875106929


In [133]:
# Distinct values per categorical column in full_train. Each count is one
# higher than in the earlier df.nunique() output because the 'NA' fill value
# now counts as a category.
df_full_train[categorical].nunique()


lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [138]:
from sklearn.metrics import mutual_info_score


def mutual_info_converted_score(series):
    """Return the mutual information between `series` and the `converted` target.

    The target is read from the notebook-level `df_full_train`, so `series`
    is expected to be a column of that same frame (row-aligned with it).
    """
    target = df_full_train['converted']
    return mutual_info_score(series, target)

In [139]:
# Mutual information of each categorical column with `converted`, computed on
# df_full_train and shown from most to least informative.
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.024562
employment_status    0.012690
industry             0.008173
location             0.001212
dtype: float64

In [148]:
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer consumes one dict per row: string values are one-hot encoded
# as 'feature=value' columns, numeric values pass through unchanged.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

# Learn the feature mapping from the TRAINING rows only, then apply that same
# mapping to both splits so their columns line up.
dv = DictVectorizer(sparse=False)  # dense output is easier to inspect
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)

print(f"Number of features after OHE: {X_train.shape[1]}")

# An equivalent encoding can be built with pd.get_dummies(df, columns=categorical)
# by reindexing the validation frame to the training frame's dummy columns.

Number of features after OHE: 31


In [149]:
# NOTE(review): another repeat of the missing-value check on the full frame `df`.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline model on all encoded features.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

# Hard class predictions on the validation split and their accuracy.
y_pred_val = model.predict(X_val)
accuracy_q4 = accuracy_score(y_val, y_pred_val)
rounded_accuracy_q4 = round(accuracy_q4, 2)

print("\n--- Question 4 Result ---")
print(f"Validation Accuracy: {accuracy_q4:.4f}")
print(f"Rounded Accuracy: {rounded_accuracy_q4}")




--- Question 4 Result ---
Validation Accuracy: 0.6997
Rounded Accuracy: 0.7


In [152]:
# Baseline for the comparison: the unrounded validation accuracy of the full model.
original_accuracy = accuracy_q4

elimination_features = ['industry', 'employment_status', 'lead_score']
accuracy_diffs = {}
full_feature_names = dv.get_feature_names_out()

for feature_to_remove in elimination_features:
    # DictVectorizer names a numerical column after the feature itself and a
    # one-hot column 'feature=value', so a single rule covers both kinds and
    # no feature needs to be special-cased.
    ohe_prefix = f'{feature_to_remove}='
    keep_mask = np.array([
        name != feature_to_remove and not name.startswith(ohe_prefix)
        for name in full_feature_names
    ])

    # A misspelled feature would otherwise retrain the full model and silently
    # report a difference of 0.
    if keep_mask.all():
        raise ValueError(f"No encoded columns found for feature {feature_to_remove!r}")

    # Boolean mask keeps the remaining columns in their original order.
    X_train_subset = X_train[:, keep_mask]
    X_val_subset = X_val[:, keep_mask]

    # Same hyperparameters as the baseline model so only the feature set differs.
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)

    y_pred_val_subset = model_subset.predict(X_val_subset)
    accuracy_subset = accuracy_score(y_val, y_pred_val_subset)

    # Positive: removing the feature hurt accuracy. Negative: it helped.
    accuracy_diffs[feature_to_remove] = original_accuracy - accuracy_subset

In [153]:
# Rank features by their signed accuracy difference and take the lowest one.
# A negative difference (accuracy improved without the feature) therefore
# ranks below a difference of exactly zero; ties keep dictionary order.
smallest_diff_feature = sorted(accuracy_diffs, key=accuracy_diffs.get)[0]

print(accuracy_diffs)
print(f"Smallest difference feature: {smallest_diff_feature}")

{'industry': 0.0, 'employment_status': -0.0034129692832765013, 'lead_score': 0.0}
Smallest difference feature: employment_status


In [154]:

# Regularisation strengths to compare (smaller C = stronger regularisation).
C_values = [0.01, 0.1, 1, 10, 100]
results_q6 = {}
best_accuracy, best_C = -1, None

for C in C_values:
    # Refit on the full feature matrix with the current C.
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Score on the validation split.
    y_pred_val = model.predict(X_val)
    accuracy_C = accuracy_score(y_val, y_pred_val)
    rounded_accuracy = round(accuracy_C, 3)
    results_q6[C] = rounded_accuracy

    print(f"C={C:5}: Accuracy={rounded_accuracy:.3f}")

    # A strictly better accuracy always wins; an exact tie only wins when it
    # comes from a smaller C.
    is_tie = accuracy_C == best_accuracy
    if accuracy_C > best_accuracy or (is_tie and C < best_C):
        best_accuracy, best_C = accuracy_C, C

print(f"\nBest C for maximum accuracy is: {best_C}")

C= 0.01: Accuracy=0.700
C=  0.1: Accuracy=0.700
C=    1: Accuracy=0.700
C=   10: Accuracy=0.700
C=  100: Accuracy=0.700

Best C for maximum accuracy is: 0.01
