In [111]:
import pandas as pd
import numpy as np

In [112]:
df=pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
len(df)

1462

In [113]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [114]:
categorical=df.dtypes[df.dtypes=='object'].index
numerical=df.dtypes[df.dtypes!='object'].index
categorical,numerical

(Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object'),
 Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
        'lead_score', 'converted'],
       dtype='object'))

In [115]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [116]:
for col in df.columns:
    if df[col].dtype == 'object':  # Categorical
        df[col]=df[col].fillna('NA')
    else:  # Numerical (int, float, etc.)
        df[col]=df[col].fillna(0)


# Question 1
# What is the most frequent observation (mode) for the column industry?

In [117]:
df['industry'].value_counts().iloc[0]

np.int64(203)

In [118]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

# Question 2
``````````
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

interaction_count and lead_score
number_of_courses_viewed and lead_score
number_of_courses_viewed and interaction_count
annual_income and interaction_count
Only consider the pairs above when answering this question.

``````````

In [119]:
# Compute pairwise correlations
print("interaction_count vs lead_score:",
      df['interaction_count'].corr(df['lead_score']))

print("number_of_courses_viewed vs lead_score:",
      df['number_of_courses_viewed'].corr(df['lead_score']))

print("number_of_courses_viewed vs interaction_count:",
      df['number_of_courses_viewed'].corr(df['interaction_count']))

print("annual_income vs interaction_count:",
      df['annual_income'].corr(df['interaction_count']))

interaction_count vs lead_score: 0.009888182496913077
number_of_courses_viewed vs lead_score: -0.00487899835468127
number_of_courses_viewed vs interaction_count: -0.023565222882888103
annual_income vs interaction_count: 0.027036472404814337


``````````

Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value y is not in your dataframe.

````````````````

In [120]:
from sklearn.model_selection import train_test_split

In [121]:
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=42)
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=42)
len(df_train),len(df_val),len(df_test)

(876, 293, 293)

In [122]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03,0
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77,1
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59,1
835,,technology,1,74956.0,employed,europe,3,0.34,1
837,organic_search,retail,3,59335.0,student,australia,1,0.98,1
...,...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33,1
401,social_media,retail,3,64969.0,employed,north_america,1,0.18,0
957,,education,3,89042.0,employed,asia,4,0.75,1
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65,0


In [123]:
y_train=df_train.converted.values
y_val=df_val.converted.values
y_test=df_test.converted.values

In [124]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [125]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
835,,technology,1,74956.0,employed,europe,3,0.34
837,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33
401,social_media,retail,3,64969.0,employed,north_america,1,0.18
957,,education,3,89042.0,employed,asia,4,0.75
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


# Question 3
````````````
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

industry
location
lead_source
employment_status



In [126]:
from sklearn.metrics import mutual_info_score

In [127]:
for col in df_full_train.columns:
    if df_full_train[col].dtype == 'object':  # Categorical
       df_full_train[col]= df_full_train[col].fillna('NA')
    else:  # Numerical (int, float, etc.)
       df_full_train[col]= df_full_train[col].fillna(0)

In [128]:
df_full_train.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [129]:
def mutual_info_churn_score(series):
    return mutual_info_score(series,df_full_train.converted)

In [130]:
Mi=df_full_train[categorical].apply(mutual_info_churn_score)
Mi=Mi.round(2)
Mi.sort_values(ascending=False)

lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

# Question 4
````
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
What accuracy did you get?

In [131]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [132]:
train_dicts= df_train[categorical].to_dict(orient='records')
val_dicts= df_val[categorical].to_dict(orient='records')


In [133]:
dv= DictVectorizer(sparse=False)

In [134]:
dv.fit(train_dicts)
X_val = dv.transform(val_dicts)

In [135]:
dv.get_feature_names_out()

array(['employment_status=NA', 'employment_status=employed',
       'employment_status=self_employed', 'employment_status=student',
       'employment_status=unemployed', 'industry=NA',
       'industry=education', 'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=other', 'industry=retail',
       'industry=technology', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america'], dtype=object)

In [136]:
X_train=dv.transform(train_dicts)
X_train.shape

(876, 27)

In [137]:
model =LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [138]:
model.predict_proba(X_train)

array([[0.53017506, 0.46982494],
       [0.25551729, 0.74448271],
       [0.58887959, 0.41112041],
       ...,
       [0.10095463, 0.89904537],
       [0.2974653 , 0.7025347 ],
       [0.40666948, 0.59333052]], shape=(876, 2))

In [139]:
y_pred=model.predict_proba(X_train)[:,1]

In [140]:

y_pred_val = model.predict_proba(X_val)[:, 1] >= 0.5
acc = (y_val == y_pred_val).mean()
round(acc, 2)
     

np.float64(0.61)

# Question 5
````
Let's find the least useful feature using the feature elimination technique.
Train a model using the same features and parameters as in Q4 (without rounding).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

'industry'
'employment_status'
'lead_score'
Note: The difference doesn't have to be positive.

In [141]:
# X_train = pd.DataFrame(X_train, columns=[f"feature_{i}" for i in range(X_train.shape[1])])
# X_val = pd.DataFrame(X_val, columns=[f"feature_{i}" for i in range(X_val.shape[1])])

In [142]:
# Train Logistic Regression
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

all_features = df_train.columns
# Function to train the model and calculate accuracy
def train_and_evaluate(train, val, y_train, y_val, features):
    train_dict = train[features].to_dict(orient='records')
    val_dict = val[features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(train_dict)
    x_val = dv.transform(val_dict)

    # Train logistic regression model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(x_train, y_train)

    # Predict on validation set
    y_pred = model.predict(x_val)

    # Calculate accuracy
    return accuracy_score(y_val, y_pred)


# Base model with all features
base_accuracy = train_and_evaluate(df_train, df_val, y_train, y_val, all_features)
print(f"Base model accuracy with all features: {base_accuracy:.4f}")

Base model accuracy with all features: 0.6997


In [143]:
# Feature Importance

# Features to evaluate
features_to_evaluate = ["industry", "employment_status", "lead_score"]
results = {}

# Iterate through each feature and exclude it
for feature in features_to_evaluate:
    features_subset = [f for f in all_features if f != feature]
    accuracy = train_and_evaluate(df_train, df_val, y_train, y_val, features_subset)
    accuracy_diff = base_accuracy - accuracy
    results[feature] = accuracy_diff
    print(f"Accuracy without {feature}: {accuracy:.4f} (Difference: {accuracy_diff:.4f})")

# Find the feature with the smallest difference
least_useful_feature = min(results, key=results.get)
print(f"\nFeature with the smallest difference in accuracy: {least_useful_feature}")



Accuracy without industry: 0.6997 (Difference: 0.0000)
Accuracy without employment_status: 0.6962 (Difference: 0.0034)
Accuracy without lead_score: 0.7065 (Difference: -0.0068)

Feature with the smallest difference in accuracy: lead_score


# Question 6
````
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?

In [144]:
C_list = [0.01, 0.1, 1, 10, 100]
acc_C = []
for C in C_list:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred_val = model.predict_proba(X_val)[:, 1] >= 0.5
    acc_C.append(round((y_val == y_pred_val).mean(), 3))

pd.DataFrame(data={'C':C_list, 'accuracy':acc_C}).sort_values(by='accuracy', ascending=False)

Unnamed: 0,C,accuracy
2,1.0,0.608
4,100.0,0.604
3,10.0,0.604
1,0.1,0.601
0,0.01,0.56
