In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Note: sometimes your answer doesn't match one of the options exactly. That's fine. Select the option that's closest to your solution.
# Dataset

# In this homework, we will use the lead scoring dataset Bank Marketing dataset. Download it from here.

# Or you can do it with wget:

# wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
# In this dataset our desired target for classification task will be converted variable - has the client signed up to the platform or not.
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

# Data preparation

# Check if the missing values are presented in the features.
# If there are missing values:
# For caterogiral features, replace them with 'NA'
# For numerical features, replace with with 0.0

df.columns

#df.columns = df.columns.str.lower().str.replace(' ', '_')
# df.churn = (df.churn == 'yes'.).astype(int)

columns_with_missing_values = df.columns[df.isna().any()]
print(f"There are {len(columns_with_missing_values)} columns with missing values: {columns_with_missing_values}")

for column in columns_with_missing_values:
    if df[column].dtype == 'object': # replace missing values in categorical features with 'NA'
        #df[column] = df[column].str.lower().str.replace(' ', '_')
        df[column] = df[column].fillna('NA')
    else: # replace missing values in numerical features with 0.0
        df[column] = df[column].fillna(0.0)

There are 5 columns with missing values: Index(['lead_source', 'industry', 'annual_income', 'employment_status',
       'location'],
      dtype='object')


In [3]:
# Question 1

# What is the most frequent observation (mode) for the column industry?

# NA
# technology
# healthcare
# retail

industry_column_mode = df['industry'].mode()
print(f"The most frequent observation (mode) for the column industry: {industry_column_mode}")

The most frequent observation (mode) for the column industry: 0    retail
Name: industry, dtype: object


In [4]:
# Question 2

# Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

# What are the two features that have the biggest correlation?

# interaction_count and lead_score
# number_of_courses_viewed and lead_score
# number_of_courses_viewed and interaction_count
# annual_income and interaction_count
# Only consider the pairs above when answering this question.

#print(df_full_train.dtypes)
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']

df_numerical = df[numerical_features]
#print(f"df_numerical: {df_numerical}")

correlation_matrix = df_numerical.corr()
#print(f"correlation matrix: {correlation_matrix}")
# From correlation matrix
print(f"Correlation between interaction_count and lead_score: {correlation_matrix['interaction_count']['lead_score']}")
print(f"Correlation between number_of_courses_viewed and lead_score: {correlation_matrix['number_of_courses_viewed']['lead_score']}")
print(f"Correlation between number_of_courses_viewed and interaction_count: {correlation_matrix['number_of_courses_viewed']['interaction_count']}")
print(f"Correlation between annual_income and interaction_count: {correlation_matrix['annual_income']['interaction_count']}")

# Using corr
print(f"\nCorrelation between interaction_count and lead_score: {df_numerical['interaction_count'].corr(df_numerical['lead_score'])}")
print(f"Correlation between number_of_courses_viewed and lead_score: {df_numerical['number_of_courses_viewed'].corr(df_numerical['lead_score'])}")
print(f"Correlation between number_of_courses_viewed and interaction_count: {df_numerical['number_of_courses_viewed'].corr(df_numerical['interaction_count'])}")
print(f"Correlation between annual_income and interaction_count: {df_numerical['annual_income'].corr(df_numerical['interaction_count'])}")

print("The two features that have the biggest correlation from the given pairs are annual_income and interaction_count")


# Split the data

# Split your data in train/val/test sets with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
# Make sure that the target value y is not in your dataframe.

df_full_train, df_test = train_test_split(df, test_size=0.20, random_state=42)
df_train, df_valid = train_test_split(df_full_train, test_size=0.25, random_state=42)
print(f"\nlen(df_full_train): {len(df_full_train)}, len(df_train): {len(df_train)}, len(df_valid): {len(df_valid)}, len(df_test): {len(df_test)}")

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['converted'].values
y_valid = df_valid['converted'].values
y_test = df_test['converted'].values

del df_train['converted']
del df_valid['converted']
del df_test['converted']

#print(f"df_full_train missing values: {df_full_train.isnull().sum()}")
print(f"\ndf_full_train[categorical].nunique(): {df_full_train[categorical_features].nunique()}")

print(f"\nconverted value counts: {df_full_train.converted.value_counts()}")
print(f"converted value counts normalized: {df_full_train.converted.value_counts(normalize=True)}")

#df_full_train['converted'].mean()
global_converted_rate = df_full_train.converted.mean()
print(f"Global converted mean, aka global converted rate: {global_converted_rate}")

# Looking at lead score means by different groups
#df_full_train[df_full_train['industry'] == 'retail'].lead_score
print(f"\nconverted mean for 'retail' industry: {df_full_train[df_full_train.industry == 'retail'].converted.mean()}")
print(f"converted mean for 'education' industry: {df_full_train[df_full_train.industry == 'education'].converted.mean()}")
print(f"converted mean for 'healthcare' industry: {df_full_train[df_full_train.industry == 'healthcare'].converted.mean()}")

print(f"\nconverted mean for 'australia' location: {df_full_train[df_full_train.location == 'australia'].converted.mean()}")
print(f"converted mean for 'north_america' location: {df_full_train[df_full_train.location == 'north_america'].converted.mean()}")
print(f"converted mean for 'europe' location: {df_full_train[df_full_train.location == 'europe'].converted.mean()}")

Correlation between interaction_count and lead_score: 0.009888182496913131
Correlation between number_of_courses_viewed and lead_score: -0.004878998354681276
Correlation between number_of_courses_viewed and interaction_count: -0.023565222882888037
Correlation between annual_income and interaction_count: 0.02703647240481443

Correlation between interaction_count and lead_score: 0.009888182496913084
Correlation between number_of_courses_viewed and lead_score: -0.004878998354681257
Correlation between number_of_courses_viewed and interaction_count: -0.023565222882888117
Correlation between annual_income and interaction_count: 0.02703647240481436
The two features that have the biggest correlation from the given pairs are annual_income and interaction_count

len(df_full_train): 1169, len(df_train): 876, len(df_valid): 293, len(df_test): 293

df_full_train[categorical].nunique(): lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

convert

In [5]:
# Question 3

# Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the biggest mutual information score?

# industry
# location
# lead_source
# employment_status

from sklearn.metrics import mutual_info_score

def mutual_info_converted_score(series):
    return mutual_info_score(y_train, series)

#df_train[categorical_features].apply(mutual_info_converted_score)

# Only train set
print(f"\nThe mutual information score between y & industry: {mutual_info_score(y_train, df_train.industry)}")
print(f"The mutual information score between y & location: {mutual_info_score(y_train, df_train.location)}")
print(f"The mutual information score between y & lead_source: {mutual_info_score(y_train, df_train.lead_source)}")
print(f"The mutual information score between y & employment_status: {mutual_info_score(y_train, df_train.employment_status)}")

# all_mutual_info = df_train[categorical_features].apply(mutual_info_converted_score)
# print(f"Mutual information score from all categorical variables listed: {all_mutual_info.sort_values(ascending=False)}")

print(f"\nThe variable with the biggest mutual information score is lead_source")


The mutual information score between y & industry: 0.011574521435657112
The mutual information score between y & location: 0.004464157884038034
The mutual information score between y & lead_source: 0.03539624379726594
The mutual information score between y & employment_status: 0.012937677269442782

The variable with the biggest mutual information score is lead_source


In [6]:
# Question 4

# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# What accuracy did you get?

# 0.64
# 0.74
# 0.84
# 0.94
# Question

from sklearn.feature_extraction import DictVectorizer

train_dicts = df_train[categorical_features + numerical_features].to_dict(orient='records')
#print(f"dicts: {dicts}")

dv = DictVectorizer(sparse=False)
# dv.fit(train_dicts)
# dv.get_feature_names_out() # deprecated dv.get_feature_names()
# list(dv.transform(train_dicts)[0])

X_train = dv.fit_transform(train_dicts)
X_train.shape
print(f"X_train.shape: {X_train.shape}")


valid_dicts = df_valid[categorical_features + numerical_features].to_dict(orient='records')
X_valid = dv.transform(valid_dicts)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

z = np.linspace(-5, 5, 51)
sigmoid(z)
#plt.plot(z, sigmoid(z))

def linear_regression(xi):
    result = w0
    
    for j in range(len(w)):
        result = result + xi[j] * w[j]

    return result

def logistic_regression(xi):
    score = w0
    
    for j in range(len(w)):
        score = score + xi[j] * w[j]

    result = sigmoid(score)
    return result

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
#model.coef_.round(3)
model.intercept_[0]

#model.predict_proba(X_train)
y_pred = model.predict_proba(X_valid)[:, 1]
converted_decision = (y_pred >= 0.5)

y_valid
converted_decision.astype(int)

#round((y_valid == converted_decision).mean(), 2)
original_accuracy = (y_valid == converted_decision).mean()
print(f"acurracy rounded to 2 decimal digits: {round(original_accuracy, 2)}")
print(f"acurracy without rounding: {(original_accuracy)}")

X_train.shape: (876, 31)
acurracy rounded to 2 decimal digits: 0.7
acurracy without rounding: 0.6996587030716723


In [7]:
# Question 5

# Let's find the least useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
# Which of following feature has the smallest difference?

# 'industry'
# 'employment_status'
# 'lead_score'
# Note: The difference doesn't have to be positive.

exclude_dict = {'industry': "categorical", 'employment_status': "categorical", 'lead_score': "numerical"}
accuracy_scores = np.array([])

import copy
for column_key in exclude_dict:
    print(f"\nexcluded column: {column_key}")
    # Setting up train/valid/test split data sets
    df_full_train_exclude, df_test_exclude = train_test_split(df, test_size=0.20, random_state=42)
    df_train_exclude, df_valid_exclude = train_test_split(df_full_train_exclude, test_size=0.25, random_state=42)
    
    df_train_exclude = df_train_exclude.reset_index(drop=True)
    df_valid_exclude = df_valid_exclude.reset_index(drop=True)
    df_test_exclude = df_test_exclude.reset_index(drop=True)

    y_train_exclude = df_train_exclude['converted'].values
    y_valid_exclude = df_valid_exclude['converted'].values
    y_test_exclude = df_test_exclude['converted'].values

    del df_train_exclude['converted']
    del df_valid_exclude['converted']
    del df_test_exclude['converted']

    train_dicts_exclude = []
    valid_dicts_exclude = []
    
    features_list_copy = []
    if exclude_dict[column_key] == "categorical": # duplicate the categorical columns list in order to delete this current column later
        features_list_copy = copy.deepcopy(categorical_features)
        features_list_copy.remove(column_key) # column deletion
        train_dicts_exclude = df_train_exclude[features_list_copy + numerical_features].to_dict(orient='records')
        valid_dicts_exclude = df_valid_exclude[features_list_copy + numerical_features].to_dict(orient='records')
    else: # duplicate the numerical columns list in order to delete this current column later
        features_list_copy = copy.deepcopy(numerical_features)
        features_list_copy.remove(column_key) # column deletion
        train_dicts_exclude = df_train_exclude[categorical_features + features_list_copy].to_dict(orient='records')
        valid_dicts_exclude = df_valid_exclude[categorical_features + features_list_copy].to_dict(orient='records')

    #print(f"train_dicts_exclude: {train_dicts_exclude}")
    
    dv_exclude = DictVectorizer(sparse=False)
    
    X_train_exclude = dv_exclude.fit_transform(train_dicts_exclude)
    print(f"X_train.shape: {X_train.shape}")
    
    X_valid_exclude = dv_exclude.transform(valid_dicts_exclude)
    print(f"X_valid_shape: {X_valid.shape}")

    model_exclude = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_exclude.fit(X_train_exclude, y_train_exclude)

    y_pred_exclude = model_exclude.predict_proba(X_valid_exclude)[:, 1]
    converted_decision_exclude = (y_pred_exclude >= 0.5)
    accuracy_exclude = (y_valid_exclude == converted_decision_exclude).mean()
    print(f"accuracy_exclude: {accuracy_exclude}")
    accuracy_scores = np.append(accuracy_scores, accuracy_exclude)

print(f"\naccuracy_scores: {accuracy_scores}")
print(f"For 'industry' feature, the difference between the original accuracy and the accuracy without the feature: {abs(accuracy_scores[0] - original_accuracy)}")
print(f"For 'employment_status' feature, the difference between the original accuracy and the accuracy without the feature: {abs(accuracy_scores[1] - original_accuracy)}")
print(f"For 'lead_score' feature, the difference between the original accuracy and the accuracy without the feature: {abs(accuracy_scores[2] - original_accuracy)}")
print(f"Hence, 'industry' is the feature with the smallest difference!")
    
# EXCLUDE industry


# df_full_train_exclude_industry, df_test_exclude_industry = train_test_split(df, test_size=0.20, random_state=42)
# df_train_exclude_industry, df_valid_exclude_industry = train_test_split(df_full_train_exclude_industry, test_size=0.25, random_state=42)

# df_train_exclude_industry = df_train_exclude_industry.reset_index(drop=True)
# df_valid_exclude_industry = df_valid_exclude_industry.reset_index(drop=True)
# df_test_exclude_industry = df_test_exclude_industry.reset_index(drop=True)

# del df_train_exclude_industry['


# EXCLUDE employment_status


# EXCLUDE lead_score



excluded column: industry
X_train.shape: (876, 31)
X_valid_shape: (293, 31)
accuracy_exclude: 0.6996587030716723

excluded column: employment_status
X_train.shape: (876, 31)
X_valid_shape: (293, 31)
accuracy_exclude: 0.6962457337883959

excluded column: lead_score
X_train.shape: (876, 31)
X_valid_shape: (293, 31)
accuracy_exclude: 0.7064846416382252

accuracy_scores: [0.6996587  0.69624573 0.70648464]
For 'industry' feature, the difference between the original accuracy and the accuracy without the feature: 0.0
For 'employment_status' feature, the difference between the original accuracy and the accuracy without the feature: 0.0034129692832763903
For 'lead_score' feature, the difference between the original accuracy and the accuracy without the feature: 0.0068259385665528916
Hence, 'industry' is the feature with the smallest difference!


In [10]:
# Question 6

# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
# Which of these C leads to the best accuracy on the validation set?

# 0.01
# 0.1
# 1
# 10
# 100
# Note: If there are multiple options, select the smallest C.

model_with_001_C = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model_with_001_C.fit(X_train, y_train)

y_pred_for_001_C = model_with_001_C.predict_proba(X_valid)[:, 1]
converted_decision_for_001_C = (y_pred_for_001_C >= 0.5)
accuracy_for_001_C = (y_valid == converted_decision_for_001_C).mean()

print(f"Rounded to 3 decimal digits, accuracy for C value 0.01: {accuracy_for_001_C}")

model_with_01_C = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model_with_01_C.fit(X_train, y_train)

y_pred_for_01_C = model_with_01_C.predict_proba(X_valid)[:, 1]
converted_decision_for_01_C = (y_pred_for_01_C >= 0.5)
accuracy_for_01_C = (y_valid == converted_decision_for_01_C).mean()

print(f"Rounded to 3 decimal digits, accuracy for C value 0.1: {accuracy_for_01_C}")

model_with_1_C = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model_with_1_C.fit(X_train, y_train)

y_pred_for_1_C = model_with_1_C.predict_proba(X_valid)[:, 1]
converted_decision_for_1_C = (y_pred_for_1_C >= 0.5)
accuracy_for_1_C = (y_valid == converted_decision_for_1_C).mean()

print(f"Rounded to 3 decimal digits, accuracy for C value 1: {accuracy_for_1_C}")

model_with_10_C = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model_with_10_C.fit(X_train, y_train)

y_pred_for_10_C = model_with_10_C.predict_proba(X_valid)[:, 1]
converted_decision_for_10_C = (y_pred_for_10_C >= 0.5)
accuracy_for_10_C = (y_valid == converted_decision_for_10_C).mean()

print(f"Rounded to 3 decimal digits, accuracy for C value 10: {accuracy_for_10_C}")

model_with_100_C = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model_with_100_C.fit(X_train, y_train)

y_pred_for_100_C = model_with_100_C.predict_proba(X_valid)[:, 1]
converted_decision_for_100_C = (y_pred_for_100_C >= 0.5)
accuracy_for_100_C = (y_valid == converted_decision_for_100_C).mean()

print(f"Rounded to 3 decimal digits, accuracy for C value 10: {accuracy_for_100_C}")

print(f"Since all values of C in [0.01, 0.1, 1, 10, 100] yield the same accuracy, the smallest C is 0.01")

Rounded to 3 decimal digits, accuracy for C value 0.01: 0.6996587030716723
Rounded to 3 decimal digits, accuracy for C value 0.1: 0.6996587030716723
Rounded to 3 decimal digits, accuracy for C value 1: 0.6996587030716723
Rounded to 3 decimal digits, accuracy for C value 10: 0.6996587030716723
Rounded to 3 decimal digits, accuracy for C value 10: 0.6996587030716723
Since all values of C in [0.01, 0.1, 1, 10, 100] yield the same accuracy, the smallest C is 0.01
