In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the raw leads export.
lead = pd.read_csv('leads.csv')
# Transpose only the first rows: every column stays visible without rendering
# one output column per record of the whole file.
lead.head().T

## Exploratory Data Analysis (EDA)

In [None]:
# Column dtypes as inferred by read_csv.
lead.dtypes

In [None]:
# Normalise column names to lower snake_case.
lead.columns = lead.columns.str.lower().str.replace(' ', '_')

# Columns stored with the generic object dtype (the text columns).
categorical_columns = list(lead.dtypes[lead.dtypes == 'object'].index)

for c in categorical_columns:
    # Normalise the values the same way as the column names.
    lead[c] = lead[c].str.lower().str.replace(' ', '_')
    # Mark missing values with an explicit 'unk' category. fillna is used
    # instead of replace(np.NaN, ...) because the np.NaN alias no longer
    # exists in NumPy 2.0.
    lead[c] = lead[c].fillna('unk')
    

In [None]:
# Summary statistics from describe(), rounded to whole numbers.
lead.describe().round()

In [None]:
# Disabled experiment, kept commented out so it never runs:
#for c in ['asymmetrique_profile_score', 'asymmetrique_activity_score']:
    #lead[c] = lead[c].replace(to_replace="16", value=np.nan)

In [None]:
# Number of missing values per column.
lead.isna().sum()

In [None]:
# Display the frame after the clean-up steps above.
lead

In [None]:
# Numeric copy of the time-on-site column; unparseable entries become NaN.
tts = pd.to_numeric(lead['total_time_spent_on_website'], errors='coerce')

In [None]:
# Store the numeric version back on the frame (unparseable entries become NaN).
lead['total_time_spent_on_website'] = pd.to_numeric(
    lead['total_time_spent_on_website'], errors='coerce'
)

In [None]:
# True wherever the numeric conversion produced a missing value.
tts.isna()

All values shown are False, which indicates there is not a single row with NaN. We can always double-check in the following cell.

In [None]:
# Rows whose time-on-site value could not be parsed as a number.
lead.loc[tts.isnull(), ['prospect_id', 'total_time_spent_on_website']]

In [None]:
# Peek at the raw target column.
lead['converted'].head()

In [None]:
# Encode the target as 0/1 integers: 1 where the lead converted, 0 otherwise.
lead['converted'] = lead['converted'].eq(1).astype(int)
lead['converted']

In [None]:
# Fixed fill-in value for missing entries in each numeric column.
fill_values = {
    'asymmetrique_profile_score': 16,
    'asymmetrique_activity_score': 16,
    'page_views_per_visit': 3,
    'totalvisits': 3,
}
for c, fill in fill_values.items():
    lead[c] = lead[c].fillna(fill)

## SETTING UP VALIDATION FRAMEWORK

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state makes the split repeatable.
df_full_train, df_test = train_test_split(lead, test_size=0.2, random_state=1)

`train_test_split` above splits the dataset into only two parts, so we need to split `df_full_train` again to obtain separate training and validation sets.

In [None]:
# 25% of the remaining 80% -> validation is 20% of all rows, training is 60%.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [None]:
# Row counts of the train / validation / test splits.
tuple(len(part) for part in (df_train, df_val, df_test))

In [None]:
# Renumber the rows of each split from 0. The result has to be assigned back
# to the DataFrame itself; assigning it to y_train / y_val / y_test leaves the
# splits with their shuffled index and is overwritten by the target extraction.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Target vectors as NumPy arrays, one per split.
y_train, y_val, y_test = (
    part['converted'].values for part in (df_train, df_val, df_test)
)

In [None]:
# Remove the target column from the feature frames so it cannot be used as a feature.
for part in (df_train, df_val, df_test):
    del part['converted']

In [None]:
# Discard the index inherited from `lead` and renumber the rows from 0.
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
# Share of each target value (0 / 1) in the full training set.
df_full_train['converted'].value_counts(normalize=True)

In [None]:
# Overall conversion rate: the mean of the 0/1 target.
global_converted_rate = df_full_train['converted'].mean()
round(global_converted_rate, 3)

In [None]:
# Columns that are still stored with the object dtype.
lead.dtypes.loc[lambda kinds: kinds == "object"]

In [None]:
# Numeric feature columns fed to the model.
numerical = [
    'totalvisits',
    'total_time_spent_on_website',
    'page_views_per_visit',
    'asymmetrique_activity_score',
    'asymmetrique_profile_score',
]

# Categorical feature columns fed to the model.
categorical = [
    'lead_origin',
    'lead_source',
    'do_not_email',
    'do_not_call',
    'last_activity',
    'country',
    'specialization',
    'how_did_you_hear_about_x_education',
    'what_is_your_current_occupation',
    'what_matters_most_to_you_in_choosing_a_course',
    'search',
    'magazine',
    'newspaper_article',
    'x_education_forums',
    'newspaper',
    'digital_advertisement',
    'through_recommendations',
    'receive_more_updates_about_our_courses',
    'tags',
    'lead_quality',
    'update_me_on_supply_chain_content',
    'get_updates_on_dm_content',
    'lead_profile',
    'city',
    'asymmetrique_activity_index',
    'asymmetrique_profile_index',
    'i_agree_to_pay_the_amount_through_cheque',
    'a_free_copy_of_mastering_the_interview',
    'last_notable_activity',
]


In [None]:
# Number of distinct values in each categorical column.
df_full_train.loc[:, categorical].nunique()

## Feature engineering

Looking at the conversion rate within each group.

In [None]:
# Display the full training frame (target column still included).
df_full_train

In [None]:
# Conversion rate among leads with a "high" profile index.
high_mean = df_full_train.loc[
    df_full_train['asymmetrique_profile_index'] == '01.high', 'converted'
].mean()
print('asymmetrique_profile_index == 01.high:', round(high_mean, 3))

# Conversion rate among leads with a "medium" profile index.
medium_mean = df_full_train.loc[
    df_full_train['asymmetrique_profile_index'] == '02.medium', 'converted'
].mean()
print('asymmetrique_profile_index == 02.medium:  ', round(medium_mean, 3))

In [None]:
# Conversion rate among leads that opted out of e-mail.
no_email_access_mean = df_full_train.loc[
    df_full_train['do_not_email'] == 'yes', 'converted'
].mean()
print('do_not_email == yes:', round(no_email_access_mean, 3))

# Conversion rate among leads that can be e-mailed.
email_access_mean = df_full_train.loc[
    df_full_train['do_not_email'] == 'no', 'converted'
].mean()
print('do_not_email == no:  ', round(email_access_mean, 3))

Difference in conversion rate = global mean - group mean

In [None]:
# Positive result: the do_not_email == 'yes' group converts less often than average.
global_converted_rate - no_email_access_mean

Risk Ratio 

Risk ratio = group mean / global mean. It conveys the same information as the difference above, expressed as a ratio instead of a subtraction.

In [None]:
 # Risk ratio for do_not_email == 'yes': below 1 means the group converts less than average.
 no_email_access_mean / global_converted_rate

In [None]:
 # Risk ratio for do_not_email == 'no': above 1 means the group converts more than average.
 email_access_mean / global_converted_rate

In [None]:
# Per-group conversion rate, plus its difference from and ratio to the global rate.
df_group = (
    df_full_train
    .groupby(by='asymmetrique_profile_index')
    .converted
    .agg(['mean'])
    .assign(
        diff=lambda g: g['mean'] - global_converted_rate,
        risk=lambda g: g['mean'] / global_converted_rate,
    )
)
df_group

In [None]:
# One conversion-rate table (mean / diff / risk) per categorical feature.
for col in categorical:
    print(col)
    df_group = (
        df_full_train
        .groupby(by=col)
        .converted
        .agg(['mean'])
        .assign(
            diff=lambda g: g['mean'] - global_converted_rate,
            risk=lambda g: g['mean'] / global_converted_rate,
        )
    )
    display(df_group)
    # Two blank lines between consecutive tables.
    print('\n')

# Mutual Information
Mutual information measures how much a feature tells us about the target variable, so it can be used to rank features by importance.

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
# Mutual information between the target and do_not_call.
mutual_info_score(df_full_train['converted'], df_full_train['do_not_call'])

In [None]:
# Mutual information between the target and asymmetrique_profile_index.
mutual_info_score(
    df_full_train['converted'], df_full_train['asymmetrique_profile_index']
)

In [None]:
# Mutual information between the target and do_not_email.
mutual_info_score(df_full_train['converted'], df_full_train['do_not_email'])

In [None]:
def mutual_info_converted_score(series):
    """Return the mutual information between `series` and the `converted` target.

    The target is read from the notebook-level `df_full_train` frame, so
    `series` is expected to be a column of that same frame.
    """
    target = df_full_train.converted
    return mutual_info_score(series, target)

In [None]:
# Mutual information of every categorical feature with the target,
# listed from least to most informative.
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values()

# feature correlation

In [None]:
# Largest totalvisits value in the full training set.
df_full_train.totalvisits.max()

In [None]:
# Correlation of each numerical feature with the target.
df_full_train[numerical].corrwith(df_full_train.converted)

very high correlation

# One-Hot encoding

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Dense one-hot encoder; its feature set is learned from the training split only.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv.fit(train_dict)
X_train = dv.transform(train_dict)

# The validation split is encoded with the already-fitted vectorizer.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input score(s).

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray shaped like `z`,
    with values in [0, 1].
    """
    z = np.asarray(z)
    # exp() of a non-positive number cannot overflow, so large negative
    # inputs no longer trigger a RuntimeWarning.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    result = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # [()] turns a 0-d array back into a scalar and leaves n-d arrays as arrays.
    return result[()]

In [None]:
# 51 evenly spaced points on [-7, 7] at which to evaluate the sigmoid.
z = np.linspace(-7, 7, 51)

In [None]:
# For a very large input exp(-z) is effectively 0, so the result is 1.0.
sigmoid(100000)

In [None]:
# Explicit figure/axes with a title and labels so the plot stands on its own;
# the trailing semicolon suppresses the textual repr of the last expression.
fig, ax = plt.subplots()
ax.plot(z, sigmoid(z))
ax.set(title='Sigmoid function', xlabel='z', ylabel='sigmoid(z)');

# Logistic Regression
* Binary Classification
* Linear vs Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on the one-hot encoded training split.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Bias (intercept) term of the fitted model.
model.intercept_[0]

In [None]:
# Learned weights, one per encoded feature, rounded to 3 decimals.
model.coef_[0].round(3)

In [None]:
# Second column of predict_proba: predicted probability of converted == 1.
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
# Hard decision: predict "converted" when the probability is at least 0.5.
lead_decision = (y_pred >= 0.5)
lead_decision

In [None]:
# Actual validation targets, for comparison with the decisions above.
y_val

In [None]:
# Validation accuracy: share of decisions that match the actual target.
(lead_decision == y_val).mean()

In [None]:
# Side-by-side table of probability, hard prediction and actual outcome.
lead_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': lead_decision.astype(int),
    'actual': y_val,
})

In [None]:
# Flag the rows where the prediction matches the actual outcome.
lead_pred['correct'] = lead_pred['prediction'].eq(lead_pred['actual'])

In [None]:
# Mean of the boolean "correct" column, i.e. the accuracy.
lead_pred.correct.mean()

In [None]:
# The boolean decisions as 0/1 integers.
lead_decision.astype(int)

# Model Interpretation

In [None]:
# Map each encoded feature name to its learned weight.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps the dict keys as plain Python strings.
dict(zip(dv.get_feature_names_out().tolist(), model.coef_[0].round(3)))

In [None]:
# Three numeric features used for a smaller model.
small = ['asymmetrique_activity_score', 'total_time_spent_on_website', 'asymmetrique_profile_score']

In [None]:
# First ten training rows of the small feature set, as record dicts.
df_train[small].iloc[:10].to_dict(orient='records')

In [None]:
# Record dicts of the small feature set for the training and validation splits.
dicts_train_small, dicts_val_small = (
    part[small].to_dict(orient='records') for part in (df_train, df_val)
)

In [None]:
# Separate vectorizer for the small feature set, fitted on the training split.
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [None]:
# Feature names learned by the small vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
dv_small.get_feature_names_out()

In [None]:
# Encode the small training feature set with the fitted vectorizer.
X_train_small = dv_small.transform(dicts_train_small)

Let's train the model.

In [None]:
# Logistic regression with default settings on the three-feature training matrix.
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

In [None]:
# Record dicts for the combined train + validation data (df_full_train).
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [None]:
# NOTE: this rebinds `dv`, replacing the vectorizer that was fitted on df_train.
# From here on `dv` produces the feature layout of df_full_train, which is the
# layout `model_train` is fitted on -- not the one `model` was fitted on.
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [None]:
# Target vector for the full training set.
y_full_train = df_full_train.converted.values

In [None]:
# Final model: same settings as `model`, fitted on the full training set.
model_train = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model_train.fit(X_full_train, y_full_train)

In [None]:
# Record dicts for the held-out test split.
dicts_test = df_test[categorical + numerical].to_dict(orient='records')

In [None]:
# Encode the test split with `dv`, which at this point is fitted on df_full_train.
X_test = dv.transform(dicts_test)

In [None]:
# Bias term of the final model. Inspect model_train here: `model` is the
# earlier train-only fit whose intercept was already shown above.
model_train.intercept_[0]

In [None]:
# Weights of the final model (model_train), rounded to 3 decimals; the weights
# of the earlier train-only `model` were already shown above.
model_train.coef_[0].round(3)

Checking that the number of features is the same in the training and test data.

In [None]:
# Column count of the matrix `model` was fitted on.
print("Number of features in training data:", X_train.shape[1])

In [None]:
# Column count produced by the vectorizer that was refitted on df_full_train.
print("Number of features in test data:", X_test.shape[1])

In [None]:
# Keeps only the first 215 columns of X_test.
# NOTE(review): 215 is hard-coded, presumably X_train.shape[1] -- confirm.
# NOTE(review): X_test comes from the vectorizer refitted on df_full_train, so
# its first 215 columns are not guaranteed to be the same features, in the same
# order, as the columns `model` was fitted on -- verify before trusting results.
X_test_subset = X_test[:, :215] 

Now we can predict on the test data.

In [None]:
# Score the test set with model_train: it was fitted on X_full_train, which was
# produced by the same vectorizer as X_test, so the columns line up one-to-one.
# Feeding a truncated X_test to `model` would pair its weights with the wrong
# columns, because `model` was fitted with the earlier, train-only vectorizer.
y_pred = model_train.predict_proba(X_test)[:, 1]

In [None]:
# Hard decision on the test predictions, using the same 0.5 threshold as on validation.
lead_decision = (y_pred >= 0.5)

In [None]:
# Test accuracy: share of decisions that match the actual target, computed the
# same way as the validation accuracy instead of dumping the boolean array.
(lead_decision == y_test).mean()

In [None]:
# Feature dict of the last lead in the test split, used as a single-record example.
customer = dicts_test[-1]
customer

In [None]:
# Encode the single lead with the current vectorizer and score it with
# model_train, the model fitted on that vectorizer's feature layout. No column
# truncation is needed because the encoder and the model match.
X_small = dv.transform([customer])
model_train.predict_proba(X_small)[0, 1]

In [None]:
# Actual outcome for that same lead (last row of the test split).
y_test[-1]

In [None]:
# Hard decisions computed for the test split.
lead_decision

In [None]:
# Validation targets. NOTE(review): the cell above shows test decisions, so
# y_test may have been intended here -- confirm.
y_val

In [None]:
pip install streamlit