In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display
from matplotlib.pyplot import xticks

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
df = pd.read_csv('/kaggle/input/leads-dataset/Leads.csv')

In [None]:
df.head().T

In [None]:
df.columns = df.columns.str.lower().str.replace(' ','_')

In [None]:
# Normalise every text (object-dtype) column: lower-case the values and
# use underscores instead of spaces so categories compare consistently.
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for column in string_columns:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [None]:
df.head().T

In [None]:
# Additional data formatting: lower-case the two index columns and swap the
# dot in their values for an underscore.
for column in ['asymmetrique_activity_index', 'asymmetrique_profile_index']:
    # regex=False makes '.' a literal dot. The default for `regex` differs
    # between pandas versions, and as a regex '.' would match every character.
    df[column] = df[column].str.lower().str.replace('.', '_', regex=False)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Select is a bad value, so we will replace it with NaN
df = df.replace('select', np.nan)

In [None]:
df.isnull().sum()

In [None]:
# Drop every column in which more than 70% of the values are missing.
# `columns=` is used because the positional axis argument (`drop(labels, 1)`)
# is no longer accepted by pandas 2.x.
missing_pct = (100 * (df.isnull().sum() / len(df.index))).round(2)
df = df.drop(columns=missing_pct[missing_pct > 70].index)

In [None]:
df.shape

In [None]:
df['lead_quality'].describe()

In [None]:
lead_quality = df['lead_quality']
# Pass the data by keyword: recent seaborn releases deprecate/reject a
# positional data vector for countplot.
sns.countplot(x=lead_quality)

In [None]:
df['lead_quality'] = df['lead_quality'].replace(np.nan, 'not_sure')
df['lead_quality'].describe()

In [None]:
# Distribution of lead_quality after missing values were labelled 'not_sure'.
# The data is passed by keyword because recent seaborn releases deprecate/reject
# a positional data vector for countplot.
sns.countplot(x=df['lead_quality'])

In [None]:
# Missing values in these numeric columns are treated as zero.
zero_fill_columns = [
    'totalvisits',
    'page_views_per_visit',
    'asymmetrique_activity_score',
    'asymmetrique_profile_score',
]
df[zero_fill_columns] = df[zero_fill_columns].fillna(0.0)

In [None]:
# Remove the four asymmetrique index/score columns from the frame.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
df = df.drop(columns=[
    'asymmetrique_activity_index',
    'asymmetrique_activity_score',
    'asymmetrique_profile_index',
    'asymmetrique_profile_score',
])

In [None]:
for col in (df.dtypes[df.dtypes == 'object'].index):
    description = df[col].describe()
    display(description)

In [None]:
# Drop these columns from the feature set.
# NOTE(review): presumably chosen from the describe() output above because they
# are almost constant - confirm.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
df = df.drop(columns=[
    'x_education_forums',
    'a_free_copy_of_mastering_the_interview',
    'through_recommendations',
    'search',
    'newspaper_article',
    'digital_advertisement',
])

In [None]:
# Label every remaining missing value explicitly. fillna states the intent
# directly; the previous replace(..., regex=True) passed a regex flag that has
# no meaning for a NaN pattern.
df = df.fillna('not_answered')

In [None]:
sns.countplot(x = "last_notable_activity", hue = "converted", data = df)
xticks(rotation = 90)

In [None]:
df.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=15)

In [None]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

In [None]:
y_train = df_train.converted.values

In [None]:
y_val = df_val.converted.values

In [None]:
del df_train['converted']
del df_val['converted']

In [None]:
# Ensure the training set has no missing values
df_train_full.isnull().sum()

In [None]:
# 1 => Number of converted leads 
df_train_full.converted.value_counts()

In [None]:
global_mean = df_train_full.converted.mean()
f'Conversion Rate => {global_mean}'

In [None]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)
categorical.remove('prospect_id')
categorical

In [None]:
numerical = list(df.dtypes[df.dtypes == 'int64'].index) + list(df.dtypes[df.dtypes == 'float64'].index)
numerical.remove('lead_number')
numerical.remove('converted')
numerical

In [None]:
df_train_full[categorical].nunique()

In [None]:
# Keep only categorical columns with more than one distinct value, and leave
# out 'newspaper' explicitly.
# NOTE(review): 'newspaper' is presumably excluded because it is almost constant - confirm.
# A single filter keeps the cell re-runnable: the previous list.remove('newspaper')
# raised ValueError whenever the name was already gone.
categorical = [
    column for column in categorical
    if column != 'newspaper' and len(df[column].unique()) > 1
]

In [None]:
df_train_full[categorical].nunique()

In [None]:
# 24 columns being used in total
df_train_full[categorical + numerical].shape

In [None]:
def calculate_group_risk(df, column_name, baseline=None):
    """Compare each group's conversion rate with a baseline rate.

    Parameters
    ----------
    df : DataFrame with a ``converted`` column.
    column_name : column (or list of columns) to group by.
    baseline : reference conversion rate. When omitted, the notebook-level
        ``global_mean`` is used, which is the original behaviour.

    Returns
    -------
    DataFrame indexed by group with three columns:
    ``mean`` (group conversion rate), ``diff`` (mean - baseline) and
    ``risk`` (mean / baseline).
    """
    if baseline is None:
        # Fall back to the rate computed earlier in the notebook.
        baseline = global_mean
    df_group = df.groupby(by=column_name).converted.agg(['mean'])
    df_group['diff'] = df_group['mean'] - baseline
    df_group['risk'] = df_group['mean'] / baseline
    return df_group

In [None]:
# If the risk is lower than 1, the group has lower risks: the lead rate in this group is smaller than the global lead rate
# (0.5) => two times less likely to convert

# If the value is higher than 1, the group is risky: the lead rate in this group is higher than the global lead rate
# (2) => two times more likely to convert

for col in categorical:
    display(calculate_group_risk(df_train_full, col))

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
def calculate_mi(series):
    """Return the mutual information between `series` and the `converted`
    column of the notebook-level `df_train_full` frame."""
    target = df_train_full.converted
    return mutual_info_score(series, target)

In [None]:
df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi
# Higher values mean a higher degree of dependence, so the variable is more useful for predicting the target
# Lower values mean the target and the categorical variable are closer to independent, so the variable is less useful for predicting the target

In [None]:
# Measure the dependency between a binary target variable and a numerical variable
# Positive correlation means that when one variable goes up, the other variable tends to go up as well
# Zero correlation means no relationship between variables: they are completely independent
# Negative correlation occurs when one variable goes up while the other goes down
df_train_full[numerical].corrwith(df_train_full.converted)

In [None]:
# People who spend more time on the website are more likely to be converted

In [None]:
# Create a dictionary of the Dataframe content
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [None]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

dv.fit(train_dict)

In [None]:
X_train = dv.transform(train_dict)

In [None]:
# get_feature_names() was removed from DictVectorizer in scikit-learn 1.2;
# use get_feature_names_out() when available and fall back on older versions.
dv.get_feature_names_out() if hasattr(dv, 'get_feature_names_out') else dv.get_feature_names()

In [None]:
# Training the Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [None]:
# Test the model against the validation dataset
val_dict = df_val[categorical + numerical].to_dict(orient='records')
# Create the dictionaries
X_val = dv.transform(val_dict)

In [None]:
# predict_proba returns one column per class; take column 1, the probability of the positive class, for every row
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
converted_pred = y_pred >= 0.5

In [None]:
(y_val == converted_pred).mean()

In [None]:
# Bias Term
model.intercept_[0]

In [None]:
# Weights vector: map each encoded feature name to its coefficient (rounded to 3 dp).
# get_feature_names() was removed from DictVectorizer in scikit-learn 1.2, so
# use get_feature_names_out() when available and fall back on older versions.
feature_names = (
    dv.get_feature_names_out()
    if hasattr(dv, 'get_feature_names_out')
    else dv.get_feature_names()
)
dict(zip(feature_names, model.coef_[0].round(3)))

In [None]:
import math

# Sigmoid (logistic) function used by Logistic Regression
def sigmoid(score):
    """Map a raw score to the open interval (0, 1).

    The two branches are algebraically identical; each only ever calls
    math.exp on a non-positive argument, so a very negative score returns
    0.0 instead of raising OverflowError.
    """
    if score >= 0:
        return 1 / (1 + math.exp(-score))
    # For negative scores use exp(score) / (1 + exp(score)) to avoid exp(large).
    exp_score = math.exp(score)
    return exp_score / (1 + exp_score)

In [None]:
# Bias Term base prediction %
sigmoid(model.intercept_[0]) * 100

In [None]:
# Calculating the accuracy of our model
y_pred = model.predict_proba(X_val)[:,1]
converted = y_pred >= 0.5
f'Accuracy of Model: {(converted == y_val).mean()}'

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy at 11 evenly spaced decision thresholds.
thresholds = np.linspace(0, 1, 11)
for t in thresholds:
    # Threshold inline rather than assigning to `converted`: the loop would
    # otherwise leave `converted` at the last threshold (1.0) and later cells
    # that read it would silently use the wrong cutoff.
    acc = accuracy_score(y_val, y_pred >= t)
    print('%0.2f %0.3f' % (t, acc))

In [None]:
thresholds = np.linspace(0, 1, 121)
accuracies = []
for t in thresholds:
    acc = accuracy_score(y_val, y_pred >= t)
    accuracies.append(acc)
plt.plot(thresholds, accuracies)
plt.xlabel('threshold')
plt.ylabel('accuracy')

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(y_val)
print(y_pred)
# Apply the 0.5 cutoff explicitly. The global `converted` is reassigned by the
# threshold loop in an earlier cell and ends up holding the threshold-1.0
# predictions, which would make this matrix wrong.
confusion_matrix(y_val, y_pred >= 0.5)

In [None]:
def confusion_table_create(y_val, y_pred, t = 0.5):
    """Build a 2x2 confusion table for scores `y_pred` cut at threshold `t`.

    `y_val` holds the actual 0/1 labels and `y_pred` the predicted scores;
    a score >= t counts as a positive prediction.

    Returns np.array([[TN, FP], [FN, TP]]).
    """
    is_positive = (y_val == 1)
    is_negative = (y_val == 0)

    flagged = (y_pred >= t)
    not_flagged = (y_pred < t)

    # Rows are the actual class, columns are the predicted class.
    negatives_row = [(not_flagged & is_negative).sum(), (flagged & is_negative).sum()]
    positives_row = [(not_flagged & is_positive).sum(), (flagged & is_positive).sum()]

    return np.array([negatives_row, positives_row])

def print_confusion_table(confusion_table):
    """Print the four cells of a [[TN, FP], [FN, TP]] confusion table, one per line."""
    # Row 0: actual label False -> predicted False (TN), predicted True (FP)
    tn = confusion_table[0,0]
    fp = confusion_table[0,1]
    # Row 1: actual label True -> predicted False (FN), predicted True (TP)
    fn = confusion_table[1,0]
    tp = confusion_table[1,1]

    print(f'True Negatives: {tn}')
    print(f'False Positives: {fp}')
    print(f'False Negatives: {fn}')
    print(f'True Positives: {tp}')

In [None]:
confusion_table = confusion_table_create(y_val, y_pred)
print_confusion_table(confusion_table)

In [None]:
confusion_table / confusion_table.sum()

In [None]:
# Precision: fraction of positive predictions that turned out correct, TP / (TP + FP)
def precision(confusion_table):
    """Return TP / (TP + FP) from a [[TN, FP], [FN, TP]] table."""
    true_pos = confusion_table[1,1]
    false_pos = confusion_table[0,1]
    return true_pos / (true_pos + false_pos)

# Recall: fraction of actual positives that were predicted positive, TP / (TP + FN)
def recall(confusion_table):
    """Return TP / (TP + FN) from a [[TN, FP], [FN, TP]] table."""
    true_pos = confusion_table[1,1]
    false_neg = confusion_table[1,0]
    return true_pos / (true_pos + false_neg)

# False positive rate: fraction of actual negatives predicted positive, FP / (FP + TN)
def false_positive_rate(confusion_table):
    """Return FP / (TN + FP) from a [[TN, FP], [FN, TP]] table."""
    true_neg = confusion_table[0,0]
    false_pos = confusion_table[0,1]
    return false_pos / (true_neg + false_pos)

# True positive rate: fraction of actual positives predicted positive, TP / (TP + FN)
def true_positive_rate(confusion_table):
    """Return TP / (TP + FN) from a [[TN, FP], [FN, TP]] table (same quantity as recall)."""
    true_pos = confusion_table[1,1]
    false_neg = confusion_table[1,0]
    return true_pos / (true_pos + false_neg)

In [None]:
# Better the precision, the fewer false positives there are
precision(confusion_table) * 100

In [None]:
# Better the recall, the fewer false negatives there are
recall(confusion_table) * 100

In [None]:
false_positive_rate(confusion_table) * 100

In [None]:
true_positive_rate(confusion_table) * 100

In [None]:
false_positive_rate(confusion_table) * 100 + true_positive_rate(confusion_table) * 100

In [None]:
# Sweep 101 evenly spaced thresholds and record the confusion counts at each one.
thresholds = np.linspace(0, 1, 101)
scores = []

for t in thresholds:
    flagged = y_pred >= t
    not_flagged = y_pred < t
    is_positive = y_val == 1
    is_negative = y_val == 0

    tp = (flagged & is_positive).sum()
    fp = (flagged & is_negative).sum()
    fn = (not_flagged & is_positive).sum()
    tn = (not_flagged & is_negative).sum()
    scores.append((t, tp, fp, fn, tn))

In [None]:
df_scores = pd.DataFrame(scores)

In [None]:
df_scores.columns = ['threshold','tp','fp','fn','tn']

In [None]:
# Compute the TPR and FPR for all values at once in the dataframe
df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)
df_scores[::10]

In [None]:
plt.plot(df_scores.threshold, df_scores.tpr, label='TPR')
plt.plot(df_scores.threshold, df_scores.fpr, label='FPR')
plt.legend()
plt.xlabel('thresholds')
# A small FPR means few actual negatives are wrongly predicted positive (few false positives)
# TPR should decrease slowly as the threshold rises, staying close to 100%, meaning actual positives are still predicted well

In [None]:
plt.figure(figsize=(5, 5))
plt.plot(df_scores.fpr, df_scores.tpr)
plt.plot([0, 1], [0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
# Scikit-Learn ROC Curve calculation
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_val, y_pred)

plt.figure(figsize=(5, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
from sklearn.metrics import auc
auc(df_scores.fpr, df_scores.tpr)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)

In [None]:
neg = y_pred[y_val == 0]
pos = y_pred[y_val == 1]
 
np.random.seed(1)
# Size is the size of the total np array, low and high are the thresholds of the random number
neg_choice = np.random.randint(low=0, high=len(neg), size=10000)
pos_choice = np.random.randint(low=0, high=len(pos), size=10000)
(pos[pos_choice] > neg[neg_choice]).mean()

**K-fold cross validation**

In [None]:
def train(df, y, C=1.0):
    """Fit a one-hot encoder and a logistic regression on `df`.

    Parameters
    ----------
    df : DataFrame containing the notebook-level `categorical` and `numerical` columns.
    y : target values aligned with the rows of `df`.
    C : inverse regularisation strength passed to LogisticRegression
        (defaults to 1.0, scikit-learn's own default).

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    # One dict per row; DictVectorizer one-hot encodes string values and
    # passes numeric values through.
    records = df[categorical + numerical].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(records)

    # random_state matches the standalone model trained earlier in the
    # notebook so repeated runs give the same fit.
    model = LogisticRegression(solver='liblinear', C=C, random_state=1)
    model.fit(X, y)

    return dv, model

# Arguments: Dataframe, fitted DictVectorizer, fitted Model
def predict(df, dv, model):
    """Return the model's positive-class probability for each row of `df`.

    The rows are encoded with the already-fitted `dv` (transform only, no refit)
    using the notebook-level `categorical` and `numerical` column lists.
    """
    feature_rows = df[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(feature_rows))
    return probabilities[:,1]

In [None]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=10, shuffle=True, random_state=1)

# Evaluate each regularisation strength with 10-fold cross-validation on the
# full training set and report the mean and standard deviation of the fold AUCs.
# Fold-local names (fold_*) are used on purpose: the previous version bound the
# result to `auc`, shadowing the sklearn.metrics.auc function imported earlier,
# and overwrote df_train / df_val / y_train / y_val / y_pred / dv / model from
# the earlier cells with the last fold's values.
for C in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    fold_aucs = []
    for train_idx, val_idx in kfold.split(df_train_full):
        # access dataframe records by their positional row numbers
        fold_train = df_train_full.iloc[train_idx]
        fold_val = df_train_full.iloc[val_idx]

        fold_y_train = fold_train.converted.values
        fold_y_val = fold_val.converted.values

        fold_dv, fold_model = train(fold_train, fold_y_train, C)
        fold_pred = predict(fold_val, fold_dv, fold_model)

        fold_aucs.append(roc_auc_score(fold_y_val, fold_pred))
    print('C=%s, auc = %0.3f ± %0.3f' % (C, np.mean(fold_aucs), np.std(fold_aucs)))

In [None]:
import pickle

# Pickle model, dictvect export (disabled; uncomment to write leads-model.bin)
#with open('leads-model.bin', 'wb') as f_out:
#    pickle.dump((dv, model), f_out)