#### Install Dependencies

In [30]:
# Uncomment to install the dependencies into the kernel's environment.
# Note: the PyPI distribution that provides `import sklearn` is `scikit-learn`.
# %pip install pandas
# %pip install scikit-learn
# %pip install matplotlib
# %pip install apyori

In [31]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics


In [32]:
# Load the web-browsing sessions; the file is expected next to the notebook.
data_path = "Data-02-05-web-browsing-data.csv"
prospect_data = pd.read_csv(data_path)

# Show the inferred type of every column.
prospect_data.dtypes

SESSION_ID         int64
IMAGES             int64
REVIEWS            int64
FAQ                int64
SPECS              int64
SHIPPING           int64
BRO_TOGETHER       int64
COMPARE_SIMILAR    int64
VIEW_SIMILAR       int64
WARRANTY           int64
SPONSORED_LINKS    int64
BUY                int64
dtype: object

In [33]:
# Peek at the first five records to see what the data looks like.
prospect_data.head(5)

Unnamed: 0,SESSION_ID,IMAGES,REVIEWS,FAQ,SPECS,SHIPPING,BRO_TOGETHER,COMPARE_SIMILAR,VIEW_SIMILAR,WARRANTY,SPONSORED_LINKS,BUY
0,1001,0,0,1,0,1,0,0,0,1,0,0
1,1002,0,1,1,0,0,0,0,0,0,1,0
2,1003,1,0,1,1,1,0,0,0,1,0,0
3,1004,1,0,0,0,1,1,1,0,0,0,0
4,1005,1,1,1,0,1,0,1,0,0,0,0


In [34]:
# Summary statistics for every column. For the 0/1 flag columns the mean is the
# fraction of sessions in which that flag was set (e.g. the mean of BUY is the
# overall purchase rate).
prospect_data.describe()

Unnamed: 0,SESSION_ID,IMAGES,REVIEWS,FAQ,SPECS,SHIPPING,BRO_TOGETHER,COMPARE_SIMILAR,VIEW_SIMILAR,WARRANTY,SPONSORED_LINKS,BUY
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1250.5,0.51,0.52,0.44,0.48,0.528,0.5,0.58,0.468,0.532,0.55,0.37
std,144.481833,0.500401,0.5001,0.496884,0.5001,0.499715,0.500501,0.494053,0.499475,0.499475,0.497992,0.483288
min,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1125.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1250.5,1.0,1.0,0.0,0.0,1.0,0.5,1.0,0.0,1.0,1.0,0.0
75%,1375.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1500.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Correlation Analysis

In [35]:
# Correlation of every column with the BUY target (pandas' default method).
buy_correlation = prospect_data.corr()["BUY"]
buy_correlation

SESSION_ID         0.026677
IMAGES             0.046819
REVIEWS            0.404628
FAQ               -0.095136
SPECS              0.009950
SHIPPING          -0.022239
BRO_TOGETHER      -0.103562
COMPARE_SIMILAR    0.190522
VIEW_SIMILAR      -0.096137
WARRANTY           0.179156
SPONSORED_LINKS    0.110328
BUY                1.000000
Name: BUY, dtype: float64

In [36]:
# Keep only the features whose absolute correlation with BUY is above roughly
# 0.1 in the correlation output; everything else (and SESSION_ID) is dropped.
# The column order matters: the hand-built arrays in the real-time prediction
# section follow exactly this order.
predictor_columns = [
    "REVIEWS",
    "BRO_TOGETHER",
    "COMPARE_SIMILAR",
    "WARRANTY",
    "SPONSORED_LINKS",
]
predictors = prospect_data[predictor_columns]
targets = prospect_data["BUY"]

### Train Test Split

In [37]:
# Hold out 30% of the sessions for testing.
# random_state pins the shuffle so that the split, and therefore every metric
# and probability computed from it, is identical after Restart & Run All.
# Without it each run produces a different split and different numbers.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=0.3, random_state=42
)

print( "Predictor - Training : ", pred_train.shape, "Predictor - Testing : ", pred_test.shape )

Predictor - Training :  (350, 5) Predictor - Testing :  (150, 5)


### Build Model and Check Accuracy

In [38]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the training split. fit() returns the
# fitted estimator, so construction and training are chained.
classifier = GaussianNB().fit(pred_train.values, tar_train.values)

# Predict BUY for the held-out sessions.
predictions = classifier.predict(pred_test.values)

# Confusion matrix of actual vs. predicted labels (rows: actual, columns: predicted).
sklearn.metrics.confusion_matrix(y_true=tar_test, y_pred=predictions)

array([[81, 12],
       [28, 29]])

In [39]:
# Fraction of held-out sessions whose BUY label was predicted correctly.
sklearn.metrics.accuracy_score(y_true=tar_test, y_pred=predictions)

0.7333333333333333

In [40]:
# Per-class probabilities for every held-out session, one row per session.
pred_prob = classifier.predict_proba(pred_test.values)

# Probability of the second class (BUY = 1, given the 0/1 target) for the
# first test session.
pred_prob[0][1]

0.11485259655902337

Instead of making a Yes/No prediction, we can compute the probability that the prospect will buy the product.

The probability above can be read as roughly an 11% chance that the prospect will buy the product.

## Real Time Predictions

In [41]:
# A brand-new visitor has not used any tracked feature yet.
# Feature order: REVIEWS, BRO_TOGETHER, COMPARE_SIMILAR, WARRANTY, SPONSORED_LINKS.
browsing_data = np.array([[0, 0, 0, 0, 0]])
propensity = classifier.predict_proba(browsing_data)[:, 1]
print("New visitor: propensity :", propensity)

New visitor: propensity : [0.05797895]


So the initial probability is about 6%. Now, suppose the customer does a comparison of similar products. The array changes to include a 1 for that feature. The new probability will be:

In [42]:
# The visitor has now compared similar products (third feature set to 1).
# Feature order: REVIEWS, BRO_TOGETHER, COMPARE_SIMILAR, WARRANTY, SPONSORED_LINKS.
browsing_data = np.array([[0, 0, 1, 0, 0]])
propensity = classifier.predict_proba(browsing_data)[:, 1]
print("After checking similar products: propensity :", propensity)

After checking similar products: propensity : [0.13780042]


It goes up to about 14%. Next, the visitor checks out the reviews.

In [43]:
# The visitor has additionally read the reviews (first feature set to 1).
# Feature order: REVIEWS, BRO_TOGETHER, COMPARE_SIMILAR, WARRANTY, SPONSORED_LINKS.
browsing_data = np.array([[1, 0, 1, 0, 0]])
propensity = classifier.predict_proba(browsing_data)[:, 1]
print("After checking reviews: propensity :", propensity)


After checking reviews: propensity : [0.52742998]


###### It shoots up to about 53%. You can set a threshold at which you want to offer chat, and keep checking this probability against that threshold to decide whether to pop up a chat window.

This example shows you how you can use predictive analytics in real time to decide whether a prospect has high propensity to convert and offer him a chat with a sales rep/agent.

