# Imports

In [None]:
!pip install plotly_express

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import json
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
from plotly import tools
from plotly.subplots import make_subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading

In [None]:
def load_df(csv_path, nrows=None):
    """Read a CSV and expand its JSON-encoded columns into flat columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are parsed
    with ``json.loads`` while reading. Each one is then replaced by the columns
    produced by ``pd.json_normalize``, renamed to ``"<column>.<subcolumn>"``.

    Args:
        csv_path: Path of the CSV file to read.
        nrows: Optional number of rows to read; ``None`` reads the whole file.

    Returns:
        The flattened DataFrame. Its file name and shape are also printed.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    json_converters = dict.fromkeys(JSON_COLUMNS, json.loads)
    df = pd.read_csv(
        csv_path,
        converters=json_converters,
        # Visitor ids must stay strings rather than being inferred as numbers.
        dtype={'fullVisitorId': 'str'},
        nrows=nrows,
    )

    for column in JSON_COLUMNS:
        flattened = pd.json_normalize(df[column]).add_prefix(f"{column}.")
        remaining = df.drop(column, axis=1)
        # Row-wise alignment on the index shared by both frames.
        df = remaining.merge(flattened, left_index=True, right_index=True)

    file_name = os.path.basename(csv_path)
    print(f"Loaded {file_name}. Shape: {df.shape}")
    return df

In [None]:
# Flatten both raw competition files, then cache the flattened frames as CSV
# files in the current working directory.
train_csv = '/kaggle/input/ga-customer-revenue-prediction/train.csv'
test_csv = '/kaggle/input/ga-customer-revenue-prediction/test.csv'
train_df = load_df(train_csv)
test_df = load_df(test_csv)
for frame, cache_path in ((train_df, './train.csv'), (test_df, './test.csv')):
    frame.to_csv(cache_path, index=False)

In [None]:
# Reload the flattened frames from the CSV cache written by the previous cell.
# fullVisitorId has to be read back as a string, exactly as load_df does:
# if pandas infers a numeric dtype, any leading zeros are stripped, which
# corrupts the per-visitor groupbys and the train/test visitor overlap below.
train_df = pd.read_csv('./train.csv', low_memory=False, dtype={'fullVisitorId': 'str'})
test_df = pd.read_csv('./test.csv', low_memory=False, dtype={'fullVisitorId': 'str'})

# Initial Analysis

In [None]:
# Preview the first rows of the flattened training frame.
train_df.head()

In [None]:
# Column names, dtypes and memory usage of the training frame.
train_df.info()

In [None]:
# Store the mobile-device flag as an integer column.
train_df['device.isMobile'] = train_df['device.isMobile'].astype(int)

In [None]:
# Cast the session counters and the revenue column to float in a single assignment.
totals_cols = [
    'totals.visits', 'totals.hits', 'totals.pageviews',
    'totals.bounces', 'totals.newVisits', 'totals.transactionRevenue',
]
train_df[totals_cols] = train_df[totals_cols].astype(float)

In [None]:
# Target variable: a missing transaction revenue is treated as zero revenue.
train_df['totals.transactionRevenue'] = train_df['totals.transactionRevenue'].fillna(0)

In [None]:
# Summary statistics of the training frame after the dtype fixes above.
train_df.describe()

# In-Depth Analysis

## Inspecting Null Values

In [None]:
# Percentage of missing values per column, keeping only columns that have any.
null_pct = train_df.isnull().sum()*100/train_df.shape[0]
null_df = pd.DataFrame(null_pct, columns=['null_pct']).sort_values(by='null_pct', ascending=True)
null_df = null_df.loc[null_df['null_pct'] > 0]

# Horizontal bars, each labelled with its rounded percentage just past the bar end.
ax = null_df.plot(kind='barh', figsize=(10,7), color="coral", fontsize=13, legend=False)
for bar in ax.patches:
    bar_len = bar.get_width()
    label = str(round(bar_len, 2))+'%'
    ax.text(bar_len+.3, bar.get_y()+0.1, label, fontsize=15, color='dimgrey')

In [None]:
# Names of the columns that contain at least one missing value.
null_df.index

In [None]:
# Drop the two session identifier columns together with the traffic-source
# columns that were flagged as mostly null.
cols_to_drop = [
    'sessionId',
    'visitId',
    'trafficSource.campaignCode',
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.isVideoAd',
]
train_df = train_df.drop(columns=cols_to_drop)
train_df.shape

## Inspecting Categorical Columns

In [None]:
# Number of distinct values per column, sorted ascending. NaN is counted as a
# value (dropna=False) so that a column holding a single value *plus* missing
# entries is not mistaken for a constant: whether the value is present or
# missing still distinguishes rows.
cat_df = train_df.nunique(dropna=False).sort_values().to_frame(name='num_uniques')

# Columns with exactly one distinct value (NaN included) carry no information:
# remove them from the training frame and from the summary table.
na_cols = cat_df.index[cat_df['num_uniques'] == 1]
cat_df = cat_df[cat_df['num_uniques'] != 1]
train_df = train_df.drop(columns=na_cols)
cat_df

In [None]:
# Shape of the training frame after the single-valued columns were removed.
train_df.shape

## Target Variable Analysis

Since the target is the natural log of the sum of all of a user's transactions, let us sum the transaction revenue at the user level and then plot the sorted per-user totals on both the original scale and the log (`log1p`) scale.

In [None]:
# Total transaction revenue per visitor (one row per fullVisitorId).
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
gdf = train_df.groupby("fullVisitorId")["totals.transactionRevenue"].sum().reset_index()

# Sorted per-visitor totals: raw scale on the left, log1p scale on the right.
visitor_rank = range(gdf.shape[0])
visitor_revenue = gdf["totals.transactionRevenue"].values
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Target Variable - Original vs Log')
ax_raw.scatter(visitor_rank, np.sort(visitor_revenue))
ax_log.scatter(visitor_rank, np.sort(np.log1p(visitor_revenue)))

In this competition, the data set is so unbalanced that it's hard to say whether our solution can beat an all-zeros prediction, because most of the customers don't bring in any revenue. And when a customer does pay, the transaction revenue is never less than 1E+07.

This is in line with what was explained in the competition overview:
> The 80/20 rule has proven true for many businesses–only a small percentage of customers produce most of the revenue. As such, marketing teams are challenged to make appropriate investments in promotional strategies.

In this case, the ratio is even worse:

In [None]:
# Share of visitors whose summed revenue is strictly positive, in percent.
visitor_revenue = gdf['totals.transactionRevenue']
paying_pct = round(100 * sum(visitor_revenue > 0) / visitor_revenue.shape[0], 2)
print(f"Percentage of Customers who brought Revenue: {paying_pct}%")

The log operation normalizes the target and reduces the spread enough for a model to be able to predict it.

In [None]:
# Distinct visitor ids versus row count in each split, then the ids shared by both.
train_ids = train_df.fullVisitorId
test_ids = test_df.fullVisitorId
print("Number of unique visitors in train set : ", train_ids.nunique(), "/", train_df.shape[0])
print("Number of unique visitors in test set : ", test_ids.nunique(), "/", test_df.shape[0])
shared_ids = set(train_ids.unique()).intersection(set(test_ids.unique()))
print("Number of common visitors in train and test set : ", len(shared_ids))


In [None]:
# Turn zero revenue back into NaN so that the groupby 'count' (non-null rows) and
# 'mean' aggregations in the cells below only consider sessions with revenue.
train_df['totals.transactionRevenue'] = train_df['totals.transactionRevenue'].replace({0: np.nan})

In [None]:
def horizontal_bar_chart(cnt_srs, color):
    """Build a horizontal plotly bar trace from a pandas Series.

    The index supplies the bar labels and the values supply the bar lengths.
    Both are reversed before plotting (presumably so that the first entry of
    the series ends up at the top of the chart).

    Args:
        cnt_srs: Series to plot.
        color: Marker color applied to every bar.

    Returns:
        A ``go.Bar`` trace with the legend entry disabled.
    """
    bar_style = dict(color=color)
    return go.Bar(
        x=cnt_srs.values[::-1],
        y=cnt_srs.index[::-1],
        orientation='h',
        showlegend=False,
        marker=bar_style,
    )

## Device Information

In [None]:
# Device Browser
# Per browser: number of sessions ('size'), number of sessions with non-null
# revenue ('count') and the mean of those non-null revenue values ('mean').
cnt_srs = train_df.groupby('device.browser')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

# All three panels show the same ten most frequent browsers.
top_rows = cnt_srs.head(10)
bar_color = 'rgba(50, 171, 96, 0.6)'

# One row of three subplots, one panel per metric.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.04,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(top_rows[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Device Browser")
py.iplot(fig)


In [None]:
# Device Category
# Per device category: number of sessions ('size'), number of sessions with
# non-null revenue ('count') and the mean of those revenue values ('mean').
cnt_srs = train_df.groupby('device.deviceCategory')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

# All three panels show the same (at most ten) most frequent categories.
top_rows = cnt_srs.head(10)
bar_color = 'rgba(71, 58, 131, 0.8)'

# One row of three subplots, one panel per metric.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.04,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(top_rows[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Device Category")
py.iplot(fig)


In [None]:
# Operating system
# Per operating system: number of sessions ('size'), number of sessions with
# non-null revenue ('count') and the mean of those revenue values ('mean').
cnt_srs = train_df.groupby('device.operatingSystem')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

# All three panels show the same ten most frequent operating systems.
top_rows = cnt_srs.head(10)
bar_color = 'rgba(246, 78, 139, 0.6)'

# One row of three subplots, one panel per metric.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.04,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(top_rows[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Operating System")
py.iplot(fig)


Inferences:

* Device Browser distribution looks similar on both the count and count of non-zero revenue plots
* On the device category front, Desktop seems to have a higher percentage of non-zero revenue counts than Mobile devices.
* For the device operating system, although Windows has the highest overall count, the number of counts with non-zero revenue is higher for Macintosh.
* Chrome OS also has higher percentage of non-zero revenue counts
* On the mobile OS side, iOS has more percentage of non-zero revenue counts compared to Android

## Geographical Information

In [None]:
# Continent
# Per continent: number of sessions ('size'), number of sessions with
# non-null revenue ('count') and the mean of those revenue values ('mean').
cnt_srs = train_df.groupby('geoNetwork.continent')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

# All three panels show the same (at most ten) most frequent continents.
top_rows = cnt_srs.head(10)
bar_color = 'rgba(58, 71, 80, 0.6)'

# One row of three subplots, one panel per metric.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(top_rows[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Continent")
py.iplot(fig)


In [None]:
# Sub Continent
# Per sub-continent: number of sessions ('size'), number of sessions with
# non-null revenue ('count') and the mean of those revenue values ('mean').
cnt_srs = train_df.groupby('geoNetwork.subContinent')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

bar_color = 'orange'

# One row of three subplots, one panel per metric; every sub-continent is shown.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(cnt_srs[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Sub Continent")
py.iplot(fig)


Inferences:

* On the continent plot, we can see that America has both the highest number of counts and the highest number of counts where the revenue is non-zero.
* Though Asia and Europe have a high number of counts, the number of non-zero revenue counts from these continents is comparatively low.
* These two points hold true for the Sub Continents plot as well.

## Traffic Information

In [None]:
# Source
# Per traffic source: number of sessions ('size'), number of sessions with
# non-null revenue ('count') and the mean of those revenue values ('mean').
cnt_srs = train_df.groupby('trafficSource.source')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

# All three panels show the same ten most frequent sources.
top_rows = cnt_srs.head(10)
bar_color = 'green'

# One row of three subplots, one panel per metric.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(top_rows[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Traffic Source")
py.iplot(fig)

In [None]:
# Medium
# Per traffic medium: number of sessions ('size'), number of sessions with
# non-null revenue ('count') and the mean of those revenue values ('mean').
cnt_srs = train_df.groupby('trafficSource.medium')['totals.transactionRevenue'].agg(['size', 'count', 'mean'])
cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
cnt_srs = cnt_srs.sort_values(by="count", ascending=False)

bar_color = 'purple'

# One row of three subplots, one panel per metric; every medium is shown.
fig = make_subplots(rows=1, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                    subplot_titles=["Count", "Non-zero Revenue Count", "Mean Revenue"])
for col_idx, metric in enumerate(cnt_srs.columns, start=1):
    fig.add_trace(horizontal_bar_chart(cnt_srs[metric], bar_color), row=1, col=col_idx)

fig.update_layout(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Traffic Medium")
py.iplot(fig)

Inferences:

* In the traffic source plot, even though YouTube has a high number of counts in the dataset, its number of non-zero revenue counts is very low.
* Google plex has a high ratio of non-zero revenue count to total count in the traffic source plot.
* In the traffic medium plot, "referral" has a higher non-zero revenue count than the "organic" medium.

## User Activity Information

In [None]:
# Per-visitor totals of hits, pageviews and revenue, plus a flag marking
# visitors whose summed revenue is strictly positive.
summed_cols = ['totals.hits', 'totals.pageviews', 'totals.transactionRevenue']
gdf = train_df.groupby("fullVisitorId").agg(dict.fromkeys(summed_cols, 'sum'))
gdf['isPayingVisitor'] = gdf['totals.transactionRevenue'].gt(0)

In [None]:
# Per-visitor total hits, drawn as one facet per isPayingVisitor value.
px.scatter(gdf, x='totals.hits', facet_col='isPayingVisitor')

In [None]:
# Per-visitor total pageviews, drawn as one facet per isPayingVisitor value.
px.scatter(gdf, x='totals.pageviews', facet_col='isPayingVisitor')

Inferences:

* People with a higher (>300) number of "hits" tend to have a higher chance of being paying customers.
* This holds true for "pageviews" as well and the distinction is clearer, suggesting that "pageviews" might be a more important feature than "hits"