In [None]:
import pandas as pd
import altair as alt

import numpy as np

In [None]:
%matplotlib inline

# Produce vector inline graphics.
# This import also brings `display` and `Markdown` into the notebook namespace;
# both are used by the topic-summary cell further down.
from IPython.display import set_matplotlib_formats, display, Markdown, HTML

# Request 'pdf' and 'svg' as the inline figure formats.
# NOTE(review): newer IPython releases appear to deprecate this import location
# in favour of `matplotlib_inline.backend_inline` - confirm against the
# installed version before changing.
set_matplotlib_formats('pdf', 'svg')

import matplotlib.pyplot as plt
# Default size for matplotlib figures, given as [width, height].
plt.rcParams['figure.figsize'] = [10, 4]


# Ad Click Dashboard

In [None]:
# IMPORT DATA
# Three inputs feed the dashboard: the ad-click table, the NMF component
# matrix, and the vocabulary file used to label the NMF columns.

df = pd.read_csv('data/df.csv')
nmf_components = np.load('data/nmf-components.npy')

# One feature name per line; surrounding whitespace/newlines are stripped.
with open('data/feature-names.txt', 'r') as names_file:
    feature_names = [raw_line.strip() for raw_line in names_file]
    

## Temporal effects

In [None]:
# Click rate by month: one bar per month of `Timestamp`, bar height is the mean
# of 'Clicked on Ad'. The y-axis labels are switched off, so the text layer
# below is what shows the numeric value for each bar.
month_x = alt.X('month(Timestamp):O', title='')
click_rate_y = alt.Y(
    'mean(Clicked on Ad):Q',
    title='',
    axis=alt.Axis(labels=False),
)

bars = (
    alt.Chart(df, width=800, title='Click rate by month')
    .mark_bar(width=100)
    .encode(month_x, y=click_rate_y)
)

# The label layer inherits the bar encodings and only swaps the mark;
# dy=20 offsets the white text vertically relative to the bar's y position.
label_style = dict(
    align='center',
    baseline='middle',
    dy=20,
    color='white',
    size=20,
)
text = bars.mark_text(**label_style).encode(
    text=alt.Text('mean(Clicked on Ad):O', format='.2f')
)

monthly_click_rate = bars + text
monthly_click_rate.configure_axis(labelAngle=0, labelFontSize=20).configure_title(fontSize=20)

## Age, time on site, click rate


In [None]:
# Two cross-filtering histograms. Dragging an x-range on one chart filters the
# rows shown in the other:
#   * `brush`  is attached to the time-on-site chart and filters the age chart
#   * `brush2` is attached to the age chart and filters the time-on-site chart
brush = alt.selection_interval(encodings=['x'], empty='all')
brush2 = alt.selection_interval(encodings=['x'], empty='all')

# Explicit category -> colour mapping. It is applied to BOTH charts so each one
# is coloured correctly on its own, not only when the two are concatenated.
domain = ['No Click', 'Click']
range_ = ['lightgray', 'orange']
click_color = alt.Color(
    'Click_labeled:N',
    scale=alt.Scale(domain=domain, range=range_),
)

# Time-on-site histogram (right-hand panel).
bars1 = alt.Chart(df, width=400).mark_bar().encode(
    x=alt.X('Daily Time Spent on Site:Q', bin=True),
    color=click_color,
    # count() produces a number, so the channel is quantitative (was ':N').
    y='count(Click_labeled):Q'
).add_selection(
    brush
).transform_filter(
    brush2
).properties(
    title='Less time = More Clicks'
)

# Age histogram (left-hand panel). The fixed x and y domains keep the axes
# stable while the other chart's brush filters the data.
bars = alt.Chart(df, width=400).mark_bar().encode(
    x=alt.X('Age:Q', scale=alt.Scale(domain=(10, 65))),
    color=click_color,
    # Quantitative type to match the numeric (0, 62) scale domain (was ':N').
    y=alt.Y('count(Click_labeled):Q', scale=alt.Scale(domain=(0, 62))),
).transform_filter(
    brush
).add_selection(
    brush2
).properties(
    title='Older = higher click rate'
)

(bars | bars1).configure_title(
    fontSize=20,
    anchor='start'
)

**Key takeaways:**
* Most users are around 30 years old, but older users have the highest click rate.
* Users who spend over 60 minutes on the site have a sharp decrease in click rate.
* Users who spend over 70 minutes on the site are much more likely to be young.

### Another view of age vs. time on site


In [None]:
# Histogram of daily time on site (at most 14 bins), with each bar coloured by
# binned Age (at most 10 bins) on a reversed viridis scheme.
time_axis_style = alt.Axis(
    labelAngle=0,
    labelFontSize=16,
    titleFontSize=20,
    labelColor='gray',
    titleColor='gray',
)
time_on_site_x = alt.X(
    'Daily Time Spent on Site',
    bin=alt.Bin(maxbins=14),
    axis=time_axis_style,
)
visit_count_y = alt.Y('count(Daily Time Spent on Site)', title='')
age_bin_color = alt.Color(
    'Age',
    scale=alt.Scale(scheme='viridis', reverse=True),
    bin=alt.Bin(maxbins=10),
)

bars = (
    alt.Chart(df, width=800)
    .mark_bar()
    .encode(x=time_on_site_x, y=visit_count_y, color=age_bin_color)
    .properties(title='Over 70 minutes on site = younger users')
)

bars.configure_title(fontSize=20)

### Month and age

In [None]:
# Row counts per month of `Timestamp`, coloured by binned Age (at most 10 bins,
# reversed viridis). The y-axis labels are hidden.
age_by_month = (
    alt.Chart(df, width=800, title='Age vs Month, Counts')
    .mark_bar(width=100)
    .encode(
        alt.X('month(Timestamp):O', title=''),
        y=alt.Y(
            'count(Clicked on Ad):Q',
            title='',
            axis=alt.Axis(labels=False),
        ),
        color=alt.Color(
            'Age',
            scale=alt.Scale(scheme='viridis', reverse=True),
            bin=alt.Bin(maxbins=10),
        ),
    )
)

age_by_month.configure_axis(labelAngle=0, labelFontSize=20).configure_title(fontSize=20)



## Topic Modeling



In [None]:
# Build one HTML "card" per NMF topic (three per row via `width: 33%; float: left`)
# showing how many ads fall in the topic and its five highest-weighted words,
# then render all cards in a single Markdown output.
divs = []
for i, topic in enumerate(nmf_components):
    # Row i of nmf_components is matched to rows where df['Topic'] == i + 1.
    topic_id = i + 1
    num_ads = int((df['Topic'] == topic_id).sum())

    # argsort() is ascending, so the last five indices are the five largest
    # weights, listed from smallest to largest of those five.
    # NOTE(review): reverse this slice if the list should start with the
    # strongest word.
    top_5 = [feature_names[word_idx] for word_idx in topic.argsort()[-5:]]
    top = '<ul>' + ''.join(f'<li>{word}</li>' for word in top_5) + '</ul>'

    # The heading previously opened with <h3> but closed with </h4>.
    markdown = f'''<div style="width: 33%; float: left;"> 
    
<h3>Topic #{topic_id}</h3>
<h4>Number of Ads in Topic: {num_ads}</h4>
<br>
Top 5 Words:
{top}

</div>
''' 

    divs.append(markdown)

div_text = ''.join(divs)
display(Markdown(div_text))

In [None]:
# Left panel: mean click rate per topic, with the value printed on each bar.
# Right panel: age histogram of clicked ads only, coloured by topic.

# Keyword arguments shared by the labelled x-axes of both panels.
gray_axis = dict(
    labelAngle=0,
    labelFontSize=16,
    titleFontSize=20,
    labelColor='gray',
    titleColor='gray',
)
# Same topic colouring for both panels.
topic_color = alt.Color('Topic:N', scale=alt.Scale(scheme='viridis', reverse=True))

base = (
    alt.Chart(df, width=400)
    .mark_bar()
    .encode(
        x=alt.X('Topic:N', axis=alt.Axis(**gray_axis)),
        y=alt.Y('mean(Clicked on Ad)', axis=alt.Axis(labels=False, title='')),
    )
    .properties(title='No clear difference between topics and click rates')
)

# Coloured bars and their value labels both derive from `base`.
bars = base.encode(color=topic_color)

text = base.mark_text(
    align='center', baseline='middle', dy=20, color='white', size=24
).encode(
    text=alt.Text('mean(Clicked on Ad):O', format='.2f')
)

# Restrict the age histogram to rows where the ad was clicked.
clicked_only = df.loc[df['Clicked on Ad'] == True]
topic_age = (
    alt.Chart(clicked_only, width=400)
    .mark_bar()
    .encode(
        x=alt.X('Age:Q', bin=True, axis=alt.Axis(**gray_axis)),
        y=alt.Y('count()', axis=alt.Axis(labels=False, title='')),
        color=topic_color,
    )
    .properties(title='Count of Ad Clicks by Topic and Age')
)

((bars + text) | topic_age).configure_title(fontSize=20)