Task
- Add in other questions
    - Refactor code for the aggregation of timely activity
- Fix the plots
- Start on EDA
    - Each major feature will cover distribution of the feature, readtime/scroll pct, how it relates to the topics selected, and activity distribution.
        - We will use histograms, bar plots, box plots, and scatter plots (can look at pie charts too)
- Model selection

Background Information on Dataset: BLAH BLAH BLAH

Importing Packages

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib
import plotly
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime

EDA

Load in dataset

Aside: Let's measure how long it takes to load in our dataset: using pandas and dask

In [None]:
import cProfile
import pstats

# Profile a pandas read of the large articles file. The context manager
# guarantees the profiler is switched off again even if the read raises.
with cProfile.Profile() as profiler:
    df = pd.read_parquet("Data/Large/articles.parquet")

# Print the collected call statistics
profiler_stats = pstats.Stats(profiler)
profiler_stats.print_stats()

In [None]:
# Profile the same read through dask.
# dd.read_parquet is lazy: on its own it only builds a task graph, so the data
# is materialised with .compute() inside the profiled region to make the
# timing comparable with the pandas read. `df` itself stays a dask dataframe.
with cProfile.Profile() as profiler:
    df = dd.read_parquet("Data/Large/articles.parquet", engine = "fastparquet")
    df.compute()

# Print the collected call statistics
profiler_stats = pstats.Stats(profiler)
profiler_stats.print_stats()

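Pandas took 1.464 seconds to load the dataframe, whereas the dask call returned in 0.054 seconds (~27x faster). Note that `dd.read_parquet` is lazy: a timing that does not include `.compute()` only measures building the task graph, not reading the data, so it is not a like-for-like comparison with pandas.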

In [2]:
# Read the three parquet sources of the "Small" dataset into pandas

## History
df_his = pd.read_parquet("Data/Small/train/history.parquet")

## Behaviors
df_bev = pd.read_parquet("Data/Small/train/behaviors.parquet")

## Articles
df_art = pd.read_parquet("Data/Small/articles.parquet")



Join the data sources

In [3]:
# Reduce each list of clicked article ids to its first element so the column
# can be used as a scalar join key. An empty list becomes NaN instead of
# raising IndexError.
df_bev['article_ids_clicked'] = df_bev['article_ids_clicked'].apply(
    lambda clicked: clicked[0] if len(clicked) > 0 else np.nan
)

In [4]:
# Enrich the behaviors: first with the article matched on the clicked article
# id, then with the history row matched on user_id.
df = (
    df_bev
    .join(df_art.set_index("article_id"), on = "article_ids_clicked")
    .join(df_his.set_index("user_id"), on = "user_id")
)

# Rebind the source frames to empty lists so their memory can be released
df_bev, df_his, df_art = [], [], []

In [5]:
# Preprocessing
# Keep only the rows that have an article_id
df.dropna(subset=['article_id'], inplace= True)
#df.dropna(subset =['age'], inplace = True)

# One vectorised cast to int64; a preceding row-by-row int() conversion adds
# nothing because astype performs the same truncation.
df['article_id'] = df['article_id'].astype(np.int64)



Visualizations

In [None]:
def plotly_index_values(groupby_object):
    """Split an aggregated pandas object into plot-ready X and Y parts.

    Returns a tuple ``(index, values)`` taken from the ``.index`` and
    ``.values`` attributes of ``groupby_object``.
    """
    return groupby_object.index, groupby_object.values



Let's calculate the number of unique users per hour, per day, and per day of the week. We use a subset of the data until we are confident that the plots are correct.

#### Biggest thing is user engagement : Bigger User Engagement -> More revenue
#### We need to maximize the amount of ads these guys are viewing -> this leads on to them clicking on new articles for ads
#### So, let's not make article length too short so that people can maximize their session lengths with a lot of articles!

## User Activity

In [None]:
# Distinct user ids, in order of first appearance
pd.unique(df['user_id'])

In [None]:
# For a sample of users, count how many distinct users were active on each
# calendar date, hour of day, day of week and ISO week.
from collections import Counter

## Sample: the first 1000 distinct user ids
unique_user_ids = df['user_id'].unique()[0:1000]

# One counter per time granularity; each user adds at most 1 to any key
daily_counter = Counter()
hourly_counter = Counter()
dayofweek_counter = Counter()
weekly_counter = Counter()

# Group once instead of filtering the whole frame again for every user
sample_rows = df[df['user_id'].isin(unique_user_ids)]
for _, user_history in sample_rows.groupby('user_id', sort=False)['impression_time_fixed']:
    # Each cell holds a list of timestamps; pool all of this user's rows
    stamps = pd.DatetimeIndex(np.concatenate(user_history.to_list()))

    # Sets make sure a user counts once per date / hour / weekday / week.
    # Hours in particular are deduplicated after reducing to the hour, so two
    # timestamps within the same hour do not count the user twice.
    daily_counter.update(set(stamps.strftime('%m/%d/%Y')))
    hourly_counter.update(set(stamps.hour.tolist()))
    dayofweek_counter.update(set(stamps.dayofweek.tolist()))
    weekly_counter.update(set(stamps.isocalendar().week.tolist()))

# Expose plain dicts under the names the plotting cells read
unique_users_daily_freq = dict(daily_counter)
unique_users_hourly_freq = dict(hourly_counter)
unique_users_dayofweek_freq = dict(dayofweek_counter)
unique_users_weekly_freq = dict(weekly_counter)
    
            
    

In [None]:
# What does the daily user activity look like?
# The keys are '%m/%d/%Y' strings, which sort by month before year when
# compared as text, so order them by the parsed date instead.
unique_users_daily_freq = dict(
    sorted(
        unique_users_daily_freq.items(),
        key=lambda item: datetime.strptime(item[0], '%m/%d/%Y'),
    )
)

indices = list(unique_users_daily_freq.keys())
values = list(unique_users_daily_freq.values())

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x = indices, y = values, mode = 'lines+markers'))

fig.show()


In [None]:
# Active users per hour of the day
indices = list(unique_users_hourly_freq)
values = list(unique_users_hourly_freq.values())

# Build the figure directly from its single marker trace
hourly_trace = go.Scatter(x = indices, y = values, mode = 'markers')
fig = go.Figure(data = [hourly_trace])

fig.show()

In [None]:
# Active users per ISO week
indices = list(unique_users_weekly_freq)
values = list(unique_users_weekly_freq.values())

# Build the figure directly from its single marker trace
weekly_trace = go.Scatter(x = indices, y = values, mode = 'markers')
fig = go.Figure(data = [weekly_trace])

fig.show()

In [None]:
# Active users per day of the week
indices = list(unique_users_dayofweek_freq)
values = list(unique_users_dayofweek_freq.values())

# Build the figure directly from its single marker trace
dayofweek_trace = go.Scatter(x = indices, y = values, mode = 'markers')
fig = go.Figure(data = [dayofweek_trace])

fig.show()

## Users

Daily User growth

In [None]:
# Daily user growth: number of users whose first recorded history timestamp
# falls on each calendar day.
unique_user_ids = df['user_id'].unique()

# The hourly / day-of-week / weekly activity dicts are deliberately left
# untouched here: this cell does not use them, and resetting them would blank
# the activity plots that read them.
unique_users_daily_growth_freq = {}

# One row per user (the user's first row in df order) replaces a full-frame
# filter per user id.
first_rows = df.drop_duplicates(subset='user_id', keep='first')

for history in first_rows['impression_time_fixed']:
    # The first entry of the history list is taken as the join date
    # (assumes the list is in chronological order -- TODO confirm)
    join_date = pd.Timestamp(history[0]).strftime('%m/%d/%Y')
    unique_users_daily_growth_freq[join_date] = unique_users_daily_growth_freq.get(join_date, 0) + 1

# Order by the real date: '%m/%d/%Y' strings do not sort chronologically across years
unique_users_daily_growth_freq = dict(
    sorted(
        unique_users_daily_growth_freq.items(),
        key=lambda item: datetime.strptime(item[0], '%m/%d/%Y'),
    )
)

In [None]:
# Bar chart of new users per day
indices = list(unique_users_daily_growth_freq.keys())
values = list(unique_users_daily_growth_freq.values())

fig = go.Figure()

# Add traces
# `text` must be set on the trace: the texttemplate below formats %{text},
# so without it the bars carry no value labels.
fig.add_trace(go.Bar(x = indices, y = values, text = values))

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### Average readtime per user

In [33]:
## Distribution of the mean read time of each user
avg_read_times = df.groupby('user_id')['read_time'].mean()

# Single-trace histogram
fig = go.Figure(data=[go.Histogram(x=avg_read_times)])

# Overlay mode and reduced opacity (only relevant once more traces are added)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.55)
fig.show()




In [None]:
## Distribution of the mean read time of each user
per_user_read_time = df.groupby('user_id')['read_time']
avg_read_times = per_user_read_time.mean()

readtime_hist = go.Histogram(x=avg_read_times)
fig = go.Figure(data=[readtime_hist])

# Overlay mode and reduced opacity (only relevant once more traces are added)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.55)
fig.show()

### Average scroll percentage per user

In [None]:
## Distribution of the mean scroll percentage of each user
avg_scroll_percentage = df.groupby('user_id')['scroll_percentage'].mean()

# Single-trace histogram
fig = go.Figure(data=[go.Histogram(x=avg_scroll_percentage)])

# Overlay mode and reduced opacity (only relevant once more traces are added)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.55)
fig.show()

## Session

Daily Active Sessions

In [None]:
# Number of sessions that start on each calendar day.
# Everything is derived into new objects: df['impression_time'] must keep its
# timestamps because later cells call .strftime on its values.
impression_stamps = pd.to_datetime(df['impression_time'])

# Distinct impression days, formatted as '%m/%d/%Y'
unique_session_dates = impression_stamps.dt.strftime('%m/%d/%Y').unique()

# First impression of every session, taken on real timestamps (not on strings)
session_start = impression_stamps.groupby(df['session_id']).min()
unique_sessions_per_day = session_start.dt.strftime('%m/%d/%Y')

# Count sessions per start day in chronological order
start_day_counts = session_start.dt.normalize().value_counts().sort_index()
unique_sessions_daily_growth = {
    day.strftime('%m/%d/%Y'): int(count) for day, count in start_day_counts.items()
}

In [None]:
# Bar chart of sessions started per day
indices = list(unique_sessions_daily_growth.keys())
values = list(unique_sessions_daily_growth.values())

fig = go.Figure()

# Add traces
# `text` must be set on the trace: the texttemplate below formats %{text},
# so without it the bars carry no value labels.
fig.add_trace(go.Bar(x = indices, y = values, text = values))

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()



### Average readtime per session

In [None]:
## Distribution of the mean read time of each session
avg_read_times = df.groupby('session_id')['read_time'].mean()

# Single-trace histogram
fig = go.Figure(data=[go.Histogram(x=avg_read_times)])

# Overlay mode and reduced opacity (only relevant once more traces are added)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.55)
fig.show()



    

Average scroll percentage per session

In [None]:
## Distribution of the mean scroll percentage of each session
# The aggregation is computed once and reused for the histogram.
avg_scroll_pct = df.groupby(by ='session_id')['scroll_percentage'].mean()

fig = go.Figure()
fig.add_trace(go.Histogram(x=avg_scroll_pct))

# Overlay mode and reduced opacity (only relevant once more traces are added)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.55)
fig.show()

## Topic

In [None]:
# Per-topic aggregates for a sample of users:
#   unique_users_topics_freq -> number of distinct users per topic
#   unique_topic_scroll_freq -> array of scroll percentages per topic
#   unique_topic_read_freq   -> array of read times per topic

## Users appearing in the first 1000 rows, deduplicated so that no user is
## processed (and counted) more than once
unique_user_ids = pd.unique(df['user_id'].values[0:1000])

sample_rows = df.loc[
    df['user_id'].isin(unique_user_ids),
    ['user_id', 'topics', 'scroll_percentage', 'read_time'],
]

# One row per (impression, topic): every topic of an article receives that
# impression's scroll percentage and read time. Impressions with an empty
# topic list explode to NaN and are dropped.
topic_rows = sample_rows.explode('topics').dropna(subset=['topics'])

# Collect every observation per topic, including the first one; NaN values
# are kept because the averaging cells use np.nanmean.
unique_topic_scroll_freq = {
    topic: group.to_numpy()
    for topic, group in topic_rows.groupby('topics')['scroll_percentage']
}
unique_topic_read_freq = {
    topic: group.to_numpy()
    for topic, group in topic_rows.groupby('topics')['read_time']
}

## Unique users per topic: a user counts once per topic across all their rows
unique_users_topics_freq = (
    topic_rows.drop_duplicates(subset=['user_id', 'topics'])['topics']
    .value_counts()
    .to_dict()
)



### Distribution of Topics across users

In [None]:
# Distribution of topics across users, most frequent first
sorted_topic_freq = dict(sorted(unique_users_topics_freq.items(), key = lambda x: x[1], reverse = True))

indices = list(sorted_topic_freq.keys())
values = list(sorted_topic_freq.values())

fig = go.Figure()

# Add traces
# `text` must be set on the trace: the texttemplate below formats %{text},
# so without it the bars carry no value labels.
fig.add_trace(go.Bar(x = indices, y = values, text = values))

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### Average readtime per topic

In [None]:
# Average read time per topic

# Replace each topic's collected read times by their NaN-ignoring mean
for topic, read_times in unique_topic_read_freq.items():
    unique_topic_read_freq[topic] = np.nanmean(read_times)

# Highest average first. Topics whose mean is NaN (no usable observations)
# are placed last so that NaN comparisons cannot scramble the ordering.
sorted_unique_topic_read_freq = dict(
    sorted(
        unique_topic_read_freq.items(),
        key = lambda x: (not np.isnan(x[1]), x[1]),
        reverse = True,
    )
)

indices = list(sorted_unique_topic_read_freq.keys())
values = list(sorted_unique_topic_read_freq.values())

fig = go.Figure()

# Add traces
# `text` must be set on the trace: the texttemplate below formats %{text},
# so without it the bars carry no value labels.
fig.add_trace(go.Bar(x = indices, y = values, text = values))

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### Average scroll pct per topic

In [None]:
# Average scroll percentage per topic

# Replace each topic's collected scroll percentages by their NaN-ignoring mean
for topic, scroll_values in unique_topic_scroll_freq.items():
    unique_topic_scroll_freq[topic] = np.nanmean(scroll_values)

# Highest average first. Topics whose mean is NaN (no usable observations)
# are placed last so that NaN comparisons cannot scramble the ordering.
sorted_unique_topic_scroll_freq = dict(
    sorted(
        unique_topic_scroll_freq.items(),
        key = lambda x: (not np.isnan(x[1]), x[1]),
        reverse = True,
    )
)

indices = list(sorted_unique_topic_scroll_freq.keys())
values = list(sorted_unique_topic_scroll_freq.values())

fig = go.Figure()

# Add traces
# `text` must be set on the trace: the texttemplate below formats %{text},
# so without it the bars carry no value labels.
fig.add_trace(go.Bar(x = indices, y = values, text = values))

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### Activity

In [None]:
# Per session: the topic list of every impression, and the matching timestamps
topics = df.groupby(by = 'session_id')['topics'].apply(list)
timestamps = df.groupby(by = 'session_id')['impression_time'].apply(list)

# Every topic seen in ANY impression of any session (not only the first
# impression of a session); a set keeps the membership test O(1).
seen_topics = set()
for session_topics in topics.values:
    for impression_topics in session_topics:
        seen_topics.update(impression_topics)

unique_topics = sorted(seen_topics)

# Hour labels '00' .. '23'
unique_hours = [str(hour).zfill(2) for hour in range(24)]

# Every calendar day on which an impression occurred
seen_dates = set()
for session_times in timestamps.values:
    for tmp_datetime in session_times:
        seen_dates.add(tmp_datetime.strftime('%m/%d/%Y'))

# Order by the real date: '%m/%d/%Y' strings do not sort chronologically across years
unique_dates = sorted(seen_dates, key=lambda day: datetime.strptime(day, '%m/%d/%Y'))





In [None]:
# Impression counts per topic, broken down by calendar day and by hour of day
unique_topic_daily_activity = {topic: dict.fromkeys(unique_dates, 0) for topic in unique_topics}
unique_topic_hourly_activity = {topic: dict.fromkeys(unique_hours, 0) for topic in unique_topics}

# `topics` and `timestamps` are grouped by the same session key, so they pair
# up session by session and, within a session, impression by impression.
# Every topic of every impression is counted.
for session_topics, session_times in zip(topics.values, timestamps.values):
    for impression_topics, tmp_datetime in zip(session_topics, session_times):
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')
        tmp_time = tmp_datetime.strftime('%H')

        for topic in impression_topics:
            # setdefault / get tolerate a topic or date that is missing from
            # the precomputed axis lists
            daily = unique_topic_daily_activity.setdefault(topic, dict.fromkeys(unique_dates, 0))
            hourly = unique_topic_hourly_activity.setdefault(topic, dict.fromkeys(unique_hours, 0))

            # Add to dictionary
            daily[tmp_date] = daily.get(tmp_date, 0) + 1
            hourly[tmp_time] = hourly.get(tmp_time, 0) + 1
    




Daily

In [None]:
# Daily activity, one line per topic
fig = go.Figure()

for topic, daily_counts in unique_topic_daily_activity.items():
    indices = list(daily_counts.keys())
    values = list(daily_counts.values())

    # Name the trace after its topic so the legend is readable
    fig.add_trace(go.Scatter(x = indices, y = values,
                             mode = 'lines+markers',
                             name = str(topic)
                             )
    )

fig.show()

Hourly Activity

In [None]:
# Hourly activity, one line per topic
fig = go.Figure()

for topic, hourly_counts in unique_topic_hourly_activity.items():
    indices = list(hourly_counts.keys())
    values = list(hourly_counts.values())

    # Name the trace after its topic so the legend is readable
    fig.add_trace(go.Scatter(x = indices, y = values,
                             mode = 'lines+markers',
                             name = str(topic)
                             )
    )

fig.show()

## Article

### Average readtime per article

In [None]:
## Average read time per article, taken from the users' history lists

## Users appearing in the first 1000 rows (as a set, so each user is handled once)
unique_user_ids = set(df['user_id'].values[0:1000])

unique_article_ids = df['article_id'].unique()
unique_article_ids = unique_article_ids[~np.isnan(unique_article_ids)]
known_articles = set(unique_article_ids.tolist())

# The history columns come from a join on user_id, so one row per user is
# enough; looping over every row of a user would append the same history once
# per impression. (Presumes the history has one row per user -- TODO confirm.)
user_histories = df[df['user_id'].isin(unique_user_ids)].drop_duplicates(subset='user_id')

# Collect read times per article. No artificial 0 is seeded, which would pull
# every average down.
read_times_by_article = {}
for tmp_article, tmp_read in zip(user_histories['article_id_fixed'], user_histories['read_time_fixed']):
    # As in a plain dict(zip(...)), a user's repeated article keeps its last reading
    per_user = {int(article): read for article, read in zip(tmp_article, tmp_read)}

    for article, read in per_user.items():
        if article in known_articles and not np.isnan(read):
            read_times_by_article.setdefault(article, []).append(read)

# Average once, after ALL users have been processed. Articles without any
# valid reading are left out.
unique_article_read = {article: np.mean(reads) for article, reads in read_times_by_article.items()}

# Scatter of the average read time against article id
indices = list(unique_article_read.keys())
values = list(unique_article_read.values())

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x = indices, y = values, mode = 'markers'))

fig.show()


### Average scroll pct per article

In [None]:
## Average scroll percentage per article, taken from the users' history lists

## Users appearing in the first 2000 rows (as a set, so each user is handled once)
unique_user_ids = set(df['user_id'].values[0:2000])

unique_article_ids = df['article_id'].unique()
unique_article_ids = unique_article_ids[~np.isnan(unique_article_ids)]
known_articles = set(unique_article_ids.tolist())

# The history columns come from a join on user_id, so one row per user is
# enough; looping over every row of a user would append the same history once
# per impression. (Presumes the history has one row per user -- TODO confirm.)
user_histories = df[df['user_id'].isin(unique_user_ids)].drop_duplicates(subset='user_id')

# Collect scroll percentages per article. No artificial 0 is seeded, which
# would pull every average down.
scrolls_by_article = {}
for tmp_article, tmp_scroll in zip(user_histories['article_id_fixed'], user_histories['scroll_percentage_fixed']):
    # As in a plain dict(zip(...)), a user's repeated article keeps its last value
    per_user = {int(article): scroll for article, scroll in zip(tmp_article, tmp_scroll)}

    for article, scroll in per_user.items():
        if article in known_articles and not np.isnan(scroll):
            scrolls_by_article.setdefault(article, []).append(scroll)

# Average per article; articles without any valid value are left out
unique_article_scroll = {article: np.mean(scrolls) for article, scrolls in scrolls_by_article.items()}

indices = list(unique_article_scroll.keys())
avg_scroll_pct = list(unique_article_scroll.values())

fig = go.Figure()

# Add traces
fig.add_trace(go.Histogram(x=avg_scroll_pct))

fig.show()


### Number of articles clicked in a session

In [None]:
# How many article rows does each session contain?
session_sizes = df.groupby('session_id')['article_id'].size()

# Number of sessions for each size
size_counts = session_sizes.value_counts()

# Cover 1..19 as a minimum and extend to the largest session that actually
# occurs, so that a session with 20 or more rows cannot raise KeyError.
largest_session = int(session_sizes.max()) if len(session_sizes) else 0
articles_per_session = {
    k: int(size_counts.get(k, 0)) for k in range(1, max(19, largest_session) + 1)
}

indices = list(articles_per_session.keys())
values = list(articles_per_session.values())

fig = go.Figure()

# Add trace
fig.add_trace(go.Bar(x= indices, y = values, textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

# Devices

In [None]:
# Overall distribution of devices: number of rows per device type
device_counts = df['device_type'].value_counts()
device_counts

In [None]:
# Distinct device types, in order of first appearance
devices = list(df['device_type'].unique())

devices

### Readtime per device

In [None]:
# Read-time distribution per device: one overlaid histogram per device type
devices = list(df['device_type'].unique())

fig = go.Figure()
for d in devices:
    device = df.loc[df['device_type'] == d, 'read_time']
    # Name the trace after the device so the overlaid histograms can be told apart
    fig.add_trace(go.Histogram(x=device, name=str(d)))

# Overlay the histograms
fig.update_layout(barmode='overlay')
# Reduce opacity so overlapping histograms stay visible
fig.update_traces(opacity=0.55)
fig.show()


In [None]:
# Average read time per device: one bar per device type
devices = list(df['device_type'].unique())

# The figure is created once
fig = go.Figure()
for d in devices:
    avg_device_readtime = df.loc[df['device_type'] == d, 'read_time'].mean()
    # Name the trace after the device so the legend is readable
    fig.add_trace(go.Bar(x= [d], y = [avg_device_readtime], name=str(d), textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

### Scroll time per device

In [None]:
# Average scroll percentage per device: one bar per device type
devices = list(df['device_type'].unique())

# The figure is created once
fig = go.Figure()
for d in devices:
    avg_device_scrollpct = df.loc[df['device_type'] == d, 'scroll_percentage'].mean()
    # Name the trace after the device so the legend is readable
    fig.add_trace(go.Bar(x= [d], y = [avg_device_scrollpct], name=str(d), textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

### Topic

In [None]:
## Topic frequency per device: one bar trace per device type
devices = list(df['device_type'].unique())

# One row per (impression, topic); this replaces element-by-element
# df['topics'][i][j] lookups. Impressions with an empty topic list explode to
# NaN and are dropped.
topic_rows = df[['device_type', 'topics']].explode('topics').dropna(subset=['topics'])

# Get all the unique topics
unique_topics = sorted(topic_rows['topics'].unique())

fig = go.Figure()
# Iterate through each device
for d in devices:
    # Topic counts within the rows of that device
    counts = topic_rows.loc[topic_rows['device_type'] == d, 'topics'].value_counts()

    # Every topic gets an entry, 0 when the device never showed it
    device_topic_freq = {topic: int(counts.get(topic, 0)) for topic in unique_topics}

    # Most frequent topic first
    device_topic_freq = dict(sorted(device_topic_freq.items(), key = lambda kv: kv[1], reverse= True))

    # Name the trace after the device so the legend is readable
    fig.add_trace(go.Bar(x= list(device_topic_freq.keys()), y = list(device_topic_freq.values()), name=str(d), textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
# Daily number of impressions per device type
fig = go.Figure()
# Iterate through each device
for d in devices:
    # Impression timestamps of that device
    device_stamps = pd.to_datetime(df.loc[df['device_type'] == d, 'impression_time'])

    # Count impressions per calendar day. value_counts counts every row, so
    # the first impression of a day is included; sort_index orders the days
    # chronologically.
    day_counts = device_stamps.dt.normalize().value_counts().sort_index()
    device_daily_activity = {day.strftime('%m/%d/%Y'): int(count) for day, count in day_counts.items()}

    # Add trace, named after the device so the legend is readable
    fig.add_trace(go.Bar(x= list(device_daily_activity.keys()), y = list(device_daily_activity.values()), name=str(d), textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
# Number of impressions per hour of day, per device type
fig = go.Figure()
# Iterate through each device
for d in devices:
    # Impression timestamps of that device
    device_stamps = pd.to_datetime(df.loc[df['device_type'] == d, 'impression_time'])

    # Count impressions per hour label ('00'..'23'). value_counts counts every
    # row, so the first impression of an hour is included.
    hour_counts = device_stamps.dt.strftime('%H').value_counts().sort_index()
    device_hourly_activity = {hour: int(count) for hour, count in hour_counts.items()}

    # Add trace, named after the device so the legend is readable
    fig.add_trace(go.Bar(x= list(device_hourly_activity.keys()), y = list(device_hourly_activity.values()), name=str(d), textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

# If subscriber

### How many users are subscribers

In [None]:
# How many USERS are subscribers: count each user once (their first row in df
# order) instead of once per impression.
subscriber_counts = df.drop_duplicates(subset='user_id')['is_subscriber'].value_counts()

indices = subscriber_counts.index
values = subscriber_counts.values

fig = go.Figure()

# Add trace
fig.add_trace(go.Bar(x= indices, y = values, textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()


### Read time for subscriber vs non-subscribers

In [None]:
# Read-time distribution: subscribers versus non-subscribers
subscriber = df.loc[df['is_subscriber'] == True, 'read_time']
not_subscriber = df.loc[df['is_subscriber'] == False, 'read_time']

fig = go.Figure()
# Name both traces so the overlaid histograms can be told apart
fig.add_trace(go.Histogram(x=subscriber, name='Subscriber'))
fig.add_trace(go.Histogram(x=not_subscriber, name='Non-subscriber'))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average read time: subscribers versus non-subscribers
subscriber = df.loc[df['is_subscriber'] == True, 'read_time']
not_subscriber = df.loc[df['is_subscriber'] == False, 'read_time']

# Bar labels and heights, in matching order
is_subscriber = ['Yes', 'No']
avg_readtime_is_subscriber = [subscriber.mean(), not_subscriber.mean()]

readtime_bars = go.Bar(x=is_subscriber, y = avg_readtime_is_subscriber, textfont_size=20)
fig = go.Figure(data=[readtime_bars])

# Slightly transparent bars
fig.update_traces(opacity=0.75)
fig.show()

### Scroll percentage if subscriber

In [None]:
# Scroll-percentage distribution: subscribers versus non-subscribers
subscriber = df.loc[df['is_subscriber'] == True, 'scroll_percentage']
not_subscriber = df.loc[df['is_subscriber'] == False, 'scroll_percentage']

fig = go.Figure()
# Name both traces so the overlaid histograms can be told apart
fig.add_trace(go.Histogram(x=subscriber, name='Subscriber'))
fig.add_trace(go.Histogram(x=not_subscriber, name='Non-subscriber'))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average scroll percentage: subscribers versus non-subscribers
subscriber = df.loc[df['is_subscriber'] == True, 'scroll_percentage']
not_subscriber = df.loc[df['is_subscriber'] == False, 'scroll_percentage']

# Bar labels and heights, in matching order
is_subscriber = ['Yes', 'No']
avg_scrollpct_is_subscriber = [subscriber.mean(), not_subscriber.mean()]

scroll_bars = go.Bar(x=is_subscriber, y = avg_scrollpct_is_subscriber, textfont_size=20)
fig = go.Figure(data=[scroll_bars])

# Slightly transparent bars
fig.update_traces(opacity=0.75)
fig.show()

#### Topic Distribution

In [None]:
## Topic frequency by subscription status: one bar trace per status
is_subscriber = list(df['is_subscriber'].unique())

# One row per (impression, topic); this replaces element-by-element
# df['topics'][i][j] lookups. Impressions with an empty topic list explode to
# NaN and are dropped.
topic_rows = df[['is_subscriber', 'topics']].explode('topics').dropna(subset=['topics'])

# Get all the unique topics
unique_topics = sorted(topic_rows['topics'].unique())

fig = go.Figure()
# Iterate through each subscription status
for d in is_subscriber:
    # Topic counts within the rows of that status
    counts = topic_rows.loc[topic_rows['is_subscriber'] == d, 'topics'].value_counts()

    # Every topic gets an entry, 0 when it never occurs for that status
    is_subscriber_topic_freq = {topic: int(counts.get(topic, 0)) for topic in unique_topics}

    # Most frequent topic first
    is_subscriber_topic_freq = dict(sorted(is_subscriber_topic_freq.items(), key = lambda kv: kv[1], reverse= True))

    # Name the trace after the status so the legend is readable
    fig.add_trace(go.Bar(x= list(is_subscriber_topic_freq.keys()), y = list(is_subscriber_topic_freq.values()), name=str(d), textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
## Daily activity: number of impressions per calendar day, per subscription status

is_subscriber = list(df['is_subscriber'].unique())

fig = go.Figure()
# Iterate through each subscription status
for d in is_subscriber:
    # Impression timestamps of this group only
    group_times = pd.to_datetime(df.loc[df['is_subscriber'] == d, 'impression_time'])

    # Count impressions per day. value_counts counts every row, including the
    # first one seen for a day, and sorting on the normalized timestamps
    # (rather than on formatted strings) keeps the order chronological
    per_day = group_times.dt.normalize().value_counts().sort_index()

    # 'mm/dd/YYYY' label -> number of impressions, in chronological order
    is_subscriber_daily_activity = dict(zip(per_day.index.strftime('%m/%d/%Y'), per_day.tolist()))

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(is_subscriber_daily_activity.keys()),
        y=list(is_subscriber_daily_activity.values()),
        name=f'is_subscriber={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
## Hourly activity: number of impressions per hour of day, per subscription status

is_subscriber = list(df['is_subscriber'].unique())

fig = go.Figure()
# Iterate through each subscription status
for d in is_subscriber:
    # Impression timestamps of this group only
    group_times = pd.to_datetime(df.loc[df['is_subscriber'] == d, 'impression_time'])

    # Zero-padded hour label ('00'..'23') -> number of impressions.
    # value_counts counts every row, including the first one seen for an
    # hour; zero-padded labels sort correctly as strings
    is_subscriber_hourly_activity = (
        group_times.dt.strftime('%H').value_counts().sort_index().to_dict()
    )

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(is_subscriber_hourly_activity.keys()),
        y=list(is_subscriber_hourly_activity.values()),
        name=f'is_subscriber={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

# Gender

In [None]:

# Distribution of genders: number of rows per gender value, most common first
gender_counts = df['gender'].value_counts()
indices = gender_counts.index
values = gender_counts.values

# Single bar trace: one bar per gender value
fig = go.Figure(data=[go.Bar(x=indices, y=values, textfont_size=20)])

# Slightly transparent bars
fig.update_traces(opacity=0.75)
fig.show()

### Read time per gender

In [None]:
# Read time distribution per gender (overlaid histograms)

# Skip missing genders: `df['gender'] == NaN` never matches any row, so a NaN
# entry would only add an empty trace to the figure
genders = df['gender'].dropna().unique()

fig = go.Figure()
for d in genders:
    # Read times of the rows with this gender
    gender_ = df.loc[df['gender'] == d, 'read_time']

    # name= labels the histogram in the legend
    fig.add_trace(go.Histogram(x=gender_, name=f'gender={d}'))

# Overlay the histograms
fig.update_layout(barmode='overlay')
# Reduce opacity so overlapping histograms stay visible
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average read time per gender
## One single-bar trace per gender value; bar height = mean read_time

# Distinct gender values with the missing (NaN) entry removed
genders = df['gender'].unique()
is_known = ~np.isnan(genders)
genders = genders[is_known]

fig = go.Figure()
for gender in genders:
    rows_for_gender = df['gender'] == gender
    avg_gender_readtime = df.loc[rows_for_gender, 'read_time'].mean()

    fig.add_trace(go.Bar(x=[gender], y=[avg_gender_readtime], textfont_size=20))

# Slightly transparent bars
fig.update_traces(opacity=0.75)
fig.show()

### Scroll percentage per gender

In [None]:
# Scroll percentage distribution per gender (overlaid histograms)

# Skip missing genders: `df['gender'] == NaN` never matches any row, so a NaN
# entry would only add an empty trace to the figure
genders = df['gender'].dropna().unique()

fig = go.Figure()
for d in genders:
    # Scroll percentages of the rows with this gender
    gender_ = df.loc[df['gender'] == d, 'scroll_percentage']

    # name= labels the histogram in the legend
    fig.add_trace(go.Histogram(x=gender_, name=f'gender={d}'))

# Overlay the histograms
fig.update_layout(barmode='overlay')
# Reduce opacity so overlapping histograms stay visible
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average scroll percentage per gender
## One single-bar trace per gender value; bar height = mean scroll_percentage

# Skip missing genders: `df['gender'] == NaN` selects no rows, so a NaN entry
# would only produce a bar with a NaN label and a NaN height
genders = df['gender'].dropna().unique()

fig = go.Figure()
for d in genders:
    avg_gender_scrollpct = df.loc[df['gender'] == d, 'scroll_percentage'].mean()

    fig.add_trace(go.Bar(x=[d], y=[avg_gender_scrollpct], textfont_size=20))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

### Topics

In [None]:
## Topic frequency per gender
## One bar trace per gender value; bar height = number of rows whose
## `topics` list contains that topic

# Distinct gender values with the missing (NaN) entry removed
genders = df['gender'].unique()
genders = genders[~np.isnan(genders)]

# Get all the unique topics.
# explode() turns each topics list into one row per topic; empty lists (and
# missing values, if any) become NaN and are dropped instead of breaking len()
unique_topics = sorted(df['topics'].explode().dropna().unique())


fig = go.Figure()
# Iterate through each gender
for d in genders:
    # Topics of the rows belonging to this gender, one topic per row
    group_topics = df.loc[df['gender'] == d, 'topics'].explode().dropna()

    # Count every topic (topics the group never saw stay at 0), then order
    # from most to least frequent; the stable sort keeps ties alphabetical
    genders_topic_freq = (
        group_topics.value_counts()
        .reindex(unique_topics, fill_value=0)
        .sort_values(ascending=False, kind='stable')
        .to_dict()
    )

    # name= labels the trace in the legend so the groups can be told apart
    fig.add_trace(go.Bar(
        x=list(genders_topic_freq.keys()),
        y=list(genders_topic_freq.values()),
        name=f'gender={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
## Daily activity: number of impressions per calendar day, per gender

# Distinct gender values with the missing (NaN) entry removed
genders = df['gender'].unique()
genders = genders[~np.isnan(genders)]

fig = go.Figure()
# Iterate through each gender
for d in genders:
    # Impression timestamps of this gender only
    group_times = pd.to_datetime(df.loc[df['gender'] == d, 'impression_time'])

    # Count impressions per day. value_counts counts every row, including the
    # first one seen for a day, and sorting on the normalized timestamps
    # (rather than on formatted strings) keeps the order chronological
    per_day = group_times.dt.normalize().value_counts().sort_index()

    # 'mm/dd/YYYY' label -> number of impressions, in chronological order
    genders_daily_activity = dict(zip(per_day.index.strftime('%m/%d/%Y'), per_day.tolist()))

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(genders_daily_activity.keys()),
        y=list(genders_daily_activity.values()),
        name=f'gender={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
## Hourly activity: number of impressions per hour of day, per gender

# Distinct gender values with the missing (NaN) entry removed
genders = df['gender'].unique()
genders = genders[~np.isnan(genders)]

fig = go.Figure()
# Iterate through each gender
for d in genders:
    # Impression timestamps of this gender only
    group_times = pd.to_datetime(df.loc[df['gender'] == d, 'impression_time'])

    # Zero-padded hour label ('00'..'23') -> number of impressions.
    # value_counts counts every row, including the first one seen for an
    # hour; zero-padded labels sort correctly as strings
    genders_hourly_activity = (
        group_times.dt.strftime('%H').value_counts().sort_index().to_dict()
    )

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(genders_hourly_activity.keys()),
        y=list(genders_hourly_activity.values()),
        name=f'gender={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

# Age

### Age Distribution

In [7]:
# Distribution of ages: number of rows per age value, ordered by age
age_counts = df['age'].value_counts().sort_index()
indices = age_counts.index
values = age_counts.values

# Single bar trace: one bar per age value
fig = go.Figure(data=[go.Bar(x=indices, y=values, textfont_size=20)])

# Slightly transparent bars
fig.update_traces(opacity=0.75)
fig.show()

#### Average readtime per age group

In [None]:
# Read time distribution per age value (overlaid histograms)

# Age values present in the data, in ascending order, with their row counts
age_counts = df['age'].value_counts().sort_index()
indices = age_counts.index
values = age_counts.values

fig = go.Figure()

# One histogram trace per age value
for age in indices:
    rows_for_age = df['age'] == age
    read_time = df.loc[rows_for_age, 'read_time']
    fig.add_trace(go.Histogram(x=read_time))

# Draw the histograms on top of each other, slightly transparent
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()


In [None]:
# Average read time per age value
## One single-bar trace per age; bar height = mean read_time

# Age values present in the data, in ascending order, with their row counts
age_counts = df['age'].value_counts().sort_index()
indices = age_counts.index
values = age_counts.values

fig = go.Figure()

for age in indices:
    rows_for_age = df['age'] == age
    avg_read_time = df.loc[rows_for_age, 'read_time'].mean()
    fig.add_trace(go.Bar(x=[age], y=[avg_read_time], textfont_size=20))

fig.show()

#### Average scroll percentage per age group

In [None]:
# Scroll percentage distribution per age value (overlaid histograms)

# Age values present in the data, in ascending order, with their row counts
age_counts = df['age'].value_counts().sort_index()
indices = age_counts.index
values = age_counts.values

fig = go.Figure()

# One histogram trace per age value
for age in indices:
    rows_for_age = df['age'] == age
    scroll_pct = df.loc[rows_for_age, 'scroll_percentage']
    fig.add_trace(go.Histogram(x=scroll_pct))

# Draw the histograms on top of each other, slightly transparent
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average scroll percentage per age value
## One single-bar trace per age; bar height = mean scroll_percentage

# Age values present in the data, in ascending order, with their row counts
age_counts = df['age'].value_counts().sort_index()
indices = age_counts.index
values = age_counts.values

fig = go.Figure()

for age in indices:
    rows_for_age = df['age'] == age
    avg_scroll_pct = df.loc[rows_for_age, 'scroll_percentage'].mean()
    fig.add_trace(go.Bar(x=[age], y=[avg_scroll_pct], textfont_size=20))

fig.show()

### Topics

In [8]:
## Topic frequency per age value
## One bar trace per age; bar height = number of rows whose `topics` list
## contains that topic

# Distinct age values with the missing (NaN) entry removed
ages = df['age'].unique()
ages = ages[~np.isnan(ages)]

# Get all the unique topics.
# explode() turns each topics list into one row per topic; empty lists (and
# missing values, if any) become NaN and are dropped instead of breaking len()
unique_topics = sorted(df['topics'].explode().dropna().unique())


fig = go.Figure()
# Iterate through each age value
for d in ages:
    # Topics of the rows belonging to this age, one topic per row
    group_topics = df.loc[df['age'] == d, 'topics'].explode().dropna()

    # Count every topic (topics the group never saw stay at 0), then order
    # from most to least frequent; the stable sort keeps ties alphabetical
    ages_topic_freq = (
        group_topics.value_counts()
        .reindex(unique_topics, fill_value=0)
        .sort_values(ascending=False, kind='stable')
        .to_dict()
    )

    # name= labels the trace in the legend so the groups can be told apart
    fig.add_trace(go.Bar(
        x=list(ages_topic_freq.keys()),
        y=list(ages_topic_freq.values()),
        name=f'age={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [9]:
## Daily activity: number of impressions per calendar day, per age value

# Distinct age values with the missing (NaN) entry removed
ages = df['age'].unique()
ages = ages[~np.isnan(ages)]

fig = go.Figure()
# Iterate through each age value
for d in ages:
    # Impression timestamps of this age only
    group_times = pd.to_datetime(df.loc[df['age'] == d, 'impression_time'])

    # Count impressions per day. value_counts counts every row, including the
    # first one seen for a day, and sorting on the normalized timestamps
    # (rather than on formatted strings) keeps the order chronological
    per_day = group_times.dt.normalize().value_counts().sort_index()

    # 'mm/dd/YYYY' label -> number of impressions, in chronological order
    ages_daily_activity = dict(zip(per_day.index.strftime('%m/%d/%Y'), per_day.tolist()))

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(ages_daily_activity.keys()),
        y=list(ages_daily_activity.values()),
        name=f'age={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [10]:
## Hourly activity: number of impressions per hour of day, per age value

# Distinct age values with the missing (NaN) entry removed
ages = df['age'].unique()
ages = ages[~np.isnan(ages)]

fig = go.Figure()
# Iterate through each age value
for d in ages:
    # Impression timestamps of this age only
    group_times = pd.to_datetime(df.loc[df['age'] == d, 'impression_time'])

    # Zero-padded hour label ('00'..'23') -> number of impressions.
    # value_counts counts every row, including the first one seen for an
    # hour; zero-padded labels sort correctly as strings
    ages_hourly_activity = (
        group_times.dt.strftime('%H').value_counts().sort_index().to_dict()
    )

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(ages_hourly_activity.keys()),
        y=list(ages_hourly_activity.values()),
        name=f'age={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

# Postcodes

### Distribution of Post Codes

In [14]:
# Distribution of postcodes: number of rows per postcode, most common first
postcode_counts = df['postcode'].value_counts()
indices = postcode_counts.index
values = postcode_counts.values

# Single bar trace: one bar per postcode value
fig = go.Figure(data=[go.Bar(x=indices, y=values, textfont_size=20)])

# Slightly transparent bars
fig.update_traces(opacity=0.75)
fig.show()

### Average Read Time in each postal code

In [None]:
# Read time distribution per postcode (overlaid histograms)

# Postcodes present in the data, in ascending order, with their row counts
postcode_counts = df['postcode'].value_counts().sort_index()
indices = postcode_counts.index
values = postcode_counts.values

fig = go.Figure()

# One histogram trace per postcode
for postcode in indices:
    rows_for_postcode = df['postcode'] == postcode
    read_time = df.loc[rows_for_postcode, 'read_time']
    fig.add_trace(go.Histogram(x=read_time))

# Draw the histograms on top of each other, slightly transparent
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average read time per postcode
## One single-bar trace per postcode; bar height = mean read_time

# Postcodes present in the data, in ascending order, with their row counts
postcode_counts = df['postcode'].value_counts().sort_index()
indices = postcode_counts.index
values = postcode_counts.values

fig = go.Figure()

for postcode in indices:
    rows_for_postcode = df['postcode'] == postcode
    avg_read_time = df.loc[rows_for_postcode, 'read_time'].mean()
    fig.add_trace(go.Bar(x=[postcode], y=[avg_read_time], textfont_size=20))

# Overlay bar mode with slightly transparent bars
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

### Average Scroll pct in each postal code

In [None]:
# Scroll percentage distribution per postcode (overlaid histograms)

# Postcodes present in the data, in ascending order, with their row counts
postcode_counts = df['postcode'].value_counts().sort_index()
indices = postcode_counts.index
values = postcode_counts.values

fig = go.Figure()

# One histogram trace per postcode
for postcode in indices:
    rows_for_postcode = df['postcode'] == postcode
    scroll_pct = df.loc[rows_for_postcode, 'scroll_percentage']
    fig.add_trace(go.Histogram(x=scroll_pct))

# Draw the histograms on top of each other, slightly transparent
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Average scroll percentage per postcode
## One single-bar trace per postcode; bar height = mean scroll_percentage

# Postcodes present in the data, in ascending order, with their row counts
postcode_counts = df['postcode'].value_counts().sort_index()
indices = postcode_counts.index
values = postcode_counts.values

fig = go.Figure()

for postcode in indices:
    rows_for_postcode = df['postcode'] == postcode
    avg_scroll_pct = df.loc[rows_for_postcode, 'scroll_percentage'].mean()
    fig.add_trace(go.Bar(x=[postcode], y=[avg_scroll_pct], textfont_size=20))

# Overlay bar mode with slightly transparent bars
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

### Topics

In [16]:
## Topic frequency per postcode
## One bar trace per postcode; bar height = number of rows whose `topics`
## list contains that topic

# Distinct postcode values with the missing (NaN) entry removed
postcodes = df['postcode'].unique()
postcodes = postcodes[~np.isnan(postcodes)]

# Get all the unique topics.
# explode() turns each topics list into one row per topic; empty lists (and
# missing values, if any) become NaN and are dropped instead of breaking len()
unique_topics = sorted(df['topics'].explode().dropna().unique())


fig = go.Figure()
# Iterate through each postcode
for d in postcodes:
    # Topics of the rows belonging to this postcode, one topic per row
    group_topics = df.loc[df['postcode'] == d, 'topics'].explode().dropna()

    # Count every topic (topics the group never saw stay at 0), then order
    # from most to least frequent; the stable sort keeps ties alphabetical
    postcodes_topic_freq = (
        group_topics.value_counts()
        .reindex(unique_topics, fill_value=0)
        .sort_values(ascending=False, kind='stable')
        .to_dict()
    )

    # name= labels the trace in the legend so the groups can be told apart
    fig.add_trace(go.Bar(
        x=list(postcodes_topic_freq.keys()),
        y=list(postcodes_topic_freq.values()),
        name=f'postcode={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [18]:
## Daily activity: number of impressions per calendar day, per postcode

# Distinct postcode values with the missing (NaN) entry removed
postcodes = df['postcode'].unique()
postcodes = postcodes[~np.isnan(postcodes)]

fig = go.Figure()
# Iterate through each postcode
for d in postcodes:
    # Impression timestamps of this postcode only
    group_times = pd.to_datetime(df.loc[df['postcode'] == d, 'impression_time'])

    # Count impressions per day. value_counts counts every row, including the
    # first one seen for a day, and sorting on the normalized timestamps
    # (rather than on formatted strings) keeps the order chronological
    per_day = group_times.dt.normalize().value_counts().sort_index()

    # 'mm/dd/YYYY' label -> number of impressions, in chronological order
    postcodes_daily_activity = dict(zip(per_day.index.strftime('%m/%d/%Y'), per_day.tolist()))

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(postcodes_daily_activity.keys()),
        y=list(postcodes_daily_activity.values()),
        name=f'postcode={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [22]:
## Hourly activity: number of impressions per hour of day, per postcode

# Distinct postcode values with the missing (NaN) entry removed
postcodes = df['postcode'].unique()
postcodes = postcodes[~np.isnan(postcodes)]


fig = go.Figure()
# Iterate through each postcode
for d in postcodes:
    # Impression timestamps of this postcode only
    group_times = pd.to_datetime(df.loc[df['postcode'] == d, 'impression_time'])

    # Zero-padded hour label ('00'..'23') -> number of impressions.
    # value_counts counts every row, including the first one seen for an
    # hour; zero-padded labels sort correctly as strings
    postcodes_hourly_activity = (
        group_times.dt.strftime('%H').value_counts().sort_index().to_dict()
    )

    # Add trace; name= labels the group in the legend
    fig.add_trace(go.Bar(
        x=list(postcodes_hourly_activity.keys()),
        y=list(postcodes_hourly_activity.values()),
        name=f'postcode={d}',
        textfont_size=20,
    ))


# Reduce opacity of the bars
fig.update_traces(opacity=0.75)
fig.show()

In [32]:
# How many articles are recorded per session?
# NOTE(review): this counts the rows of each session taken from the
# 'article_id' column, duplicates and missing values included. Confirm whether
# unique clicked articles ('article_ids_clicked') were intended instead.

# List of article ids for every session
tmp_aps = df.groupby('session_id')['article_id'].apply(list)

# Number of entries in each session, then how many sessions have each size
session_sizes = tmp_aps.map(len)
size_counts = session_sizes.value_counts()

# Always show 1..19 on the x axis, and extend the range when a session is
# larger so it gets counted instead of raising KeyError
largest_size = max(19, int(session_sizes.max())) if len(session_sizes) else 19
articles_per_session = {k: int(size_counts.get(k, 0)) for k in range(1, largest_size + 1)}


indices = list(articles_per_session.keys())
values = list(articles_per_session.values())


fig = go.Figure()

# Add trace: one bar per session size
fig.add_trace(go.Bar(x=indices, y=values, textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()
