Task
- Refactor code
    - Fix all visualizations
    - Functionalize everything
- Fix the plots
- Start on EDA
    - Each major feature will cover distribution of the feature, readtime/scroll pct, how it relates to the topics selected, and activity distribution.
        - We will use histograms, bar plots, box plots, and scatter plots (can look at pie charts too)
        - Separate out some plots maybe
        - Look into changing activity to scatter plots
        - Next thing we have to do is look update the multiple bar plot
        
- Model selection

Background Information on Dataset: BLAH BLAH BLAH

Importing Packages

In [288]:
import numpy as np
import pandas as pd

import plotly
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime
from plotly.subplots import make_subplots

# EDA

Load in dataset

In [289]:
#Load in various dataframes
## Articles
df_art = pd.read_parquet("Data/Small/articles.parquet")

## Behaviors
df_bev = pd.read_parquet("Data/Small/train/behaviors.parquet")

## History
df_his = pd.read_parquet("Data/Small/train/history.parquet")



Join the data sources

In [290]:
# Convert datatype of column first
df_bev['article_ids_clicked'] = df_bev['article_ids_clicked'].apply(lambda x: x[0])

In [291]:
# Join bevhaiors to article
df= df_bev.join(df_art.set_index("article_id"), on = "article_ids_clicked")

# Join bevhaiors to history 
df= df.join(df_his.set_index("user_id"), on = "user_id")

# Drop all other dataframes from me
df_bev = []
df_his = []
df_art = []

In [292]:
# Preprocessing
df.dropna(subset=['article_id'], inplace= True)
#df.dropna(subset =['age'], inplace = True)

# Change article IDs into int
df['article_id'] = df['article_id'].apply(lambda x: int(x))
df['article_id']= df['article_id'].astype(np.int64)


# Change genders from float to strings
def gender_(x):
    if x == 0.0:
        return 'Male'
    elif x == 1.0:
        return 'Female'
    else:
        return None

df['gender'] = df['gender'].apply(lambda x: gender_(x))

# Change age to int
#df['age'] = df['age'].apply(lambda x: np.int_(x) if np.isnan(x) == False else x)
#df['age'] = df[~df['age'].isnull()]['age'].astype(np.int32)

# Change age to str it's a range
df['age'] = df['age'].astype('Int64')
df['age'] = df['age'].astype(str)
df['age'] = df['age'].apply(lambda x: x if x == '<NA>' else x + ' - ' + x[0] + '9' )



# Change postcodes
# Change genders from float to strings
def postcodes_(x):
    if x == 0.0:
        return 'Metropolitan'
    elif x == 1.0:
        return 'Rural District'

    elif x == 2.0:
        return 'Municipality'
    
    elif x == 3.0:
        return 'Provincial'
    
    elif x == 4.0:
        return 'Big City'

    else:
        return None

df['postcode'] = df['postcode'].apply(lambda x: postcodes_(x))





Visualizations

Lets calculate the unique users for hourly, daily, and day of the week. Let's use a subset of the data until we know our plots are very good

#### Biggest thing is user engagement : Bigger User Engagement -> More eveneue
#### We need to maximize the amount of ads these guys are viewing -> this leads on to them clicking on new articles for ads
#### So, let's not make article length too short so that people can maximize their session lengths with a lot of articles!

In [None]:
- subplot without shared axes
- subplot with

In [293]:
def single_subset_bar(df_, feature_, xaxis_title, yrange):
    # Index and Values 
    indices = [xaxis_title]
    values_ = [len(df_[feature_].unique())]

    # Instantiate figure object
    fig = go.Figure()

    # Add traces
    fig.add_trace(go.Bar(x = indices, y = values_,
                        width=[0.3], text = '<b>{}<b>'.format(values_[0]),
                            )

    )
    # Update axis properties
    fig.update_yaxes(
        title_text= 'Count', range = yrange
        )

    # Update trace properties
    fig.update_traces(
        textposition='outside',
        textfont=dict(
            family='sans serif',
            size=16,
            color='#1f77b4'
            )
            )
            
    # Update layout of plot
    fig.update_layout(
        title = '<b>Total {}<b>'.format(xaxis_title) ,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )

    return fig.show()

In [294]:
def multiple_subset_bar(df_, feature_, yrange):

    # Assign tmp_df based on feature
    if feature_ == 'age':
        tmp_df = df_[df_['age'] != '<NA>']
    else:
        tmp_df = df_[~df_[feature_].isnull()]

    
    categories = [d for d in tmp_df[feature_].unique()]
    categories.sort()


    fig = go.Figure()
    for category_ in categories:
        ## Check for Frequency vs Count 
        count= len(tmp_df[tmp_df[feature_] == category_])

        fig.add_trace(
            go.Bar(
                x= [str(category_)], y = [count],
                text = '<b>{}<b>'.format(count), 
                name= str(category_)
            )
                
        )

    # Update trace properties
        
    # Update axis properties
    fig.update_yaxes(
        title_text= 'Count', range = yrange
        )
    
    fig.update_xaxes(
        title_text= str(feature_)
        )

    # Update trace properties
    fig.update_traces(
        textposition='outside',
        textfont=dict(
            family='sans serif',
            size=16,
            color='#1f77b4'
            )
            )
            
    # Update layout of plot
    fig.update_layout(
        title = '<b>Distribution of {}<b>'.format(feature_) ,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )

    return fig.show()

In [295]:
def single_subset_feature_visualization(df_,  feature_, data_title):
    """ Takes in an array of values and will output a histogram / boxplot / average""" 
    fig = make_subplots(
        rows=3, cols=1, subplot_titles=("<b>Histogram<b>", "<b>Box plot<b>", "<b>Average {} for {}<b>".format(feature_, data_title))
    )
    
    tmp_df = df_[~df_[feature_].isnull()]
    
    values = tmp_df[feature_].values
    # Average
    average = values.mean()

    # Add traces
    fig.add_trace(go.Histogram(x= values, name = 'Histogram'), row=1, col=1)

    xo = [data_title for x in range(0, len(values))]
    fig.add_trace(go.Box(y = values, x = xo, name = 'Box plot'), row=2, col=1)
    fig.add_trace(go.Bar(x=[data_title], y = [average], width = [0.3], name = 'Bar plot'),  row=3, col=1)

    # Update xaxis properties
    fig.update_xaxes(title_text=str(feature_), row=1, col=1)
   # fig.update_xaxes(title_text=str(feature_), row=2, col=1)
   # fig.update_xaxes(title_text=str(feature_), row=3, col=1)
   

    # Update yaxis properties
    fig.update_yaxes(title_text='Count', row=1, col=1)
    fig.update_yaxes(title_text=str(feature_), row=2, col=1)
    fig.update_yaxes(title_text=str(feature_), range = [0, 110], row=3, col=1)
    
    # Update suplot title sizes
    fig.update_annotations(font_size= 20,)

    # Update title and height
    fig.update_layout(
        title_text= "<b>Distributions of {} for {}<b>".format(feature_, data_title), height= 750, width = 1000,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )
        
        


    return fig.show()
    
    

In [296]:
def multiple_subset_feature_visualization(df_,  feature_1, feature_2):
    """ Takes in an array of values and will output a histogram / boxplot / average""" 
    fig = make_subplots(
        rows=3, cols=1, subplot_titles=("<b>Histogram<b>", "<b>Box plot<b>", "<b>Average {} for each {}<b>".format(feature_2, feature_1))
    )

    # Assign tmp_df based on feature
    if feature_1 == 'age':
        tmp_df = df_[df_['age'] != '<NA>']
    else:
        tmp_df = df_[~df_[feature_1].isnull()]

    categories = [d for d in tmp_df[feature_1].unique()]
    categories.sort()

    # Populate the graph
    for category_ in categories:
        subset_feature_2 = tmp_df[tmp_df[feature_1] == category_][feature_2].values
        avg = tmp_df[tmp_df[feature_1] == category_][feature_2].mean()
        # Add histogram
        fig.add_trace(
            go.Histogram(
                x= subset_feature_2,
                name = str(category_) + ' Histogram',
                ),
                
                row=1, col=1
                
        )
        # Add Boxplot
        ## Need to create an array that is similar to the array used in subset_feature_2, to name the traces!
        xo = [str(category_) for x in range(0, len(subset_feature_2))]
        
        fig.add_trace(
            go.Box(
                y= subset_feature_2, x = xo,
                name = str(category_) + ' Box',
                ),
                
                row=2, col=1
                
        )
        # Add Bar
       # count= len(subset_feature_2)


       # print(subset_feature_2)
        fig.add_trace(
            go.Bar(
                x= [str(category_)], y = [avg],
                text = '<b>{}<b>'.format(avg), 
                textposition='outside',
                name= str(category_) + ' Bar',
                textfont=dict(
                family='sans serif',
                size=18,
                color='#1f77b4'
                )
                
            ), 
            row =3, col = 1
                
        )



    
   # tmp_df = df_[~df_[feature_].isnull()]
    
   # values = tmp_df[feature_2].values


    # Update xaxis properties
    fig.update_xaxes(title_text=str(feature_2), row=1, col=1)
    fig.update_xaxes(title_text=str(feature_1), row=2, col=1)
    fig.update_xaxes(title_text=str(feature_1), row=3, col=1)
   

    # Update yaxis properties
    fig.update_yaxes(title_text='Count', row=1, col=1)
    fig.update_yaxes(title_text=str(feature_2), row=2, col=1)
    fig.update_yaxes(title_text=str(feature_2), range = [0, 125], row=3, col=1)
    
    # Update suplot title sizes
    fig.update_annotations(font_size= 20,)


    # Update title and height
    fig.update_layout(
        title_text= "<b>Distributions of {} for each {}<b>".format(feature_2, feature_1), height= 750, width = 1000,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
        
        )
            


    return fig.show()

In [355]:
def plot_bar(indices_, values_, yrange_, xaxis_title, yaxis_title, title_):
    """Given indices and values populate a bar graph """

    fig = go.Figure()
    for idx, val in zip(indices_, values_):
        ## Check for Frequency vs Count 

        fig.add_trace(
            go.Bar(
                x= [str(idx)], y = [val],
                text = '<b>{}<b>'.format(val), 
                name= str(idx)
            )
                
        )

    # Update trace properties
        
    # Update axis properties
    fig.update_yaxes(
        title_text= yaxis_title, range = yrange_
        )
    
    fig.update_xaxes(
        title_text= xaxis_title
        )

    # Update trace properties
    fig.update_traces(
        textposition='outside',
        textfont=dict(
            family='sans serif',
            size=16,
            color='#1f77b4'
            )
            )
            
    # Update layout of plot
    fig.update_layout(
        title = title_, height= 750, width = 1000,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )

    return fig.show()

In [356]:
def plot_box(indices_, values_, yrange_, xaxis_title, yaxis_title, title_):
    """Given indices and values populate a bar graph """

    fig = go.Figure()
    for idx, val in zip(indices_, values_):
        ## Check for Frequency vs Count 

        fig.add_trace(
            go.Box(
                y = [val],
                #text = '<b>{}<b>'.format(val), 
                name= str(idx)
            )
                
        )

    # Update trace properties
        
    # Update axis properties
    fig.update_yaxes(
        title_text= yaxis_title, range = yrange_
        )
    
    fig.update_xaxes(
        title_text= xaxis_title
        )

    # Update trace properties

            
    # Update layout of plot
    fig.update_layout(
        title = title_, height= 750, width = 1000,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )

    return fig.show()

In [357]:
def plot_scatter(indices_, values_, yrange_, xaxis_title, yaxis_title, title_):
    """Given indices and values populate a bar graph """

    fig = go.Figure()

    fig.add_trace(go.Scatter( x = indices, y = values_, mode = 'lines', name = 'Line', marker = dict(color = "rgba(135, 206, 250, 0.5)" )

    )
    )

    for idx, val in zip(indices_, values_):
        ## Check for Frequency vs Count 

        fig.add_trace(
            go.Scatter(
                x= [str(idx)], y = [val],
                text = '<b>{}<b>'.format(val), 
                name= str(idx), 
                marker=dict(size = 12, 
                
            ),
            mode='lines+markers+text'
                
        )
        )


    # Update trace properties
        
    # Update axis properties
    fig.update_yaxes(
        title_text= yaxis_title, range = yrange_
        )
    
    fig.update_xaxes(
        title_text= xaxis_title
        )
    
    # Update trace properties
    fig.update_traces(
        textposition='bottom center',
        textfont=dict(
            family='sans serif',
            size=12,
            color='#1f77b4'
            )
            )
       
    # Update layout of plot
    fig.update_layout(
        title = title_, height= 750, width = 1000,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )

    return fig.show()

## Overall Feature Analysis

### Number of Impressions

In [None]:
# Number of Impressions
single_subset_bar(df_ = df, feature_ = 'impression_id', xaxis_title = 'Number of Impressions', yrange = [0, 80000])

### Distribution of Readtimes

In [None]:
single_subset_feature_visualization(df_ = df, feature_ = 'read_time')

### Distribution of Scroll Percentages

In [None]:
single_subset_feature_visualization(df_ = df, feature_ = 'scroll_percentage')

## Users

### Number of Users


In [None]:
single_subset_bar(df_ = df, feature_ = 'user_id', xaxis_title = 'Number of Users', yrange = [0, 11000])

### Daily User growth

In [None]:
unique_user_ids = df['user_id'].unique()

# Create dictionaries
unique_users_daily_growth_freq= {}
unique_users_hourly_freq = {}
unique_users_dayofweek_freq = {}
unique_users_weekly_freq = {}

for id in unique_user_ids:
    # Get the subset of that user id
    tmp_df = df[df['user_id'] == id]
    first_index = tmp_df['impression_time_fixed'].index[0]
    tmp_datetime = pd.DatetimeIndex(tmp_df['impression_time_fixed'][first_index])
    tmp_date = tmp_datetime[0].date()
    join_date = tmp_date.strftime('%m/%d/%Y')
    #join_date = join_date.strftime('%m/%d/%Y')
    
    if join_date not in unique_users_daily_growth_freq:
        unique_users_daily_growth_freq[join_date] = 1
    else:
        unique_users_daily_growth_freq[join_date] +=1
    
unique_users_daily_growth_freq = dict(sorted(unique_users_daily_growth_freq.items()))

In [None]:
indices = [x for x in unique_users_daily_growth_freq.keys()]
values = [x for x in unique_users_daily_growth_freq.values()]

fig = go.Figure()


# Add traces
fig.add_trace(go.Bar(x = indices, y = values,
                         
                         )

)

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### Average readtime per user

In [None]:
## Average readtime per user

tmp_user_df = pd.DataFrame(data = df.groupby(by =  'user_id')['read_time'].mean(), columns = ['read_time'])
single_subset_feature_visualization(df_ = tmp_user_df,  feature_ = 'read_time', data_title = 'Unique Users')



### Average scroll percentage per user

In [None]:
## Average readtime per user

tmp_user_df = pd.DataFrame(data = df.groupby(by =  'user_id')['scroll_percentage'].mean(), columns = ['scroll_percentage'])
single_subset_feature_visualization(df_ = tmp_user_df,  feature_ = 'scroll_percentage', data_title = 'Unique Users')

### User Activity

In [299]:
def populate_dict(list_, dict_):
    
    for idx in list_:
        if idx not in dict_:
            dict_[idx] = 1
        else:
            dict_[idx] += 1

In [300]:
def weekly_map(list_):
    weeks = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
    placeholder = [i for i in range(1, 9)]
    dict_ = {k:v for k,v in zip(weeks, placeholder)}
    res = []

    for idx in list_:
        res.append('Week '+ str(dict_[idx]))
    
    return res


In [301]:
def int_dow_dict(dict_):

    str_dow = ['Monday', 'Tuesday', 'Wednesday' , 'Thursday', 'Friday', 'Saturday', 'Sunday']
    int_dow = [i for i in range(7)]
    dow_dict = {k:v for k,v in zip(int_dow, str_dow)}

    res = {}

    for keys in dict_.keys():
        res[dow_dict[keys]] = dict_[keys]


    
    return res

In [None]:
# Let's make subsets for each user_id and then populate the frequency based on that

## Get all unique ids in a list

unique_user_ids = df['user_id'].unique()[0:1000]

# Create dictionaries
unique_users_daily_freq = {}
unique_users_hourly_freq = {}
unique_users_dayofweek_freq = {}
unique_users_weekly_freq = {}

# Generate an empty dataframe with the unique user ids
## append new entries

for id in unique_user_ids:
    # Get the subset of that user id
    tmp_df = df[df['user_id'] == id]

    # Now lets go through each and populate the unique dates, hours and day of the week for each user
    dates = []
    hours = []
    dayofweek = []
    week = []
    indices = np.array(tmp_df.index)
    
    for i in indices:
        tmp_datetime = pd.DatetimeIndex(tmp_df['impression_time_fixed'][i])
        tmp_date = tmp_datetime.date
        tmp_time = tmp_datetime.time
        tmp_dayofweek = tmp_datetime.weekday
        tmp_week = tmp_datetime.isocalendar().week

        for j,k,l,m in zip(tmp_date, tmp_time, tmp_dayofweek, tmp_week):
            dates.append(j)
            hours.append(k)
            dayofweek.append(l)
            week.append(m)

    
    # Get rid of duplicate values
    unique_dates = list(set(dates))
    unique_hours = list(set(hours))
    unique_dayofweek = list(set(dayofweek))
    unique_week = list(set(week))
    
    # Convert to string
    unique_hours = [x.hour for x in unique_hours]
    unique_hours = [str(i) + ':00' if i > 9 else str(0) + str(i) + ':00' for i in unique_hours]

    # Convert Week to W
    # Convert the week int to mapping from 1++
    unique_week= weekly_map(unique_week)
        
    # Populate dicts
    populate_dict(list_ = unique_dates, dict_ = unique_users_daily_freq)
    populate_dict(list_ = unique_hours, dict_ = unique_users_hourly_freq)
    populate_dict(list_ = unique_dayofweek, dict_ = unique_users_dayofweek_freq)
    populate_dict(list_ = unique_week, dict_ = unique_users_weekly_freq)


# Create tmp dataframes
unique_users_daily_freq = dict(sorted(unique_users_daily_freq.items()))
unique_users_hourly_freq = dict(sorted(unique_users_hourly_freq.items()))

# Sort by integers for day of the week and then lets chane the dict
unique_users_dayofweek_freq = dict(sorted(unique_users_dayofweek_freq.items()))
unique_users_dayofweek_freq = int_dow_dict(unique_users_dayofweek_freq)

unique_users_weekly_freq = dict(sorted(unique_users_weekly_freq.items()))
                

#### Daily User Activity

In [None]:
# What does the daily user activity look like?
unique_users_daily_freq = dict(sorted(unique_users_daily_freq.items()))

indices = [x for x in unique_users_daily_freq.keys()]
values = [x for x in unique_users_daily_freq.values()]


plot_scatter(
    indices_ = indices , values_ = values,
    yrange_ = [200, 900], xaxis_title = 'Date',
    yaxis_title= 'Active Users', title_ = '<b>Daily Active Users<b>'
    )

#### Hourly User Activity

In [None]:
# What does the hourly user activity look like?
indices = [x for x in unique_users_hourly_freq.keys()]
values = [x for x in unique_users_hourly_freq.values()]

fig = go.Figure()

plot_scatter(
    indices_ = indices , values_ = values,
    yrange_ = [0, 20000], xaxis_title = 'Hour',
    yaxis_title= 'Active Users', title_ = '<b>Hourly Active Users<b>'
    )

#### Weekly User Activity

In [None]:
# What does the weekly user activity look like?
indices = [x for x in unique_users_weekly_freq.keys()]
values = [x for x in unique_users_weekly_freq.values()]

plot_bar(
    indices_ = indices, values_ = values,
    yrange_ = [0, 1100], xaxis_title = 'Week',
    yaxis_title= 'Active Users', title_ = '<b> Weekly Active Users <b>')

#### Day Of The Week User Activity

In [None]:
# What does the weekly user activity look like?
indices = [x for x in unique_users_dayofweek_freq.keys()]
values = [x for x in unique_users_dayofweek_freq.values()]

plot_bar(
    indices_ = indices, values_ = values,
    yrange_ = [0, 1100], xaxis_title = 'Day',
    yaxis_title= 'Active Users', title_ = '<b> Day of the Week Activity  <b>')

## Session

### Number of Sessions

In [None]:
single_subset_bar(df_ = df, feature_ = 'session_id', xaxis_title = 'Number of Sessions', yrange = [0, 40000])

### Average readtime per session

In [None]:
tmp_session_df = pd.DataFrame(data = df.groupby(by =  'session_id')['read_time'].mean(), columns = ['read_time'])
single_subset_feature_visualization(df_ = tmp_session_df,  feature_ = 'read_time', data_title = 'Unique Sessions')

### Average scroll percentage per session

In [None]:
tmp_session_df = pd.DataFrame(data = df.groupby(by =  'session_id')['scroll_percentage'].mean(), columns = ['scroll_percentage'])
single_subset_feature_visualization(df_ = tmp_session_df,  feature_ = 'scroll_percentage', data_title = 'Unique Sessions')

### Daily Active Sessions

In [None]:
# Number of unique sessions per day
copy_df = df.copy()
copy_df['impression_time'] = copy_df['impression_time'].apply(lambda x: x.strftime('%m/%d/%Y'))

unique_sessions_per_day = copy_df.groupby(by = 'session_id')['impression_time'].min()
tmp_dau_df = pd.DataFrame(data = unique_sessions_per_day.values, index = unique_sessions_per_day.keys(), columns= ['Session Dates'])

# Plot
multiple_subset_bar(df_ = tmp_dau_df , feature_ = 'Session Dates', yrange = [0, 8000])

## Topic

### Number of Topics

In [302]:

# Create a Topics DF
topic_list = []
for index in df['topics']:
    for topic_ in index:
        if topic_ not in topic_list:
            topic_list.append(topic_)

tmp_topic_df = pd.DataFrame(data = topic_list, columns = ['topics'])


# Number of Topics
single_subset_bar(df_ = tmp_topic_df, feature_ = 'topics', xaxis_title = 'Number of Topics', yrange = [0, 100])


In [338]:
def article_id_scroll_read (dict_, res):
    
    for k,v in zip(dict_.keys(), dict_.values()):
        if (k in res.keys()):
            tmp_array= np.append(res[k],v)
            res[k] = tmp_array
        if (k not in res.keys()):
            res[k] = []
    
    return res

In [401]:
# Let's get unique_user_ids topics

## Get all unique ids in a list

unique_user_ids = df['user_id'].values[0:1000]

# Create dictionaries
unique_users_topics_freq= {}
unique_topic_scroll_freq = {}
unique_topic_read_freq = {}

for id in unique_user_ids:
    # Get the subset of that user id
    tmp_df = df[df['user_id'] == id]

    # Now lets go through each topic
    indices = np.array(tmp_df.index)
    for i in indices:
        tmp_topics = tmp_df['topics'][i]
        tmp_scroll = tmp_df['scroll_percentage'][i]
        tmp_read = tmp_df['read_time'][i]

        topics = [x for x in tmp_topics]
        scroll = [tmp_scroll]
        read = [tmp_read]

    # Find the average scroll percentages across each topic  (Can be related to whether a topic doesnt require too much reading has visualizations)
    ### Look at article_id for whichever topics the article is included in add that scroll percentage
        tmp_topic_scroll = {k:v for k,v in zip(topics, scroll)}
        unique_topic_scroll_freq = article_id_scroll_read(tmp_topic_scroll, unique_topic_scroll_freq)

    # Find the average read time across each topic
    ### Look at article_id for whichever topics the article is included in add that readtime 
        tmp_topic_read = {k:v for k,v in zip(topics, read)}
        unique_topic_read_freq = article_id_scroll_read(tmp_topic_read, unique_topic_read_freq )

    ## Unique User Topics
    # Get rid of duplicate values
    unique_topics = list(set(topics))

    # Populate our dict
    populate_dict(unique_topics, unique_users_topics_freq)


# Sort the dictionaries
sorted_topic_freq = dict(sorted(unique_users_topics_freq.items(), key = lambda x: x[1], reverse = True))

# Find the average read times across each topic
unique_topic_read_avg_freq = {k:round(np.nanmean(v), 2) for k,v in zip(unique_topic_read_freq.keys(), unique_topic_read_freq.values()) }
sorted_unique_topic_read_avg_freq = dict(sorted(unique_topic_read_avg_freq.items(), key = lambda x: x[1], reverse = True))

# Sort the topics for distribution
sorted_unique_topic_read_freq = dict(sorted(unique_topic_read_freq.items()))


In [397]:
sorted_unique_topic_read_freq

{'Sundhed': 115.68,
 'Bolig': 83.67,
 'Konflikt og krig': 83.14,
 'Underholdning': 79.91,
 'Begivenhed': 78.53,
 'Dyr': 75.18,
 'Livsstil': 74.33,
 'Politik': 73.62,
 'Kendt': 73.6,
 'Uddannelse': 72.07,
 'Kriminalitet': 70.98,
 'Erhverv': 70.76,
 'Samfund': 69.55,
 'Sport': 66.83,
 'Transportmiddel': 64.95,
 'Kultur': 64.8,
 'International politik': 64.12,
 'Katastrofe': 63.71,
 'Økonomi': 61.67,
 'Teknologi': 60.74,
 'Vejr': 54.45,
 'Videnskab': 48.16}

### Distribution of Topics across users

In [399]:
# What does the distribution of topics look like lets sort it?
indices = [x for x in sorted_topic_freq.keys()][0:10]
values = [x for x in sorted_topic_freq.values()][0:10]

plot_bar(
    indices_ = indices, values_ = values,
    yrange_ = [0, 400], xaxis_title = 'Topics',
    yaxis_title= 'Count', title_ = '<b> Top 10 Topics User Activity<b>')

### Average readtime per topic

In [400]:
indices = [x for x in sorted_unique_topic_read_avg_freq.keys()][0:5]
values = [x for x in sorted_unique_topic_read_avg_freq.values()][0:5]

plot_bar(
    indices_ = indices, values_ = values,
    yrange_ = [0, 150], xaxis_title = 'Topics',
    yaxis_title= 'Read Time', title_ = '<b> Top 10 Topics User Activity<b>')

In [409]:
def plot_box(indices_, values_, yrange_, xaxis_title, yaxis_title, title_):
    """Given indices and values populate a bar graph """


    fig = go.Figure()
    for trace_, name_ in zip(values_, indices_):
        fig.add_trace(go.Box(y = trace_, name = name_))

    # Update trace properties
        
    # Update axis properties
    fig.update_yaxes(
        title_text= yaxis_title, range = yrange_
        )
    
    fig.update_xaxes(
        title_text= xaxis_title
        )

    # Update trace properties

            
    # Update layout of plot
    fig.update_layout(
        title = title_, height= 750, width = 1000,
        uniformtext_minsize=8, uniformtext_mode='hide',  
        font=dict(
            family="Courier New, monospace",
            size=16,
            )
            )

    return fig.show()

In [411]:
# Boxplot

indices = [x for x in sorted_unique_topic_read_freq.keys()][0:10]
values = [x for x in sorted_unique_topic_read_freq.values()][0:10]

plot_box(
    indices_ = indices, values_ = values,
    yrange_ = [0, 2000], xaxis_title = 'Topics',
    yaxis_title= 'Read Time', title_ = '<b> Top 10 Topics User Activity<b>')


### Average scroll perct per topic

In [389]:
# Find the average scroll percentages across each topic

# Get the average scroll length for each article
unique_topic_scroll_avg_freq = {}
for k,v in zip(unique_topic_scroll_freq.keys(), unique_topic_scroll_freq.values()):
    unique_topic_scroll_avg_freq [k] = np.nanmean(v)

sorted_unique_topic_scroll_freq = dict(sorted(unique_topic_scroll_avg_freq.items(), key = lambda x: x[1], reverse = True))


indices = [x for x in sorted_unique_topic_scroll_freq.keys()]
values = [x for x in sorted_unique_topic_scroll_freq.values()]

fig = go.Figure()

# Add traces
fig.add_trace(go.Bar(x = indices, y = values,
                         
                         )

)

fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

In [391]:
# Boxplot

sorted_unique_topic_scroll_freq = dict(sorted(unique_topic_scroll_freq.items()))


indices = [x for x in sorted_unique_topic_scroll_freq.keys()]
values = [x for x in sorted_unique_topic_scroll_freq.values()]

fig = go.Figure()

for trace_, name_ in zip(values, indices):

    fig.add_trace(go.Box(y = trace_, name = name_))


fig.show()

### Activity

In [None]:
# Get the list of each unqiue topic in a specific session 
topics = df.groupby(by = 'session_id')['topics'].apply(list)

# Get the list of each unique timestamp for these sessions
timestamps = df.groupby(by = 'session_id')['impression_time'].apply(list)

# Get all the unique topics

# Get all the unique topics
unique_topics = []
for i in range(len(topics.values)):
    for j in range(0, len(topics.values[i][0])):
        tmp = topics.values[i][0][j]
        if tmp not in unique_topics:
            unique_topics.append(tmp)


unique_topics = sorted(unique_topics)

# Get all unique dates
timestamps = df.groupby(by = 'session_id')['impression_time'].apply(list)

unique_dates = []
unique_hours= [str(i) if i > 9 else str(0) + str(i) for i in range(24)]

for i in range(len(timestamps.values)):
    for j in range(len(timestamps.values[i])):
        tmp_datetime = timestamps.values[i][j]
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')
        if tmp_date not in unique_dates:
            unique_dates.append(tmp_date)


unique_dates = sorted(unique_dates)





In [None]:
unique_topic_daily_activity  = {k:{k:0 for k in unique_dates} for k in unique_topics}
unique_topic_hourly_activity  = {k:{k:0 for k in unique_hours} for k in unique_topics}

# Populate the dictionary
for i in zip(range(len(topics.values))):
    for j, k  in zip(range(0, len(topics.values[i][0])), range(0, len(i))):
        tmp = topics.values[i][0][j]
        tmp_datetime = timestamps.values[i][k]
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')
        tmp_time = tmp_datetime.strftime('%H')

        # Add to dictionary
        unique_topic_daily_activity[tmp][tmp_date] +=1
        unique_topic_hourly_activity[tmp][tmp_time] +=1
    




#### Daily

In [None]:
fig = go.Figure()

for topic in unique_topic_daily_activity.keys():
    # What does the daily user activity look like?
    indices = [x for x in unique_topic_daily_activity[topic].keys()]
    values = [x for x in unique_topic_daily_activity[topic].values()]


    # Add traces
    fig.add_trace(go.Scatter(x = indices, y = values,
                            mode = 'lines+markers'
                            )

    )

fig.show()

#### Hourly Activity

In [None]:
fig = go.Figure()

for topic in unique_topic_hourly_activity.keys():
    # What does the daily user activity look like?
    indices = [x for x in unique_topic_hourly_activity[topic].keys()]
    values = [x for x in unique_topic_hourly_activity[topic].values()]


    # Add traces
    fig.add_trace(go.Scatter(x = indices, y = values,
                            mode = 'lines+markers'
                            )

    )

fig.show()

## Article

### Number of Articles

In [None]:
# Total Number of Articles
single_subset_bar(df_ = df, feature_ = 'article_id', xaxis_title = 'Number of Articles', yrange = [0, 2000])


### Average readtime per article

In [None]:
## For each article, what was each users read time


## Get all unique ids in a list

unique_user_ids = df['user_id'].values[0:1000]

## We take the set because the scroll, article per user is joined in a list for every user id (so just take the set of it!)
unique_user_ids = set(unique_user_ids)


unique_article_ids = df['article_id'].unique()
unique_article_ids= unique_article_ids[~np.isnan(unique_article_ids)]


# Create dictionaries

unique_article_read = {k: [0] for k in unique_article_ids}
unique_article_read_avg = {k: [0] for k in unique_article_ids}

for id in unique_user_ids:
    # Get the subset of that user id
    tmp_df = df[df['user_id'] == id]

    # Now lets go through each scroll and article
    ## Iterate through each of the df
    indices = np.array(tmp_df.index)
    for i in indices:
        tmp_dict = {}
        # Select the scroll / article of that indice and 
        tmp_read = tmp_df['read_time_fixed'][i]

        tmp_article = tmp_df['article_id_fixed'][i]

        read = [x for x in tmp_read]
        
        articles = [np.int64(x) for x in tmp_article]
    
        tmp_dict = {k:v for k,v in zip(articles, read)}


        for k,v in zip(tmp_dict.keys(), tmp_dict.values()):

            #if unique_article_scroll.get(k) != None:
            #    unique_article_scroll[k] += v
            if (k in unique_article_read.keys()) & (np.isnan(v) == False):
                tmp_array= np.append(unique_article_read[k],v)
                unique_article_read[k] = tmp_array
        
    
    # Get the average scroll length for each article
    for k,v in zip(unique_article_read.keys(), unique_article_read.values()):
        unique_article_read_avg[k] = np.mean(v)




In [None]:
# What does the hourly user activity look like?
values = [x for x in unique_article_read_avg.values()]

fig = go.Figure()

# Add traces
fig.add_trace(go.Histogram(x = values
                         )

)

fig.show()

In [None]:
# What does the hourly user activity look like?
values = [x for x in unique_article_read_avg.values()]


fig = go.Figure()

# Boxplot




fig.add_trace(go.Box(y = values))


fig.show()


### Average scroll pct per article

In [None]:
## For each article, what was each users scroll percentage + read time? 


## Get all unique ids in a list

unique_user_ids = df['user_id'].values[0:2000]

## We take the set because the scroll, article per user is joined in a list for every user id (so just take the set of it!)
unique_user_ids = set(unique_user_ids)


unique_article_ids = df['article_id'].unique()
unique_article_ids= unique_article_ids[~np.isnan(unique_article_ids)]

# Create dictionaries

unique_article_scroll = {k: [0] for k in unique_article_ids}

for id in unique_user_ids:
    # Get the subset of that user id
    tmp_df = df[df['user_id'] == id]

    # Now lets go through each scroll and article
    ## Iterate through each of the df
    indices = np.array(tmp_df.index)
    for i in indices:
        tmp_dict = {}
        # Select the scroll / article of that indice and 
        tmp_scroll = tmp_df['scroll_percentage_fixed'][i]

        tmp_article = tmp_df['article_id_fixed'][i]

        scroll = [x for x in tmp_scroll]
        
        articles = [np.int64(x) for x in tmp_article]
    
        tmp_dict = {k:v for k,v in zip(articles, scroll)}


        for k,v in zip(tmp_dict.keys(), tmp_dict.values()):

            if (k in unique_article_scroll.keys()) & (np.isnan(v) == False):
                tmp_array= np.append(unique_article_scroll[k],v)
                unique_article_scroll[k] = tmp_array
        
    
# Get the average scroll length for each article
for k,v in zip(unique_article_scroll.keys(), unique_article_scroll.values()):
    unique_article_scroll[k] = np.mean(v)




In [None]:
indices = [x for x in unique_article_scroll.keys()]
avg_scroll_pct = [x for x in unique_article_scroll.values()]

fig = go.Figure()

# Add traces
fig.add_trace(go.Histogram(x=avg_scroll_pct)

)

fig.show()

In [None]:
indices = [x for x in unique_article_scroll.keys()]
avg_scroll_pct = [x for x in unique_article_scroll.values()]

fig = go.Figure()

# Add traces
fig.add_trace(go.Box(y=avg_scroll_pct)

)

fig.show()

### Number of articles clicked in a session

In [None]:
# How many unique articles are clicked in a session?

tmp_aps = df.groupby('session_id')['article_id'].apply(list)

articles_per_session = {k:0 for k in range(1, 20)}

for i in tmp_aps:
    num_articles = len(i)
    articles_per_session[num_articles] += 1


indices = [k for k in articles_per_session.keys()]
values = [k for k in articles_per_session.values()]


fig = go.Figure()

# Add trace
fig.add_trace(go.Bar(x= indices, y = values, textfont_size=20))

fig.update_traces(opacity=0.75)
fig.show()

## Devices

In [None]:
# Overall distribtuion of devices
multiple_subset_bar(df_ = df, feature_ = 'device_type', yrange = [0, 50000])

### Readtime per device

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'device_type', feature_2 = 'read_time')

### Scroll time per device

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'device_type', feature_2 = 'scroll_percentage')

### Topic

In [None]:
# Separate axes

In [None]:
## Subset based on those entries
## populate the counts for the traces


fig = go.Figure()

devices = [d for d in df['device_type'].unique()]

# Get all the unique topics
unique_topics = []
for i in df.index:
    for j in range(0, len(df['topics'][i])):
        tmp = df['topics'][i][j]
        if tmp not in unique_topics:
            unique_topics.append(tmp)


unique_topics = sorted(unique_topics)



fig = go.Figure()
# Iterate through each device 
for d in devices:
    # Find the subset of the data with that device
    tmp_df = df[df['device_type'] == d]

    # Create a dict object with 0 counts for all topics
    device_topic_freq = {k:0 for k in unique_topics}
    for i in tmp_df.index:
        for j in range(0, len(df['topics'][i])):
            # Find that index
            tmp_topic = df['topics'][i][j]

            # Enumerate
            device_topic_freq[tmp_topic] += 1
    
    device_topic_freq = dict(sorted(device_topic_freq.items(), key = lambda kv: kv[1], reverse= True))
    #print(device_topic_freq)
        
    fig.add_trace(go.Bar(x= [x for x in device_topic_freq.keys()], y = [y for y in device_topic_freq.values()], textfont_size=20))



# Overlay both histograms
#fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
fig = go.Figure()
# Iterate through each device 
for d in devices:
    # Find the subset of the data with that device
    tmp_df = df[df['device_type'] == d]

    # Create a dict object with 0 counts for all topics
    device_daily_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')

        # Daily Activity
        if tmp_date not in device_daily_activity:
            device_daily_activity[tmp_date] = 0
        else:
            device_daily_activity[tmp_date] +=1
    
    # Sort by dates
    device_daily_activity = dict(sorted(device_daily_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in device_daily_activity.keys()], y = [y for y in device_daily_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
fig = go.Figure()
# Iterate through each device 
for d in devices:
    # Find the subset of the data with that device
    tmp_df = df[df['device_type'] == d]

    # Create a dict object with 0 counts for all topics
    device_hourly_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_time = tmp_datetime.strftime('%H')

        # Hourly Activity
        if tmp_time not in device_hourly_activity:
            device_hourly_activity[tmp_time] = 0
        else:
            device_hourly_activity[tmp_time] +=1
    
    # Sort by dates
    device_hourly_activity = dict(sorted(device_hourly_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in device_hourly_activity.keys()], y = [y for y in device_hourly_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

# If subscriber

### How many users are subscribers

In [None]:
multiple_subset_bar(df_ = df, feature_ = 'is_subscriber', yrange = [0, 80000])

### Read time for subscriber vs non-subscribers

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'is_subscriber', feature_2 = 'read_time')

### Scroll percentage if subscriber

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'is_subscriber', feature_2 = 'scroll_percentage')

#### Topic Distribution

In [None]:
## Subset based on those entries
## populate the counts for the traces


is_subscriber = [d for d in df['is_subscriber'].unique()]

# Get all the unique topics
unique_topics = []
for i in df.index:
    for j in range(0, len(df['topics'][i])):
        tmp = df['topics'][i][j]
        if tmp not in unique_topics:
            unique_topics.append(tmp)

unique_topics = sorted(unique_topics)


fig = go.Figure()
# Iterate through each device 
for d in is_subscriber:
    # Find the subset of the data with that device
    tmp_df = df[df['is_subscriber'] == d]

    # Create a dict object with 0 counts for all topics
    is_subscriber_topic_freq = {k:0 for k in unique_topics}
    for i in tmp_df.index:
        for j in range(0, len(df['topics'][i])):
            # Find that index
            tmp_topic = df['topics'][i][j]

            # Enumerate
            is_subscriber_topic_freq[tmp_topic] += 1
    
    is_subscriber_topic_freq = dict(sorted(is_subscriber_topic_freq.items(), key = lambda kv: kv[1], reverse= True))
    #print(device_topic_freq)
        
    fig.add_trace(go.Bar(x= [x for x in is_subscriber_topic_freq.keys()], y = [y for y in is_subscriber_topic_freq.values()], textfont_size=20))



# Overlay both histograms
#fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
is_subscriber = [d for d in df['is_subscriber'].unique()]

fig = go.Figure()
# Iterate through each device 
for d in is_subscriber:
    # Find the subset of the data with that device
    tmp_df = df[df['is_subscriber'] == d]

    # Create a dict object with 0 counts for all topics
    is_subscriber_daily_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')

        # Daily Activity
        if tmp_date not in is_subscriber_daily_activity:
            is_subscriber_daily_activity[tmp_date] = 0
        else:
            is_subscriber_daily_activity[tmp_date] +=1
        
    
    # Sort by dates
    is_subscriber_daily_activity = dict(sorted(is_subscriber_daily_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in is_subscriber_daily_activity.keys()], y = [y for y in is_subscriber_daily_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
is_subscriber = [d for d in df['is_subscriber'].unique()]

fig = go.Figure()
# Iterate through each device 
for d in is_subscriber:
    # Find the subset of the data with that device
    tmp_df = df[df['is_subscriber'] == d]

    # Create a dict object with 0 counts for all topics
    is_subscriber_hourly_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_time = tmp_datetime.strftime('%H')

        # Daily Activity
        if tmp_time not in is_subscriber_hourly_activity:
            is_subscriber_hourly_activity[tmp_time] = 0
        else:
            is_subscriber_hourly_activity[tmp_time] +=1
        
    
    # Sort by dates
    is_subscriber_hourly_activity = dict(sorted(is_subscriber_hourly_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in is_subscriber_hourly_activity.keys()], y = [y for y in is_subscriber_hourly_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

# Gender

In [None]:
multiple_subset_bar(df_ = df, feature_ = 'gender', yrange = [0, 5000])

### Read time per gender

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'gender', feature_2 = 'read_time')

### Scrolltime per gender

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'gender', feature_2 = 'scroll_percentage')

### Topics

In [None]:
## Subset based on those entries
## populate the counts for the traces

genders = df['gender'].unique()
genders = genders[~np.isnan(genders)]

#genders = [d for d in df['gender'].unique() ]

# Get all the unique topics
unique_topics = []
for i in df.index:
    for j in range(0, len(df['topics'][i])):
        tmp = df['topics'][i][j]
        if tmp not in unique_topics:
            unique_topics.append(tmp)

unique_topics = sorted(unique_topics)


fig = go.Figure()
# Iterate through each device 
for d in genders:
    # Find the subset of the data with that device
    tmp_df = df[df['gender'] == d]

    # Create a dict object with 0 counts for all topics
    genders_topic_freq = {k:0 for k in unique_topics}
    for i in tmp_df.index:
        for j in range(0, len(df['topics'][i])):
            # Find that index
            tmp_topic = df['topics'][i][j]

            # Enumerate
            genders_topic_freq[tmp_topic] += 1
    
    genders_topic_freq = dict(sorted(genders_topic_freq.items(), key = lambda kv: kv[1], reverse= True))
    #print(device_topic_freq)
        
    fig.add_trace(go.Bar(x= [x for x in genders_topic_freq.keys()], y = [y for y in genders_topic_freq.values()], textfont_size=20))



# Overlay both histograms
#fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
genders = df['gender'].unique()
genders = genders[~np.isnan(genders)]

fig = go.Figure()
# Iterate through each device 
for d in genders:
    # Find the subset of the data with that device
    tmp_df = df[df['gender'] == d]

    # Create a dict object with 0 counts for all topics
    genders_daily_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')

        # Daily Activity
        if tmp_date not in genders_daily_activity:
            genders_daily_activity[tmp_date] = 0
        else:
            genders_daily_activity[tmp_date] +=1
        
    
    # Sort by dates
    genders_daily_activity = dict(sorted(genders_daily_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in genders_daily_activity.keys()], y = [y for y in genders_daily_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
genders = df['gender'].unique()
genders = genders[~np.isnan(genders)]

fig = go.Figure()
# Iterate through each device 
for d in genders:
    # Find the subset of the data with that device
    tmp_df = df[df['gender'] == d]

    # Create a dict object with 0 counts for all topics
    genders_hourly_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_time = tmp_datetime.strftime('%H')

        # Daily Activity
        if tmp_time not in genders_hourly_activity:
            genders_hourly_activity[tmp_time] = 0
        else:
            genders_hourly_activity[tmp_time] +=1
        
    
    # Sort by dates
    genders_hourly_activity = dict(sorted(genders_hourly_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in genders_hourly_activity.keys()], y = [y for y in genders_hourly_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

# Age

### Age Distribution

In [None]:
multiple_subset_bar(df_ = df, feature_ = 'age', yrange = [0, 800])

#### Average readtime per age group

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'age', feature_2 = 'read_time')

#### Average Scroll PCt per age group

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'age', feature_2 = 'scroll_percentage')

### Topics

In [None]:
## Subset based on those entries
## populate the counts for the traces

ages = df['age'].unique()
ages = ages[~np.isnan(ages)]

#genders = [d for d in df['gender'].unique() ]

# Get all the unique topics
unique_topics = []
for i in df.index:
    for j in range(0, len(df['topics'][i])):
        tmp = df['topics'][i][j]
        if tmp not in unique_topics:
            unique_topics.append(tmp)

unique_topics = sorted(unique_topics)


fig = go.Figure()
# Iterate through each device 
for d in ages:
    # Find the subset of the data with that device
    tmp_df = df[df['age'] == d]

    # Create a dict object with 0 counts for all topics
    ages_topic_freq = {k:0 for k in unique_topics}
    for i in tmp_df.index:
        for j in range(0, len(df['topics'][i])):
            # Find that index
            tmp_topic = df['topics'][i][j]

            # Enumerate
            ages_topic_freq[tmp_topic] += 1
    
    ages_topic_freq = dict(sorted(ages_topic_freq.items(), key = lambda kv: kv[1], reverse= True))
    #print(device_topic_freq)
        
    fig.add_trace(go.Bar(x= [x for x in ages_topic_freq.keys()], y = [y for y in ages_topic_freq.values()], textfont_size=20))



# Overlay both histograms
#fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
ages = df['age'].unique()
ages = ages[~np.isnan(ages)]

fig = go.Figure()
# Iterate through each device 
for d in ages:
    # Find the subset of the data with that device
    tmp_df = df[df['age'] == d]

    # Create a dict object with 0 counts for all topics
    ages_daily_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')

        # Daily Activity
        if tmp_date not in ages_daily_activity:
            ages_daily_activity[tmp_date] = 0
        else:
            ages_daily_activity[tmp_date] +=1
        
    
    # Sort by dates
    ages_daily_activity = dict(sorted(ages_daily_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in ages_daily_activity.keys()], y = [y for y in ages_daily_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
ages = df['age'].unique()
ages = ages[~np.isnan(ages)]

fig = go.Figure()
# Iterate through each device 
for d in ages:
    # Find the subset of the data with that device
    tmp_df = df[df['age'] == d]

    # Create a dict object with 0 counts for all topics
    ages_hourly_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_time = tmp_datetime.strftime('%H')

        # Daily Activity
        if tmp_time not in ages_hourly_activity:
            ages_hourly_activity[tmp_time] = 0
        else:
            ages_hourly_activity[tmp_time] +=1
        
    
    # Sort by dates
    ages_hourly_activity = dict(sorted(ages_hourly_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in ages_hourly_activity.keys()], y = [y for y in ages_hourly_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

# Postcodes

### Distribution of Post Codes

In [None]:
multiple_subset_bar(df_ = df, feature_ = 'postcode', yrange = [0, 800])

### Average Read Time in each postal code

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'postcode', feature_2 = 'read_time')

### Average Scroll pct in each postal code

In [None]:
multiple_subset_feature_visualization(df_ =df,  feature_1 = 'postcode', feature_2 = 'scroll_percentage')

### Topics

In [None]:
## Subset based on those entries
## populate the counts for the traces

postcodes = df['postcode'].unique()
postcodes = postcodes[~np.isnan(postcodes)]

#genders = [d for d in df['gender'].unique() ]

# Get all the unique topics
unique_topics = []
for i in df.index:
    for j in range(0, len(df['topics'][i])):
        tmp = df['topics'][i][j]
        if tmp not in unique_topics:
            unique_topics.append(tmp)

unique_topics = sorted(unique_topics)


fig = go.Figure()
# Iterate through each device 
for d in postcodes:
    # Find the subset of the data with that device
    tmp_df = df[df['postcode'] == d]

    # Create a dict object with 0 counts for all topics
    postcodes_topic_freq = {k:0 for k in unique_topics}
    for i in tmp_df.index:
        for j in range(0, len(df['topics'][i])):
            # Find that index
            tmp_topic = df['topics'][i][j]

            # Enumerate
            postcodes_topic_freq[tmp_topic] += 1
    
    postcodes_topic_freq = dict(sorted(postcodes_topic_freq.items(), key = lambda kv: kv[1], reverse= True))
    #print(device_topic_freq)
        
    fig.add_trace(go.Bar(x= [x for x in postcodes_topic_freq.keys()], y = [y for y in postcodes_topic_freq.values()], textfont_size=20))



# Overlay both histograms
#fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Activity

#### Daily Activity

In [None]:
postcodes = df['postcode'].unique()
postcodes = postcodes[~np.isnan(postcodes)]

fig = go.Figure()
# Iterate through each device 
for d in postcodes:
    # Find the subset of the data with that device
    tmp_df = df[df['postcode'] == d]

    # Create a dict object with 0 counts for all topics
    postcodes_daily_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_date = tmp_datetime.strftime('%m/%d/%Y')

        # Daily Activity
        if tmp_date not in postcodes_daily_activity:
            postcodes_daily_activity[tmp_date] = 0
        else:
            postcodes_daily_activity[tmp_date] +=1
        
    
    # Sort by dates
    postcodes_daily_activity = dict(sorted(postcodes_daily_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in postcodes_daily_activity.keys()], y = [y for y in postcodes_daily_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

#### Hourly Activity

In [None]:
postcodes = df['postcode'].unique()
postcodes = postcodes[~np.isnan(postcodes)]


fig = go.Figure()
# Iterate through each device 
for d in postcodes:
    # Find the subset of the data with that device
    tmp_df = df[df['postcode'] == d]

    # Create a dict object with 0 counts for all topics
    postcodes_hourly_activity = {}
    for i in tmp_df.index:
        # Find that index
        tmp_timestamp = df['impression_time'][i]
        tmp_datetime = tmp_timestamp
        tmp_time = tmp_datetime.strftime('%H')

        # Daily Activity
        if tmp_time not in postcodes_hourly_activity:
            postcodes_hourly_activity[tmp_time] = 0
        else:
            postcodes_hourly_activity[tmp_time] +=1
        
    
    # Sort by dates
    postcodes_hourly_activity = dict(sorted(postcodes_hourly_activity.items()))

    # Add trace
    fig.add_trace(go.Bar(x= [x for x in postcodes_hourly_activity.keys()], y = [y for y in postcodes_hourly_activity.values()], textfont_size=20))



    # Overlay both histograms
    #fig.update_layout(barmode='overlay')
    # Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()