# UofT Journal Usage Stats

### Authors: Tao, Shirley

In [1]:
import glob
import pandas as pd
import datetime
import calendar
import numpy as np
import operator

import plotly.express as px
import plotly.graph_objects as go

import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

# Data

The data consists of the JR1 reports provided by the UofT librarians, which detail the packages they currently have for all the journals UofT is subscribed to alongside the download counts for each of those journals, and the list of UofT publications for each year compiled from the Web of Science website.  Each JR1 report is an individual Excel file, and the Web of Science listings are one Excel file per year, so the data needs to be loaded and cleaned from its original form. This section loads the data into two dataframes.

### Getting the Data
1. Download "Web of Science data UofT affiliated pubs 2014-2018.zip" and "JSC370 Data KM.zip" from Quercus.
2. Unzip both into the same folder as this notebook.
3. In the folders JSC370 Data KM and Web of Science data UofT affiliated pubs 2014-2018, delete the "__MACOSX" folder if there is one.
4. Collapse the JSC370 Data KM/JSC370 Data KM folder into just a JSC370 Data KM folder.

### Loading the Data
**First Time**: If this is your first time using the notebook and you don't have the cleaned csv files of the data, follow the instructions above for getting the data, and run the FIRST and SECOND cells to load the data into the notebook.

**For Future Usage**: The data can be saved to csv files so that future and repeated usage of this notebook allows the data to be loaded much faster.
- Run the THIRD cell after running the first two to save the cleaned dataframes to csv files.
- If you have the cleaned csv files, skip the first two cells and run the FOURTH cell to load the data.

The first cell loads the JR1 reports, and will print out the file name and reason if it decides to not read in any files.  Currently we are skipping the CAIRN reports due to the French encoding issues and the excel files being formatted differently as a result of it.  The second cell loads the Web of Science UofT publications information.  The loading bar will turn green when the cell is finished running.

In [None]:
# Part 1: JR1 reports
#
# Reads every JR1 report under "JSC370 Data KM/" and stacks the rows into the
# `report_data` dataframe, tagging each row with the report's Year, the
# package name taken from the file name (FilePackage), and whether the file
# name ends in "SP".

# takes around 4 minutes to run

# get report names
reportnames = glob.glob('JSC370 Data KM/*.*', recursive = True)

# create loading bar
# (one tick per file, plus three ticks for the clean-up steps that follow the loop)
loading1 = widgets.IntProgress(value=0, min=0, max=len(reportnames) + 3, step=1, description='Loading:', 
                               bar_style='info', orientation='horizontal')
display(loading1)

# read in all excel files
report_data = pd.DataFrame()
for name in reportnames:
    if "JR1" not in name:
        print(name + " Not Loaded: Not JR1")
        loading1.value = loading1.value + 1
        continue
    
    excel = True
    try:
        curr_report = pd.read_excel(name)
    except Exception:
        # corrupted xls files can be read in as csv files
        # we know that for this set of data, the column names are either on the 7th or 9th row
        # (Exception rather than a bare except, so Ctrl-C / kernel interrupt still works)
        try:
            curr_report = pd.read_csv(name, skiprows=7, sep="\t")
        except Exception:
            curr_report = pd.read_csv(name, skiprows=9, sep="\t")
        excel = False
    
    if excel:
        # get index where data starts 
        # (this is different between some files so using a loop to get the starting point)
        colnamesindex = -1
        for i in range(len(curr_report)):
            if curr_report.iloc[i, 0] == "Journal" or curr_report.iloc[i, 1] == "Publisher":
                colnamesindex = i
                break
        if colnamesindex == -1:
            print(name + " Not Loaded: Not in English, Not Formatted Right")
            loading1.value = loading1.value + 1
            continue

        # set column names
        curr_report.columns = curr_report.loc[colnamesindex,]

        if "+" in curr_report.columns:
            # for the files formatted weirdly with a + and spreading title names across rows
            # fix column names by gluing the two rows under the header onto each name
            colnames = list(curr_report.columns)
            for i in range(len(colnames)):
                if pd.notna(curr_report.iloc[colnamesindex + 1, i]):
                    colnames[i] = colnames[i] + " " + str(curr_report.iloc[colnamesindex + 1, i])
                if pd.notna(curr_report.iloc[colnamesindex + 2, i]):
                    colnames[i] = colnames[i] + " " + str(curr_report.iloc[colnamesindex + 2, i])
            curr_report.columns = colnames
            curr_report = curr_report.drop(columns=["+"])
            curr_report = curr_report.drop(curr_report[curr_report["Journal"] == "+"].index, axis=0)

            # save only data part (header row plus its two continuation rows are skipped)
            curr_report = curr_report[colnamesindex + 3:]
        else:
            # save only data part
            curr_report = curr_report[colnamesindex + 1:]

    # insert year of report
    # (the four characters starting four positions after "JR1" in the file name)
    jr1_i = name.index("JR1")
    year = name[jr1_i + 4 : jr1_i + 8]
    curr_report.insert(0, "Year", int(year))

    # reformat months to not include year or date for generality
    datetimes = {}
    for colname in curr_report.columns:
        if pd.notna(colname):
            if isinstance(colname, datetime.datetime):
                # some encoded as datetime
                datetimes[colname] = calendar.month_name[colname.month][:3]
            elif isinstance(colname, str):
                # sometimes there's whitespace messing things up
                strippedcolname = colname.strip()
                if strippedcolname.endswith(year):
                    # some encoded as MMM-YYYY
                    datetimes[colname] = strippedcolname[:3]
                else:
                    datetimes[colname] = strippedcolname
    curr_report = curr_report.rename(columns=datetimes)

    # for some reports that don't label the Journal column
    curr_report = curr_report.rename(columns={np.nan: "Journal"})
    
    # insert file package name: the text between the folder separator and "JR1"
    # glob returns "\" separators on Windows and "/" elsewhere, so accept either
    # instead of failing with ValueError when there is no backslash in the path
    package_i = max(name.rfind("\\"), name.rfind("/"))
    filepub = name[package_i + 1 : jr1_i - 1]
    curr_report.insert(0, "FilePackage", filepub)
    
    # drop totals row
    curr_report = curr_report.drop(curr_report.index[0])
    
    # insert if SP or not
    mainname = name.split(".")[0]
    if mainname.endswith("SP"):
        curr_report.insert(0, "SP", "Yes")
    else:
        curr_report.insert(0, "SP", "No")
    
    # append this excel file's data to the total dataframe
    # NOTE(review): DataFrame.append was removed in pandas 2.0; it is kept here
    # because the ValueError handling below was written against its behaviour.
    # Confirm which exception pd.concat raises before porting.
    try:
        report_data = report_data.append(curr_report)
        loading1.value = loading1.value + 1 
    except ValueError:
        print(name + " Not Loaded: Not Formatted Properly, Can't Append Data")
        loading1.value = loading1.value + 1
        continue
        

# fixing more formatting problems
# Publishers label the same column differently, so each canonical column is
# back-filled from its alternative spelling (applied in this order).
fallback_columns = [
    ("Reporting Period Total", "Retrievals"),
    ("Reporting Period HTML", "HTML"),
    ("Reporting Period PDF", "PDF"),
    ("Journal DOI", "Journal Doi"),
    ("Journal", "Title"),
    ("Journal", "Unnamed: 0"),
]
for canonical, alternative in fallback_columns:
    report_data[canonical] = report_data[canonical].fillna(report_data[alternative])

# the alternative columns are no longer needed once folded in
report_data = report_data.drop(
    columns=["Retrievals", "HTML", "PDF", "Journal Doi", "Title", "Dec-2015", "Unnamed: 0"])
# a single blank is treated as a missing Online ISSN
blank_online_issn = report_data["Online ISSN"] == " "
report_data.loc[blank_online_issn, "Online ISSN"] = np.nan
loading1.value += 1

# drop the rows that are accidentally still there
has_total = report_data["Reporting Period Total"].notna()
report_data = report_data[has_total]

# strip excess whitespace
def _strip_if_str(value):
    """Strip surrounding whitespace from strings; return anything else unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value

for text_column in ("Journal", "FilePackage"):
    report_data[text_column] = report_data[text_column].apply(_strip_if_str)
loading1.value += 1

# double check data is correct: keep only rows whose total is a Python int
def _int_or_nan(value):
    """Return `value` if it is an int, otherwise NaN."""
    if isinstance(value, int):
        return value
    return np.nan

report_data["Reporting Period Total"] = report_data["Reporting Period Total"].apply(_int_or_nan)
report_data = report_data.dropna(subset=["Reporting Period Total"]).reset_index(drop=True)

# done
loading1.value += 1
loading1.bar_style = 'success'

In [None]:
# Part 2: Web of Science Uoft authors dataframe
#
# Reads the yearly Web of Science listings into `uoft_data`, tagging each row
# with a Year counted up from 2014 in file order.

# uoft report names
# glob returns names in arbitrary, OS-dependent order; sort them so the year
# counter below is assigned deterministically.
# NOTE(review): assumes the file names sort chronologically - confirm.
uoftnames = sorted(glob.glob('Web of Science data UofT affiliated pubs 2014-2018/*.*', recursive = True))

# create loading bar
# NOTE(review): max=8 presumably means five yearly files plus the three
# post-processing ticks later in this cell - confirm against the data folder.
loading2 = widgets.IntProgress(value=0, min=0, max=8, step=1, description='Loading:', 
                               bar_style='info', orientation='horizontal')
display(loading2)

# takes around 40 seconds to load
yearly_frames = []
year = 2014
for name in uoftnames:
    curr_uoft = pd.read_excel(name)
    curr_uoft.insert(0, "Year", year)
    yearly_frames.append(curr_uoft)
    year = year + 1
    loading2.value = loading2.value + 1

# concatenate once instead of growing the frame with repeated appends
uoft_data = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()
uoft_data = uoft_data.drop_duplicates()
uoft_data = uoft_data.reset_index(drop=True)

# fix formatting problems: fold the alternative spellings into the canonical columns
uoft_data["Category: Heading 1"] = uoft_data["Category: Heading 1"].fillna(uoft_data["Category: Headings 1"])
uoft_data["PubType"] = uoft_data["PubType"].fillna(uoft_data["Pubtype"])
uoft_data = uoft_data.drop(columns=["Category: Headings 1", "Pubtype"])
loading2.value = loading2.value + 1

# count the number of uoft authors per publication, since the dataframe only gives a count of 
# total authors and lists out the uoft authors
def get_num_uoft_authors(row):
    """Return a copy of `row` with "NumUofTAuthors" increased by the UofT author count.

    The author-position columns are checked in order: the three long-named
    a1/a2/a3 columns, then "a4" .. "a60". Counting stops at the first missing
    value, so only the leading run of filled columns is counted.

    Parameters:
        row: a row of the publications dataframe. It must already contain
            "NumUofTAuthors" and every position column up to the first
            missing one.

    Returns:
        A copy of `row`; the input row is not modified.
    """
    position_columns = ["(a1) First UofT affiliated author's position in the author list ",
                        " (a2) Second UofT affiliated author's position in the author list",
                        "(a3) Third UofT affiliated author's position in the author list "]
    position_columns += ['a' + str(j) for j in range(4, 61)]

    new_row = row.copy()
    for column in position_columns:
        if pd.isna(row[column]):
            # first gap ends the list of UofT authors
            break
        new_row["NumUofTAuthors"] = new_row["NumUofTAuthors"] + 1
    return new_row

# takes a while to run
# Start every publication at zero, then let get_num_uoft_authors fill in the
# count row by row (axis=1 passes each row to the function).
uoft_data["NumUofTAuthors"] = 0
uoft_data = uoft_data.apply(get_num_uoft_authors, axis=1)
loading2.value = loading2.value + 1

uoft_data = uoft_data.reset_index(drop=True)
loading2.value = loading2.value + 1
# green bar signals that the cell has finished
loading2.bar_style = 'success'

In [None]:
# OPTIONAL
# Run this cell to save the data to a csv file so future loading is much faster
# Both frames are written with their index, which comes back as an
# "Unnamed: 0" column when the csv is read again (the csv-loading cell drops
# it for the journal data).
report_data.to_csv("JSC370 Data KM.csv")
uoft_data.to_csv("Web of Science data UofT affiliated pubs 2014-2018.csv")

In [19]:
# OPTIONAL
# If the data has already been read in and cleaned and put into a csv before,
# Run this to read it in

# create loading bar
# 17 ticks: 2 file reads + 3 reporting-period columns + 12 month columns
loading5 = widgets.IntProgress(value=0, min=0, max=17, step=1, description='Loading:', 
                               bar_style='info', orientation='horizontal')
display(loading5)

# get all of the journals
# ("Unnamed: 0" is the index column that to_csv wrote out)
df = pd.read_csv("JSC370 Data KM.csv").drop("Unnamed: 0", axis = 1)
report_data = df
loading5.value = loading5.value + 1

# get the uoft publication information
uoft_data = pd.read_csv("Web of Science data UofT affiliated pubs 2014-2018.csv")
# drop the saved index column here too, so the frame has the same columns as
# when it is built from the Excel files
uoft_data = uoft_data.drop(columns=["Unnamed: 0"], errors="ignore")
loading5.value = loading5.value + 1

# reading from csv makes it think these are strings or floats, so convert to int
# (nullable 'Int64' so missing counts stay missing)
for total_column in ["Reporting Period Total", "Reporting Period HTML", "Reporting Period PDF"]:
    report_data[total_column] = report_data[total_column].astype('Int64')
    loading5.value = loading5.value + 1

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
for month in months:
    report_data[month] = report_data[month].astype('Int64')
    loading5.value = loading5.value + 1
    
loading5.bar_style = 'success'

IntProgress(value=0, bar_style='info', description='Loading:', max=17)

In [4]:
# data selection functions used throughout the notebook

def _sum_month_downloads(frame, month_columns):
    """Return a copy of `frame` with a "Downloads" column summing `month_columns`.

    Missing monthly counts are treated as 0. With no month columns the
    "Downloads" column is all zeros.
    """
    summed = frame.copy()
    summed["Downloads"] = 0
    for month in month_columns:
        summed["Downloads"] = summed["Downloads"] + summed[month].fillna(0)
    return summed


def get_selected_report_data(type_of_downloads, start_year, start_month, end_year, end_month):
    """Select rows of the global `report_data` and add a "Downloads" column.

    Parameters:
        type_of_downloads: "All", "PDF" or "HTML".
        start_year, end_year: first and last report year (inclusive).
        start_month, end_month: three-letter month names ("Jan" .. "Dec").
            They are only used when `type_of_downloads` is "All", because the
            monthly breakdown exists only for total downloads.

    Returns:
        A new dataframe (the global `report_data` is not modified).
        - "All": Downloads sums the month columns from start_month in the
          start year to end_month in the end year; years strictly in between
          use "Reporting Period Total". If start_year > end_year an empty
          dataframe is returned.
        - "PDF" / "HTML": Downloads is the yearly "Reporting Period PDF" /
          "Reporting Period HTML" for every year in the range.

    Raises:
        ValueError: if `type_of_downloads` is not one of the three types, or
            (for "All") a month name is not recognised.
    """
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    download_columns = {
        "All": "Reporting Period Total",
        "PDF": "Reporting Period PDF",
        "HTML": "Reporting Period HTML",
    }
    if type_of_downloads not in download_columns:
        # fail clearly instead of printing and then hitting a KeyError on ""
        raise ValueError("Download Type not valid")
    d_type = download_columns[type_of_downloads]

    if type_of_downloads != "All":
        # do only year selections
        years = list(range(start_year, end_year + 1))
        selected_data = report_data[report_data["Year"].isin(years)].copy()
        selected_data["Downloads"] = selected_data[d_type]
        return selected_data

    # monthly specifics only available if download type is total
    start_month_i = months.index(start_month)
    end_month_i = months.index(end_month)

    if start_year > end_year:
        # start year > end year, no data
        return pd.DataFrame()

    if start_year == end_year:
        # only within the year range
        in_year = report_data[report_data["Year"] == start_year]
        return _sum_month_downloads(in_year, months[start_month_i:end_month_i + 1])

    # start year: from the start month to December
    start_data = _sum_month_downloads(report_data[report_data["Year"] == start_year],
                                      months[start_month_i:])

    # inbetween years: entire year counts
    inbetween_years = list(range(start_year + 1, end_year))
    inbetween_data = report_data[report_data["Year"].isin(inbetween_years)].copy()
    inbetween_data["Downloads"] = inbetween_data[d_type]

    # end year: from January to the end month
    end_data = _sum_month_downloads(report_data[report_data["Year"] == end_year],
                                    months[:end_month_i + 1])

    # pd.concat replaces DataFrame.append, which newer pandas no longer has
    return pd.concat([start_data, inbetween_data, end_data])


def merge_selected_reports_and_uoft(selected_data):
    """Join the selected report rows with the UofT publications in `uoft_data`.

    The join is done in stages (rather than one direct merge of the two
    frames) to avoid memory errors when both are large:
      1. collect the distinct ISSN pairs present in `selected_data`;
      2. keep the UofT publications whose (ISSN, eISSN) equals one of them;
      3. keep the selected rows whose ISSN pair has such a publication;
      4. merge the two reduced frames on ISSN pair and Year.

    Returns the merged dataframe with a fresh 0..n-1 index.
    """
    issn_keys = ["Print ISSN", "Online ISSN"]

    def distinct_issn_pairs(frame):
        # unique (print, online) pairs, ignoring rows where both are missing
        pairs = frame[issn_keys].drop_duplicates()
        has_any_issn = pairs["Print ISSN"].notna() | pairs["Online ISSN"].notna()
        return pairs[has_any_issn].reset_index(drop=True)

    # uoft publications appearing in one of the selected journals
    subscribed_pairs = distinct_issn_pairs(selected_data)
    matched_uoft = pd.merge(uoft_data, subscribed_pairs, how="inner",
                            left_on=["ISSN", "eISSN"], right_on=issn_keys)
    matched_uoft = matched_uoft.drop(columns=["ISSN", "eISSN"])

    # download rows for the journals that actually have uoft publications
    published_pairs = distinct_issn_pairs(matched_uoft)
    matched_selected = pd.merge(selected_data, published_pairs, how="inner", on=issn_keys)

    # takes a while and can cause memory problems if both dataframes really big
    matched_data = pd.merge(matched_selected, matched_uoft, how="inner",
                            on=issn_keys + ["Year"])
    return matched_data.reset_index(drop=True)

# Part 1: Most Downloaded Journals

This section should help answer which journals are downloaded the most.  It displays the most downloaded journals in:
1. table form
2. bar graph form with breakdown of platforms they're downloaded from
3. bar graph form with breakdown of packages they're present in

## How to Use the Widgets
1. The Data Selection allows the user to choose a subset of the data that they want information for.  The user can choose to specify a month-year time range where they want the most downloaded journals.  They can also specify if they want all types of downloads or only PDF or HTML downloads, if they want to view Scholars Portal downloads or not, and if they only want journals from a specific Platform or Package.  Once the user is finished selecting the subset of data they want, click the Select Data button to load the graphs.  Make sure to press Select Data at the start to load data for display.
2. The Display Selection allows the user to choose how many journals to show, and if it should show all journals or only journals with UofT publications.  Currently it can show the top 5, 10, or 15 journals by download count.  Changing a selection here will automatically update the graph.
3. Hover the mouse over a bar in the bar graph to get more information about that journal in that group, its Publisher, and the number of downloads for that journal in that group.


### Notes:
- It can take a while to load the data for the graphs, especially if the selected time range is large.  The loading bar will turn green once all of the graphs have finished updating.
- If the download type is All, the download counts are over the specified month-year range.  If the download type is HTML or PDF, the download counts are only over the year range specified.  This is due to the month breakdown of downloads only being available for total downloads. 
- If the produced table and graphs are empty, then the selected data subset has no information for your selected time range and specified choices.  Try increasing the time range or choosing less restrictive selections for the package or platform.

In [28]:
# global dataframes used
# All three start as None and are filled in by select_data_update1 when the
# Select Data button is pressed; the trailing comments show how each is built.
selected_data = None #get_selected_report_data("All", 2014, "Jan", 2018, "Dec")
uoft_reports = None #merge_selected_reports_and_uoft(selected_data)
uoft_downloads = None #uoft_reports.drop_duplicates(subset=["Print ISSN", "Online ISSN", "Journal", "Year", "Platform"])

# current display options
# curr_data1 is whichever of selected_data / uoft_downloads is being displayed
curr_data1 = selected_data
# number of journals to show; matches the first option of the Top dropdown
top = 5

In [34]:
# data selection widgets

# date range slider
# one option per month from Jan-2014 to Dec-2018 (5 years x 12 months)
dates = []
for year in range(0, 5):
    for month in range(1, 13):
        dates.append(datetime.date(2014 + year, month, 1))
# (label, value) pairs: label is e.g. " Jan-2014 ", value is the date object
options = [(i.strftime(' %b-%Y '), i) for i in dates]
# starts with the whole range selected
timerange1 = widgets.SelectionRangeSlider(options=options, index=(0,len(dates) - 1), 
                                          description="Time Range", disabled=False,
                                          layout={'width': '500px'})
                   
# download type dropdown
download_dd1 = widgets.Dropdown(options = ["All", "HTML", "PDF"], 
                                description="Downloads:")

# Scholars Portal or not dropdown
sp_dd1 = widgets.Dropdown(options = ["All", "Only SP", "No SP"], description="Choose SP:")

# platforms dropdown
# distinct platforms in the loaded data, sorted, with "All" first
platforms_list = list(report_data["Platform"].value_counts().index)
platforms_list.sort()
platforms_list = ["All"] + platforms_list
platforms_dd1 = widgets.Dropdown(options = platforms_list, description="Platforms:")

# packages dropdown
# distinct package names in the loaded data, sorted, with "All" first
packages_list = list(report_data["FilePackage"].value_counts().index)
packages_list.sort()
packages_list = ["All"] + packages_list
packages_dd1 = widgets.Dropdown(options = packages_list, description="Packages:")

# select data button
data_btn1 = widgets.Button(description='Select Data', disabled=False, button_style='')

# put them all together in a box for display
data_widgets1 = widgets.VBox([timerange1, download_dd1, sp_dd1, platforms_dd1, 
                              packages_dd1, data_btn1])

##### 

# display selection widgets

# dropdown menu of top
top_dd = widgets.Dropdown(options = [5, 10, 15], description='Top:')

# dropdown menu of if uoft or not
uoft_dd = widgets.Dropdown(options = ["All Reports", "Only UofT Reports"], description="Type:")
input_widgets1 = widgets.HBox([top_dd, uoft_dd])

#####

# loading bar
# shared by all Part 1 callbacks, which reset it and tick it as they go
loading3 = widgets.IntProgress(value=0, min=0, max=10, step=1, description='Loading:', 
                               bar_style='info', orientation='horizontal')

#####

# output widgets
# the table is an Output area; the two graphs start empty and are filled by update_journal_figs
counts_table = widgets.Output()
platform_graph = go.FigureWidget() 
package_graph = go.FigureWidget() 

In [35]:
# functions

def get_top_journals(top_num, data):
    """Return the `top_num` most downloaded journals in `data` and their breakdowns.

    Parameters:
        top_num: how many journals to keep.
        data: dataframe with "Journal", "Publisher", "Platform", "FilePackage"
            and "Downloads" columns.

    Returns:
        results: dataframe indexed by Journal with the summed "Downloads",
            sorted from most to least downloaded.
        order: the journal names in that same order.
        platforms: downloads per (Journal, Platform) with a "Publisher" column.
        packages: downloads per (Journal, FilePackage) with a "Publisher" column.

    Journals without a publisher are kept in both breakdowns (with a missing
    Publisher), so they always agree with `results`.
    """
    # aggregate downloads across the different rows of each journal
    results = (data.groupby(["Journal"])["Downloads"].sum()
               .to_frame()
               .sort_values(by="Downloads", ascending=False)[0:top_num])

    # get order
    order = list(results.index)

    # rows belonging to the top journals
    top_journals = data[data["Journal"].isin(results.index)]

    # get publishers
    # each journal only has one publisher
    # any extra publishers is just the same publisher but spelt slightly differently
    publishers = (top_journals.groupby(["Journal", "Publisher"])["Downloads"].sum()
                  .reset_index()
                  .drop_duplicates(subset=["Journal"])
                  .drop(columns=["Downloads"]))

    def breakdown_by(column):
        # downloads per (journal, column value), plus the journal's publisher;
        # a left merge keeps journals that have no publisher recorded
        counts = top_journals.groupby(["Journal", column])["Downloads"].sum().reset_index()
        return counts.merge(publishers, on="Journal", how="left")

    platforms = breakdown_by("Platform")
    packages = breakdown_by("FilePackage")

    return results, order, platforms, packages


def barchart_of_downloads(data, extra, order, totals):
    """Build a bar chart of downloads per journal, coloured by `extra`.

    Parameters:
        data: dataframe with "Journal", "Downloads", "Publisher" and `extra` columns.
        extra: name of the column used for the colour breakdown.
        order: journal names in the order they should appear on the x axis.
        totals: dataframe indexed by journal with a "Downloads" column; each
            total is written above the journal's bar.

    Returns the plotly figure. The title uses the notebook-level `top` value.
    """
    # title wording depends on which breakdown is drawn
    if extra == "Platform":
        title_part = "Platforms"
    else:
        title_part = "Packages"

    # draw a bar chart
    fig = px.bar(data, x='Journal', y='Downloads', color=extra,
                 hover_data=['Downloads', 'Publisher', extra],
                 category_orders={"Journal": order}, height=600, width=900)

    # draw number of downloads at the top of each bar
    annotations = []
    for journal, total in zip(list(totals.index), list(totals["Downloads"])):
        annotations.append(dict(x=journal, y=total, text=str(total), xanchor='center',
                                yanchor='bottom', showarrow=False))

    # add labels to the plot
    title = {'text': "Top " + str(top) + " Journals and Their " + title_part,
             'y':0.95,
             'x':0.5,
             'xanchor': 'center',
             'yanchor': 'top'}
    fig.update_layout(title=title, annotations=annotations)

    return fig


def update_journal_figs(results, order, platforms, packages):
    """Refresh the counts table and both bar-chart widgets.

    Ticks the shared loading bar once after the table and once after each
    chart (three ticks in total).
    """
    # update counts table
    counts_table.clear_output()
    with counts_table:
        display(results)
    loading3.value += 1

    # update the platform chart, then the package chart
    charts = (
        (platform_graph, platforms, "Platform"),
        (package_graph, packages, "FilePackage"),
    )
    for graph_widget, breakdown, column in charts:
        new_fig = barchart_of_downloads(breakdown, column, order, results)
        # swap the widget's traces and layout for the freshly built figure's
        graph_widget.data = []
        graph_widget.add_traces(new_fig.data)
        graph_widget.layout = new_fig.layout
        loading3.value += 1


def top_update(change):
    """Observer for the Top dropdown: remember the new count and redraw.

    `change.new` is the newly selected number of journals. If no data has
    been selected yet (`curr_data1` is None) the new value is only stored,
    instead of failing inside the widget callback.
    """
    global top
    top = change.new

    if curr_data1 is None:
        # nothing to draw until Select Data has been pressed
        return

    counts_table.clear_output()

    # restart the bar part-way, since the data itself does not need reloading
    loading3.value = 0
    loading3.bar_style = ""
    loading3.value = 5

    # don't need to change current data
    results, order, platforms, packages = get_top_journals(top, curr_data1)
    loading3.value = loading3.value + 1

    update_journal_figs(results, order, platforms, packages)
    loading3.value = loading3.value + 1
    loading3.bar_style = "success"
        

def uoft_update(change):
    """Observer for the Type dropdown: switch the displayed dataset and redraw.

    `change.new` is "All Reports" (show `selected_data`) or
    "Only UofT Reports" (show `uoft_downloads`). If the chosen dataset has
    not been loaded yet (it is None) nothing is drawn, instead of failing
    inside the widget callback.
    """
    global curr_data1

    if change.new == "All Reports":
        curr_data1 = selected_data
    elif change.new == "Only UofT Reports":
        curr_data1 = uoft_downloads

    if curr_data1 is None:
        # nothing to draw until Select Data has been pressed
        return

    counts_table.clear_output()

    # restart the bar part-way, since the data itself does not need reloading
    loading3.value = 0
    loading3.bar_style = ""
    loading3.value = 5

    results, order, platforms, packages = get_top_journals(top, curr_data1)
    loading3.value = loading3.value + 1

    update_journal_figs(results, order, platforms, packages)
    loading3.value = loading3.value + 1
    loading3.bar_style = "success"

    
def select_data_update1(change):
    """Click handler for Select Data: rebuild the Part 1 datasets and redraw.

    Reads the data-selection widgets, recomputes the globals `selected_data`,
    `uoft_reports`, `uoft_downloads` and `curr_data1`, then refreshes the
    table and graphs, ticking the loading bar after each stage.
    """
    global selected_data, uoft_reports, uoft_downloads, curr_data1

    def tick():
        loading3.value = loading3.value + 1

    loading3.value = 0
    loading3.bar_style = ""

    # get stuff from selection widgets
    first_i, last_i = timerange1.get_state()["index"]
    range_start = dates[first_i]
    range_end = dates[last_i]
    type_of_downloads = download_dd1.value
    sp_selection = sp_dd1.value
    platform_selection = platforms_dd1.value
    package_selection = packages_dd1.value

    # get selected data
    selected_data = get_selected_report_data(
        type_of_downloads,
        range_start.year, range_start.strftime('%b'),
        range_end.year, range_end.strftime('%b'))
    tick()

    # filter further
    sp_flags = {"Only SP": "Yes", "No SP": "No"}
    if sp_selection in sp_flags:
        selected_data = selected_data[selected_data["SP"] == sp_flags[sp_selection]]
    for column, wanted in (("Platform", platform_selection), ("FilePackage", package_selection)):
        if wanted != "All":
            selected_data = selected_data[selected_data[column] == wanted]
    tick()

    # get uoft publications joined with selected data
    uoft_reports = merge_selected_reports_and_uoft(selected_data)
    tick()

    # save only 1 of each journal that has uoft publications in it
    journal_key = ["Print ISSN", "Online ISSN", "Journal", "Year", "Platform"]
    uoft_downloads = uoft_reports.drop_duplicates(subset=journal_key)
    tick()

    # save data from display selection
    curr_data1 = selected_data if uoft_dd.value == "All Reports" else uoft_downloads
    tick()

    # get data for graphing
    results, order, platforms, packages = get_top_journals(top_dd.value, curr_data1)
    tick()

    # draw figures
    update_journal_figs(results, order, platforms, packages)
    tick()
    loading3.bar_style = "success"

In [36]:
# update when needed
# the two dropdowns redraw as soon as their value changes; the data selection
# is only applied when the button is clicked
top_dd.observe(top_update, names = 'value')
uoft_dd.observe(uoft_update, names = 'value')
data_btn1.on_click(select_data_update1)

# selections in folding accordion tabs
selections1 = widgets.Accordion(children=[data_widgets1, input_widgets1])
selections1.set_title(0, 'Data Selection')
selections1.set_title(1, 'Display Selection')

# output graph tabs
tab1 = widgets.Tab([counts_table, platform_graph, package_graph])
tab1.set_title(0, "Counts")
tab1.set_title(1, 'Graph With Platform')
tab1.set_title(2, 'Graph With Package')

In [37]:
# show the selection accordion, then the loading bar, then the output tabs
display(selections1)
display(loading3)
display(tab1)

Accordion(children=(VBox(children=(SelectionRangeSlider(description='Time Range', index=(0, 59), layout=Layout…

IntProgress(value=0, bar_style='info', description='Loading:', max=10)

Tab(children=(Output(), FigureWidget({
    'data': [], 'layout': {'template': '...'}
}), FigureWidget({
    'd…

# Part 2: UofT Journals: Downloads, Authors, Categories

This section should help give an idea of what the journals that currently have uoft publications in them are like.
- Stats: Some basic stats about the number and proportion of uoft publications.
- Distributions: The distribution of total downloads, number of uoft authors per publication, proportion of uoft authors to all authors of the publication, and number of pages per publication.
- Journal Types: A breakdown of publication types and document types.
- Categories: A breakdown of heading, subheading, and subject categories.

## How to Use the Widgets
1. The Data Selection works the same way as it does in Part 1.  It allows the user to choose a subset of the data that they want information for.  Make sure to press Select Data at the start to load data for display.

2. The Display Selection allows the user to also choose a subset of data based on heading category and subheading category.

3. In the pie charts, clicking an item in the legend will omit it from the pie chart, to help gain an idea of what the distribution is like without that element, and clicking it again will bring it back.


### Notes:
- A journal can have multiple heading and subheading categories.  Even if a heading or subheading category is specified, other categories can still show up in the pie chart, because a single publication can belong to several headings or subheadings at once.

In [71]:
# dataframes
# Part 2 counterparts of the Part 1 globals; all start empty (None)
selected_data2 = None 
uoft_reports2 = None 
uoft_downloads2 = None 

# selections
# the "All ..." values are the ones select_topics treats as "no filter"
head_selection = "All Headings"
subhead_selection = "All Subheadings"
curr_data2 = uoft_reports2
curr_downloads = uoft_downloads2
proportions = None

In [72]:
# functions
def select_topics(head, subhead, uoft_selected_data):
    """Filter publications by heading and subheading category.

    Parameters:
        head: heading text to look for in "Category: Heading 1", or
            "All Headings" for no heading filter.
        subhead: subheading text to look for in "Category: Subheadings", or
            "All Subheadings" for no subheading filter.
        uoft_selected_data: dataframe of publications joined with report data.

    Returns:
        (filtered, unique_downloads): the publications whose category text
        contains the requested values, and the same rows reduced to one per
        (Print ISSN, Online ISSN, Journal, Year, Platform).

    A row matches when its category cell is a string containing the requested
    text; missing cells never match. Rows are picked with a boolean mask, so
    repeated index labels cannot duplicate rows.
    """
    def rows_mentioning(frame, column, needle):
        # True only for string cells that contain the needle
        mask = frame[column].map(
            lambda value: isinstance(value, str) and needle in value).astype(bool)
        return frame.loc[mask]

    filtered = uoft_selected_data
    if head != "All Headings":
        filtered = rows_mentioning(filtered, "Category: Heading 1", head)
    if subhead != "All Subheadings":
        filtered = rows_mentioning(filtered, "Category: Subheadings", subhead)

    # unique downloads cause there are sometimes multiple publications per journal
    unique_downloads = filtered.drop_duplicates(
        subset=["Print ISSN", "Online ISSN", "Journal", "Year", "Platform"])
    return filtered, unique_downloads


# get stats about the number and proportion of uoft publications
def get_stats(all_data, download_data):
    """Build the summary table shown on the Stats tab.

    Parameters
    ----------
    all_data : pandas.DataFrame
        One row per publication; must have a "NumUofTAuthors" column.
    download_data : pandas.DataFrame
        The de-duplicated journal rows for the same selection.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Stat" with a single "Value" column of strings.

    Notes
    -----
    The percentage is ``len(download_data)`` relative to the number of rows in
    the module-level ``selected_data2``.  When ``selected_data2`` has no rows
    the percentage is undefined and is reported as "N/A" instead of raising
    ZeroDivisionError.
    """
    total_selected = len(selected_data2)
    if total_selected:
        percentage = str(round((len(download_data) / total_selected) * 100, 3)) + "%"
    else:
        # the filters matched nothing, so there is no denominator
        percentage = "N/A"

    stats = {"Number of UofT publications in journals subscribed to:": str(len(all_data)),
             "Number of UofT authors of publications in journals subscribed to:": str(sum(all_data["NumUofTAuthors"])),
             "Number of journals with UofT publications subscribed to:": str(len(download_data)),
             "Percentage of journals with UofT publications subscribed to:": percentage
            }
    stats = pd.DataFrame(stats.items(), columns=["Stat", "Value"])
    stats = stats.set_index("Stat")
    return stats


# get counts of document types
def get_doctype_counts(all_data):
    """Tally publications per document type.

    A "Document Type" cell may list several types separated by ',' or ';'.
    Every listed type (whitespace-stripped) is credited with the number of
    rows that share that exact cell text.  Returns a dict of type -> count.
    """
    per_cell = all_data["Document Type"].value_counts()

    counts = {}
    for cell_text, n_rows in zip(per_cell.index, per_cell.values):
        for piece in cell_text.replace(',', ';').split(";"):
            name = piece.strip()
            counts[name] = counts.get(name, 0) + n_rows
    return counts


# get category counts
def get_cat_counts(all_data, column):
    """Tally rows per category for the given category column.

    Cells of ``all_data[column]`` may hold several categories separated by
    ',' or ';'; each one is stripped and counted once per row that has that
    exact cell text.  Returns a dict of category -> count.
    """
    frequencies = all_data[column].value_counts()
    cell_texts = list(frequencies.index)
    cell_sizes = list(frequencies.values)

    tally = {}
    for position, cell_text in enumerate(cell_texts):
        for token in cell_text.replace(',', ';').split(";"):
            token = token.strip()
            tally.setdefault(token, 0)
            tally[token] += cell_sizes[position]

    return tally


# get subject breakdown counts
def get_subject_counts(all_data):
    """Count publications per subject and fold the small subjects into "Other".

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must have a "Category: Subjects" column whose cells may list several
        subjects separated by ',' or ';'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * condensed counts, columns "Subject" / "Num Publications": every
          subject with at least a quarter of the largest subject's count,
          followed by one "Other" row holding the sum of the rest;
        * full counts, columns "Subject" / "Number of Publications": every
          subject, nothing folded.

        Both frames are empty (with the same columns) when the input has no
        subjects at all, instead of raising ValueError from ``max()``.
    """
    per_cell = all_data["Category: Subjects"].value_counts()

    all_cat3_counts = {}
    for cell_text, n_rows in zip(per_cell.index, per_cell.values):
        for piece in cell_text.replace(',', ';').split(";"):
            # also turn it all lowercase since sometimes the subject appears as all caps and
            # sometimes only the first character is capitalized
            subject = piece.strip().lower()
            all_cat3_counts[subject] = all_cat3_counts.get(subject, 0) + n_rows

    cat3_counts = {}
    if all_cat3_counts:
        # anything below a quarter of the biggest subject is lumped together
        bound = max(all_cat3_counts.values()) / 4
        other_total = 0
        for subject, count in all_cat3_counts.items():
            if count < bound:
                other_total = other_total + count
            else:
                cat3_counts[subject] = count
        cat3_counts["Other"] = other_total

    all_cat3_data = pd.DataFrame(list(all_cat3_counts.items()),
                                 columns=['Subject', 'Number of Publications'])
    cat3_data = pd.DataFrame(list(cat3_counts.items()),
                             columns=["Subject", "Num Publications"])
    return cat3_data, all_cat3_data

In [73]:
# data selection widgets

# time range slider, initially set from the first to the last entry of dates
timerange2 = widgets.SelectionRangeSlider(
    options=options,
    index=(0, len(dates) - 1),
    description="Time Range",
    disabled=False,
    layout={'width': '500px'},
)

# download type selection dropdown
download_dd2 = widgets.Dropdown(
    options=["All", "HTML", "PDF"],
    description="Downloads:",
)

# Scholars Portal selection dropdown
sp_dd2 = widgets.Dropdown(
    options=["All", "Only SP", "No SP"],
    description="Choose SP:",
)

# platforms dropdown
platforms_dd2 = widgets.Dropdown(options=platforms_list, description="Platforms:")

# packages dropdown
packages_dd2 = widgets.Dropdown(options=packages_list, description="Packages:")

# Select Data button
data_btn2 = widgets.Button(description='Select Data', disabled=False, button_style='')

# stack the data selection controls vertically, button last
data_widgets2 = widgets.VBox([
    timerange2,
    download_dd2,
    sp_dd2,
    platforms_dd2,
    packages_dd2,
    data_btn2,
])

#####

# display widgets

# headings dropdown: the catch-all entry followed by every heading found in uoft_data
head_dd = widgets.Dropdown(
    options=["All Headings", *get_cat_counts(uoft_data, "Category: Heading 1")],
)

# subheadings dropdown: the catch-all entry followed by every subheading found in uoft_data
subhead_dd = widgets.Dropdown(
    options=["All Subheadings", *get_cat_counts(uoft_data, "Category: Subheadings")],
)

# heading and subheading pickers side by side
input_widgets2 = widgets.HBox([head_dd, subhead_dd])

#####

# loading bar
loading4 = widgets.IntProgress(
    value=0,
    min=0,
    max=15,
    step=1,
    description='Loading:',
    bar_style='info',
    orientation='horizontal',
)

#####

# output widgets

# stats table
uoft_stats = widgets.Output()

# distributions (histograms)
download_dist = go.FigureWidget()
num_authors_dist = go.FigureWidget()
prop_authors_dist = go.FigureWidget()
num_pages_dist = go.FigureWidget()

# journal types (pie charts)
pubtype_dist = go.FigureWidget()
doctype_dist = go.FigureWidget()

# categories (pie charts)
cat1_graph = go.FigureWidget()
cat2_graph = go.FigureWidget()
cat3_graph = go.FigureWidget()

In [74]:
# update functions

# select data based on data selections
def select_data_update2(change):
    """Rebuild the selected data from the Data Selection controls and redraw everything.

    Registered as the click handler of the "Select Data" button; ``change`` is
    whatever the button passes to its handler and is not used.

    Reads the time range, download type, SP, platform and package controls,
    fetches the matching report rows, applies the SP / platform / package
    filters, merges the result with the UofT publications, applies the current
    heading / subheading selection and refreshes the stats and every figure.
    The results are stored in the module-level variables declared global below.
    """
    global selected_data2, uoft_reports2, uoft_downloads2
    global curr_data2, curr_downloads, proportions

    uoft_stats.clear_output()
    loading4.bar_style = ""
    loading4.value = 0

    # read every control up front
    first_idx, last_idx = timerange2.get_state()["index"]
    first_date = dates[first_idx]
    last_date = dates[last_idx]
    downloads_kind = download_dd2.value
    sp_choice = sp_dd2.value
    platform_choice = platforms_dd2.value
    package_choice = packages_dd2.value

    # year plus abbreviated month name ('%b') for each end of the range
    selected_data2 = get_selected_report_data(
        downloads_kind,
        first_date.year, first_date.strftime('%b'),
        last_date.year, last_date.strftime('%b'))
    loading4.value += 1

    # "All" leaves the corresponding column unfiltered
    sp_flag = {"Only SP": "Yes", "No SP": "No"}.get(sp_choice)
    if sp_flag is not None:
        selected_data2 = selected_data2[selected_data2["SP"] == sp_flag]
    if platform_choice != "All":
        selected_data2 = selected_data2[selected_data2["Platform"] == platform_choice]
    if package_choice != "All":
        selected_data2 = selected_data2[selected_data2["FilePackage"] == package_choice]
    loading4.value += 1

    uoft_reports2 = merge_selected_reports_and_uoft(selected_data2)
    loading4.value += 1

    # extra tick with no work attached; it still counts toward the bar's total
    loading4.value += 1

    curr_data2, curr_downloads = select_topics(head_selection, subhead_selection, uoft_reports2)
    # share of each publication's authors that are UofT authors
    author_share = curr_data2["NumUofTAuthors"] / curr_data2["Total number of authors"]
    proportions = pd.DataFrame(author_share).rename(columns={0: "Proportions"})
    update_uoft_figs(curr_data2, curr_downloads, proportions)

    loading4.value += 1
    loading4.bar_style = "success"
    

# heading selection changed, update graphs
def head_update(change):
    """React to a new heading choice: re-filter the merged data and redraw.

    ``change`` is the notification delivered for the heading dropdown's value;
    ``change.new`` is the newly selected heading.

    The choice is always recorded in the module-level ``head_selection``.  If
    no data has been selected yet (``uoft_reports2`` is still None) nothing is
    redrawn; the recorded choice is applied by ``select_data_update2`` on the
    next "Select Data" click.  Previously this case raised AttributeError
    inside the callback after clearing the stats and resetting the bar.
    """
    global head_selection
    global curr_data2
    global curr_downloads
    global proportions

    head_selection = change.new
    if uoft_reports2 is None:
        # nothing to filter until "Select Data" has been clicked
        return

    uoft_stats.clear_output()

    loading4.bar_style = ""
    # the data-selection steps are skipped here, so start the bar part-way
    loading4.value = 5

    curr_data2, curr_downloads = select_topics(head_selection, subhead_selection, uoft_reports2)
    proportions = curr_data2["NumUofTAuthors"] / curr_data2["Total number of authors"]
    proportions = pd.DataFrame(proportions).rename(columns={0: "Proportions"})
    update_uoft_figs(curr_data2, curr_downloads, proportions)

    loading4.value = loading4.value + 1
    loading4.bar_style = "success"
    

# subheading selection changed, update graphs
def subhead_update(change):
    """React to a new subheading choice: re-filter the merged data and redraw.

    ``change`` is the notification delivered for the subheading dropdown's
    value; ``change.new`` is the newly selected subheading.

    The choice is always recorded in the module-level ``subhead_selection``.
    If no data has been selected yet (``uoft_reports2`` is still None) nothing
    is redrawn; the recorded choice is applied by ``select_data_update2`` on
    the next "Select Data" click.  Previously this case raised AttributeError
    inside the callback after clearing the stats and resetting the bar.
    """
    global subhead_selection
    global curr_data2
    global curr_downloads
    global proportions

    subhead_selection = change.new
    if uoft_reports2 is None:
        # nothing to filter until "Select Data" has been clicked
        return

    uoft_stats.clear_output()

    loading4.bar_style = ""
    # the data-selection steps are skipped here, so start the bar part-way
    loading4.value = 5

    curr_data2, curr_downloads = select_topics(head_selection, subhead_selection, uoft_reports2)
    proportions = curr_data2["NumUofTAuthors"] / curr_data2["Total number of authors"]
    proportions = pd.DataFrame(proportions).rename(columns={0: "Proportions"})
    update_uoft_figs(curr_data2, curr_downloads, proportions)

    loading4.value = loading4.value + 1
    loading4.bar_style = "success"
    
    
# update the distributions and stats
def update_uoft_figs(curr_data2, curr_downloads, proportions):
    """Refresh the stats table, the four histograms and (via update_pies) the pie charts.

    Parameters
    ----------
    curr_data2 : DataFrame of the currently selected publications.
    curr_downloads : DataFrame of the de-duplicated journal rows; needs "Downloads".
    proportions : DataFrame with a "Proportions" column.

    The loading bar advances by one after the stats and after each histogram.
    """
    def _show(widget, figure):
        # replace the widget's traces and layout with the new figure's, then tick the bar
        widget.data = []
        widget.add_traces(figure.data)
        widget.layout = figure.layout
        loading4.value = loading4.value + 1

    with uoft_stats:
        display(get_stats(curr_data2, curr_downloads))
    loading4.value += 1

    # download distribution
    _show(download_dist,
          px.histogram(curr_downloads, x="Downloads", marginal="box", nbins=200,
                       title="Downloads of Journals with UofT Publications", height=400))

    # proportion of UofT authors distribution
    _show(prop_authors_dist,
          px.histogram(proportions, x="Proportions", nbins=20,
                       title="Proportion of UofT Authors in Publications", height=400,
                       labels={"Proportions": "Number of UofT Authors / Total Number of Authors"}))

    # number of UofT authors distribution
    _show(num_authors_dist,
          px.histogram(curr_data2, x="NumUofTAuthors", marginal="box", nbins=100,
                       title="Number of UofT Authors in Publications", height=400,
                       labels={"NumUofTAuthors": "Number of UofT Authors"}))

    # number of pages distribution
    _show(num_pages_dist,
          px.histogram(curr_data2, x="Number of Pages", marginal="box", nbins=100,
                       title="Number of Pages in UofT Publications", height=400))

    update_pies(curr_data2)
        

# update the pie charts
def update_pies(curr_data2):
    """Redraw the five pie charts from the currently selected publications.

    Covers publication types, document types, heading categories, subheading
    categories and subjects (with small subjects folded into "Other" by
    get_subject_counts).  The loading bar advances by one after each chart.
    """
    def _redraw(widget, figure):
        # replace the widget's traces and layout with the new figure's, then tick the bar
        widget.data = []
        widget.add_traces(figure.data)
        widget.layout = figure.layout
        loading4.value = loading4.value + 1

    # publication types
    pubtype_counts = curr_data2["PubType"].value_counts()
    _redraw(pubtype_dist,
            px.pie(values=list(pubtype_counts.values),
                   names=list(pubtype_counts.index),
                   title='Publication Types'))

    # document types
    doctype_counts = get_doctype_counts(curr_data2)
    _redraw(doctype_dist,
            px.pie(values=list(doctype_counts.values()),
                   names=list(doctype_counts.keys()),
                   title='Document Types'))

    # heading categories
    heading_counts = get_cat_counts(curr_data2, "Category: Heading 1")
    _redraw(cat1_graph,
            px.pie(values=list(heading_counts.values()),
                   names=list(heading_counts.keys()),
                   title='Heading 1 Categories',
                   labels={"Category: Heading 1": "Num Publications"}))

    # subheading categories
    subheading_counts = get_cat_counts(curr_data2, "Category: Subheadings")
    _redraw(cat2_graph,
            px.pie(values=list(subheading_counts.values()),
                   names=list(subheading_counts.keys()),
                   title='Subheading Categories'))

    # subjects: only the condensed frame is plotted
    subject_counts, _ = get_subject_counts(curr_data2)
    _redraw(cat3_graph,
            px.pie(subject_counts, values="Num Publications",
                   names="Subject",
                   title='Subject Categories'))

In [75]:
# display

def _titled_tab(children, titles):
    # build a Tab holding the children and label its pages in order
    tab = widgets.Tab(children)
    for position, title in enumerate(titles):
        tab.set_title(position, title)
    return tab


# hook the controls up to their callbacks
head_dd.observe(head_update, names='value')
subhead_dd.observe(subhead_update, names='value')
data_btn2.on_click(select_data_update2)

# collapsible panels holding the two groups of controls
selections2 = widgets.Accordion(children=[data_widgets2, input_widgets2])
selections2.set_title(0, 'Data Selection')
selections2.set_title(1, 'Display Selection')

# one inner tab set per group of figures
dist_tab = _titled_tab(
    [download_dist, num_authors_dist, prop_authors_dist, num_pages_dist],
    ["Downloads", 'Num Authors', 'Prop Authors', 'Pages'])

type_tab = _titled_tab(
    [pubtype_dist, doctype_dist],
    ["Publication Types", 'Document Types'])

cat_tab = _titled_tab(
    [cat1_graph, cat2_graph, cat3_graph],
    ["Headings", 'Subheadings', 'Subjects'])

# outer tab set: the stats table followed by the three inner tab sets
main_uoft_tab = _titled_tab(
    [uoft_stats, dist_tab, type_tab, cat_tab],
    ["Stats", 'Distributions', 'Journal Types', 'Categories'])

In [76]:
# render the controls, the loading bar and the output tabs, in that order
display(selections2, loading4, main_uoft_tab)

Accordion(children=(VBox(children=(SelectionRangeSlider(description='Time Range', index=(0, 59), layout=Layout…

IntProgress(value=0, bar_style='info', description='Loading:', max=15)

Tab(children=(Output(), Tab(children=(FigureWidget({
    'data': [], 'layout': {'template': '...'}
}), FigureW…