# Sharing by article type

## Imports 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
## Imports from preprocessing module
from preprocessing import load_clean_dataset

## Constants

In [3]:
# URL of the zipped data extract that is passed to load_clean_dataset.
FILE_NAME = (
    'https://raw.githubusercontent.com/TomMonks/'
    'des_sharing_lit_review/main/data/share_sim_data_extract.zip'
)

## Functions to generate a high level summary table.

We include three functions to generate a high level summary table.  

* The first calculates the summary metrics from the main table.
* The second function creates a tabular view of the data split by item type.
* The third function formats this as a table suitable for journal publication.  In our paper this is Table 2.

In [4]:
def high_level_metrics(df, name='None'):
    '''A simple high level summary of the review.

    Returns a series containing simple high level counts
    and proportions (0-1 scale) in the data.

    A proportion whose denominator is zero (for example a subgroup that
    contains no COVID-19 studies) is reported as NaN instead of raising
    a ZeroDivisionError.

    Params:
    -------
    df: pd.DataFrame
        A cleaned dataset.  Could be overall or subgroups/categories

    name: str, optional (default='None')
        Name assigned to the returned series.

    Returns:
    --------
        pd.Series
            Indexed by metric name (n_included, n_foss, ..., per_avail, ...)
    '''
    def ratio(numerator, denominator):
        # guard against empty subgroups: undefined proportion -> NaN
        return numerator / denominator if denominator else float('nan')

    included = df[df['study_included'] == 1]
    available = included[included['model_code_available'] == 1]

    results = {}

    # counts across all included studies
    results['n_included'] = len(included)
    # NOTE: foss_sim is compared with the string '1' whereas the other
    # flags are compared with the integer 1.
    # presumably the cleaned data stores foss_sim as text - TODO confirm
    results['n_foss'] = len(included[included['foss_sim'] == '1'])
    results['n_covid'] = len(included[included['covid'] == 1])

    # counts across included studies that shared their model code
    results['n_avail'] = len(available)
    results['n_foss_avail'] = len(available[available['foss_sim'] == '1'])
    results['n_covid_avail'] = len(available[available['covid'] == 1])

    # proportions of all included studies
    results['per_foss'] = ratio(results['n_foss'], results['n_included'])
    results['per_covid'] = ratio(results['n_covid'], results['n_included'])
    results['per_avail'] = ratio(results['n_avail'], results['n_included'])

    # proportions of each subgroup that shared a model
    results['per_foss_avail'] = ratio(results['n_foss_avail'],
                                      results['n_foss'])
    results['per_covid_avail'] = ratio(results['n_covid_avail'],
                                       results['n_covid'])

    # studies whose reporting guideline entry is anything other than 'None'
    mentions_guide = included['reporting_guidelines_mention'] != 'None'
    results['reporting_guide'] = len(included[mentions_guide])
    results['per_reporting_guide'] = ratio(results['reporting_guide'],
                                           results['n_included'])
    return pd.Series(results, name=name)

In [5]:
def analysis_by_item_type(df_clean, decimals=4):
    '''
    Summarise the dataset overall and separately for each item type.

    One column of metrics is produced for the full dataset (named
    'overall') followed by one column for every distinct value found in
    the 'item_type' column, in order of first appearance.

    Params:
    -------
    df_clean: pd.DataFrame
        Assumes a cleaned version of the dataset.

    decimals: int, optional (default=4)
        Number of decimal places the results are rounded to.

    Returns:
    -------
    pd.DataFrame
        Metrics as rows; 'overall' and each item type as columns.

    '''
    # the overall summary always comes first
    summaries = [high_level_metrics(df_clean, 'overall')]

    for item_type in df_clean['item_type'].unique().tolist():
        rows = df_clean[df_clean['item_type'] == item_type]
        summaries.append(high_level_metrics(rows, name=item_type))

    # each series becomes a row; transpose so that metrics are the rows
    table = pd.DataFrame(summaries).T
    return table.round(decimals)


In [6]:
def format_table2(summary, article_type='overall'):
    '''
    Create a formatted table 2 of results for manuscript.

    Params:
    -------
    summary: pd.DataFrame
        Summary metrics as rows (n_included, n_covid, n_foss, n_avail,
        n_covid_avail, n_foss_avail, per_avail, per_covid_avail,
        per_foss_avail) with one column per article type.

    article_type: str, optional (default='overall')
        Name of the column in summary to format.

    Returns:
    -------
    pd.DataFrame
        Indexed by 'metric' (Total, COVID-19, FOSS) with two text columns:
        the total count and the number shared in 'n (%)' format.
    '''
    total_rows = ['n_included', 'n_covid', 'n_foss']
    avail_rows = ['n_avail', 'n_covid_avail', 'n_foss_avail']
    per_rows = ['per_avail', 'per_covid_avail', 'per_foss_avail']
    # raw string: the backslash is literal so that the % is escaped in LaTeX
    shared_col = r'shared n (\%)'

    # only work with the selected column
    selected = summary[article_type]

    # total number of papers
    totals = selected.loc[total_rows].reset_index(drop=True)
    totals = totals.map('{:,.0f}'.format)

    # no. models that are available from the total
    shared = selected.loc[avail_rows].reset_index(drop=True)
    shared = shared.map('{:,.0f}'.format)

    # percentage of papers; scale the numeric values only (not the labels)
    per = selected.loc[per_rows].reset_index(drop=True) * 100
    per = per.map('{:,.1f}'.format)

    # construct table and format columns in n (%) format
    t1 = pd.DataFrame({
        'metric': ['Total', 'COVID-19', 'FOSS'],
        article_type: totals,
        shared_col: shared + ' (' + per + ')',
    })
    return t1.set_index('metric')
    

In [7]:
def format_table3(journals, confs, books):
    '''
    Converts three formatted tables into a single formatted table

    Params:
    -----
    journals - dataframe
        Table 2 containing journals only

    confs - dataframe
        Table 2 containing conferences only

    books - dataframe
        Table 2 containing books only

    Returns:
    -------
    pd.DataFrame
        The three tables placed side by side and transposed, so each input
        column becomes a row.  Rows are relabelled with a two level index
        of (article type, column label); this assumes each input table
        has exactly two columns.
    '''
    # raw string: the backslash is literal so that the % is escaped in LaTeX
    shared_label = r"shared n (\%)"

    comb = pd.concat([journals, confs, books], axis=1).T
    comb.index = pd.MultiIndex.from_tuples([
        ("Journal", "n"), ("Journal", shared_label),
        ("Conference", "n"), ("Conference", shared_label),
        ("Book", "n"), ("Book", shared_label),
    ])
    return comb

## Read in data

In [8]:
# Load the dataset from FILE_NAME using the preprocessing module's loader.
clean = load_clean_dataset(FILE_NAME)

## Results

### Create a high level summary of the main dataset by article type.

In the manuscript, Table 3 provides simple high-level results for journal, conference, and book manuscripts.

In [9]:
def get_shared_models(df):
    '''
    Return the included studies that made their model code available.

    Params:
    -------
    df: pd.DataFrame
        A cleaned dataset containing the 'study_included' and
        'model_code_available' columns.

    Returns:
    -------
    pd.DataFrame
        Rows of df where both study_included and model_code_available
        equal 1.
    '''
    # filter the dataframe that was passed in (not the global dataset)
    included = df[df['study_included'] == 1]
    return included[included['model_code_available'] == 1]

In [10]:
# Inspect the column names available for the shared-model subset.
get_shared_models(clean).columns

Index(['key', 'item_type', 'pub_yr', 'author', 'title', 'pub_title', 'doi',
       'url', 'study_included', 'model_code_available',
       'reporting_guidelines_mention', 'covid', 'sim_software', 'foss_sim',
       'model_archive', 'model_repo', 'model_journal_supp',
       'model_journal_supp', 'model_personal_org', 'model_platform',
       'available_on_req', 'excluded_reason'],
      dtype='object')

In [11]:
# Summary metrics: one column for 'overall' plus one per item type.
results = analysis_by_item_type(clean)

In [12]:
# Format the summary column of each article type separately ...
journals = format_table2(results, article_type='journalArticle')
confs = format_table2(results, article_type='conferencePaper')
books = format_table2(results, article_type='book')

# ... then combine the three formatted tables into a single table.
comb = format_table3(journals, confs, books)
# bare expression so that the notebook displays the combined table
comb

Unnamed: 0,metric,Total,COVID-19,FOSS
Journal,n,364,44,68
Journal,shared n (\%),38 (10.4),12 (27.3),22 (32.4)
Conference,n,98,14,14
Conference,shared n (\%),4 (4.1),4 (28.6),3 (21.4)
Book,n,24,1,6
Book,shared n (\%),1 (4.2),0 (0.0),1 (16.7)


## Table 3 LaTeX

Here we generate the LaTeX to insert into a journal publication.

In [13]:
# Print the combined table as LaTeX (article types as rows); hrules=True
# adds the top, mid and bottom rules.
print(comb.style.to_latex(hrules=True, 
                          label="Table:3", 
                          caption="Sharing of models by article type."))

\begin{table}
\caption{Sharing of models by article type.}
\label{Table:3}
\begin{tabular}{lllll}
\toprule
 & metric & Total & COVID-19 & FOSS \\
\midrule
\multirow[c]{2}{*}{Journal} & n & 364 & 44 & 68 \\
 & shared n (\%) & 38 (10.4) & 12 (27.3) & 22 (32.4) \\
\multirow[c]{2}{*}{Conference} & n & 98 & 14 & 14 \\
 & shared n (\%) & 4 (4.1) & 4 (28.6) & 3 (21.4) \\
\multirow[c]{2}{*}{Book} & n & 24 & 1 & 6 \\
 & shared n (\%) & 1 (4.2) & 0 (0.0) & 1 (16.7) \\
\bottomrule
\end{tabular}
\end{table}



In [14]:
# Alternative layout: the transposed table (metrics as rows, article types
# as column groups).
# NOTE(review): this reuses the same label ("Table:3") as the previous
# cell - include only one of the two tables in a LaTeX document.
print(comb.T.style.to_latex(hrules=True, 
                          label="Table:3", 
                          caption="Sharing of models by article type."))

\begin{table}
\caption{Sharing of models by article type.}
\label{Table:3}
\begin{tabular}{lllllll}
\toprule
 & \multicolumn{2}{r}{Journal} & \multicolumn{2}{r}{Conference} & \multicolumn{2}{r}{Book} \\
 & n & shared n (\%) & n & shared n (\%) & n & shared n (\%) \\
metric &  &  &  &  &  &  \\
\midrule
Total & 364 & 38 (10.4) & 98 & 4 (4.1) & 24 & 1 (4.2) \\
COVID-19 & 44 & 12 (27.3) & 14 & 4 (28.6) & 1 & 0 (0.0) \\
FOSS & 68 & 22 (32.4) & 14 & 3 (21.4) & 6 & 1 (16.7) \\
\bottomrule
\end{tabular}
\end{table}

