# TODO
1. replicate all the other report graphs
    - graphs to create: 
        - revision.png [x]
        - time_at_editor.png [x]
        - main_issues.png [x]
        - download_histograms.png
        - downloads.png
2. create a dashboard containing all the graphs

In [4]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from datetime import datetime

## Revision.png & time_at_editor.png

In [2]:
analysis_df = pd.read_stata('data/git-events.dta')

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  analysis_df = pd.read_stata('data/git-events.dta')


In [3]:
analysis_df = analysis_df.sort_values(by=['MS', 'numeric_date'])
analysis_df['spell'] = (analysis_df['numeric_date']-analysis_df['numeric_date'].shift(1))/3600/24
analysis_df['matching'] = (analysis_df['MS'] == analysis_df['MS'].shift(1))
make_missing = analysis_df.index[analysis_df['matching']==False]
analysis_df.loc[make_missing, 'spell'] = np.NaN
analysis_df['at_editor'] = (analysis_df['branch_imputed'].shift(1) == analysis_df['branch_imputed']) | (analysis_df['branch_imputed'].shift(1) == 'author')
analysis_df['change'] = -(analysis_df['at_editor'].shift(1)==analysis_df['at_editor'])
analysis_df['spell_id'] = analysis_df.groupby('MS')['change'].transform(pd.Series.cumsum)

In [4]:
collapsed = analysis_df.groupby(by=['MS', 'accepted_at', 'spell_id', 'at_editor']).sum().reset_index()
collapsed = collapsed[['MS', 'accepted_at', 'spell_id', 'at_editor', 'spell']]
collapsed = collapsed[collapsed['spell_id'] != 1]
collapsed['spell_id'] = np.floor((collapsed['spell_id']-1)/2)
collapsed['spell_id'] = collapsed['spell_id'].astype(int)
collapsed = collapsed.pivot(index=['MS', 'spell_id'], columns='at_editor', values=['spell', 'accepted_at']).reset_index()
collapsed.columns = ['MS', 'revision','time_at_author', 'time_at_editor', 'accepted_at0', 'accepted_at']
collapsed.drop('accepted_at0',axis=1,inplace=True)
collapsed['max_revision'] = collapsed.groupby('MS')['revision'].transform(pd.Series.max)

  collapsed = analysis_df.groupby(by=['MS', 'accepted_at', 'spell_id', 'at_editor']).sum().reset_index()


In [41]:
time_at_editor_chart = go.Figure()
time_at_editor_chart.add_trace(go.Histogram(
        x=collapsed[collapsed['revision']==0]['time_at_editor'],
        hovertemplate="%{y}",
        name='First submission',
        )
    )
time_at_editor_chart.add_trace(go.Histogram(
        x=collapsed[collapsed['revision']>0]['time_at_editor'],
        hovertemplate="%{y}",
        name='Revision',
    )
)
time_at_editor_chart.update_layout(
    title={
            "text": f"Time at editor",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    barmode='overlay',
    xaxis_title_text='Days at editor',
    yaxis_title_text='Frequency',
)
time_at_editor_chart.update_traces(opacity=0.5)
time_at_editor_chart.show()

In [35]:
revisions_chart = go.Figure(data = go.Histogram(
            x=collapsed[collapsed['revision']==0]['max_revision'],
            hovertemplate="%{y}<extra></extra>"
        )
    )
revisions_chart.update_layout(
    title={
            "text": f"Number of revision round for accepted packages",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    barmode='overlay',
    xaxis_title_text='Accepted revison',
    yaxis_title_text='Frequency',
)
revisions_chart.update_traces(opacity=0.5)
revisions_chart.show()

## main_issues.png

In [16]:
issues = pd.read_stata("data/issues.dta")
issues.drop(['MS'],axis=1, inplace=True)

In [69]:
mean_issues = dict()
for col in issues.columns:
    mean_issues[col] = issues[col].mean()

mean_issues = pd.DataFrame(mean_issues,index=[0]).melt()
mean_issues.columns = ['issue', 'value']
mean_issues['value'] = mean_issues['value'] * 100 
mean_issues.loc[0,'issue'] = "Cite data"
mean_issues.loc[2,'issue'] = "Confidential data"
mean_issues.loc[3,'issue'] = "Save output"
mean_issues.loc[4,'issue'] = "Relative path"
mean_issues.loc[5,'issue'] = "Include data"
mean_issues.loc[6,'issue'] = "Stata packages"
mean_issues.loc[7,'issue'] = "Matlab toolboxes"

In [85]:
main_issues_chart = go.Figure(data=go.Scatter(
        x=mean_issues['value'],
        y=mean_issues['issue'],
        hovertemplate="%{x}<extra></extra>",
        orientation='h',
        mode='markers'
    )
)
for i in range(mean_issues.shape[0]):
    main_issues_chart.add_shape(
        type='line',
        x0=0, y0=mean_issues.loc[i,'issue'], 
        x1=mean_issues.loc[i,'value'], y1=mean_issues.loc[i,'issue'], 
        line_color="#cccccc"
    )

main_issues_chart.update_layout(
    title={
            "text": f"Main issues during revision of packages",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    xaxis_title_text='Percent',
)
main_issues_chart.update_traces(marker_size=10)
main_issues_chart.show()

## Downloads.png & Downloads_histogram.png

In [136]:
zenodo22 = pd.read_csv('zenodo/zenodo_data_2022.csv')
zenodo22['created_at'] = zenodo22['created'].apply(lambda x: datetime.strptime(x.split('T')[0],'%Y-%m-%d'))
zenodo22.drop(['downloads', 'views', 'revision', 'created'], axis=1, inplace=True)
zenodo22.columns = ['id', 'downloads', 'views', 'created_at']
zenodo22['year'] = '2022'

In [137]:
creation_time = zenodo22[['id', 'created_at']]

In [138]:
zenodo21 = pd.read_csv('zenodo/zenodo_data_2021.csv')
zenodo21.drop(['downloads', 'views', 'revisions'], axis=1, inplace=True)
zenodo21.columns = ['id', 'downloads', 'views']
zenodo21['year'] = '2021'
zenodo21 = zenodo21.merge(creation_time,how='left', on='id')

In [139]:
zenodo = pd.concat((zenodo21,zenodo22))

In [140]:
zenodo['starts_at'] = "2021-09-14"
zenodo.loc[zenodo['year']=='2022','starts_at'] = "2022-09-07"
zenodo['starts_at'] = zenodo['starts_at'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d'))

In [141]:
zenodo['since'] = 12 * (
    zenodo['starts_at'].dt.year-
    zenodo['created_at'].dt.year
    ) + (
    zenodo['starts_at'].dt.month-
    zenodo['created_at'].dt.month
    )
zenodo['downloads_per_month'] = zenodo['downloads']/zenodo['since']
zenodo.sort_values('id')

Unnamed: 0,id,downloads,views,year,created_at,starts_at,since,downloads_per_month
79,3366100,22.0,105.0,2021,2019-08-12,2021-09-14,25,0.880000
131,3366100,62.0,214.0,2022,2019-08-12,2022-09-07,37,1.675676
78,3908270,21.0,53.0,2021,2020-06-25,2021-09-14,15,1.400000
130,3908270,32.0,108.0,2022,2020-06-25,2022-09-07,27,1.185185
77,3923957,41.0,116.0,2021,2020-06-30,2021-09-14,15,2.733333
...,...,...,...,...,...,...,...,...
4,6963994,2.0,7.0,2022,2022-08-04,2022-09-07,1,2.000000
3,6969758,1.0,23.0,2022,2022-08-06,2022-09-07,1,1.000000
2,6976241,2.0,19.0,2022,2022-08-09,2022-09-07,1,2.000000
1,6990966,2.0,14.0,2022,2022-08-14,2022-09-07,1,2.000000


In [142]:
zenodo.drop(['created_at','starts_at','views'], axis=1,inplace=True)

In [143]:
zenodo21 = zenodo[zenodo.year=="2021"]
zenodo21.drop('year',axis=1,inplace=True)
zenodo21.columns = [col+'2021' if col != 'id' else col for col in zenodo21.columns]
zenodo22 = zenodo[zenodo.year=="2022"]
zenodo22.drop('year',axis=1,inplace=True)
zenodo22.columns = [col+'2022' if col != 'id' else col for col in zenodo22.columns]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [144]:
zenodo_wide = zenodo22.merge(zenodo21, how='left', on='id')

In [145]:
zenodo_wide.loc[zenodo_wide['id'] == 5259883,'lbl'] = "Geography and Agricultural Productivity"
zenodo_wide.loc[zenodo_wide['id'] == 4619197,'lbl'] = "Quasi-Experimental Shift-Share Research Designs"
zenodo_wide.loc[zenodo_wide['id'] == 4448256,'lbl'] = "Identifying Shocks via Time-Varying Volatility"
zenodo_wide.loc[zenodo_wide['id'] == 4773516,'lbl'] = "Skill-Biased Structural Change"
zenodo_wide.loc[zenodo_wide['id'] == 3997900,'lbl'] = "Trade and Domestic Production Networks"
zenodo_wide.loc[zenodo_wide['id'] == 5104830,'lbl'] = "Measuring the Incentive to Collude"

In [146]:
downloads_chart = go.Figure(data=go.Scatter(
        x=zenodo_wide['downloads_per_month2022'],
        y=zenodo_wide['downloads_per_month2021'],
        hovertemplate="%{x} : %{y}<extra></extra>",
        mode='markers'
    )
)

downloads_chart.update_layout(
    title={
            "text": f"Package Downloads by years",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    xaxis_title_text='Downloads in 2022',
    yaxis_title_text='Downloads in 2021',
    #FIXME: set up overlaying markers
)
downloads_chart.update_traces(marker_size=10,opacity=0.5)
downloads_chart.show()

In [148]:
downloads_per_month_chart = go.Figure(data = go.Histogram(
            x=zenodo_wide['downloads_per_month2022'],
            hovertemplate="%{y}<extra></extra>",
            xbins=dict(
                start=0,
                end=9,
                size=1
            ),
        )
    )
downloads_per_month_chart.update_layout(
    title={
            "text": f"Downloads per months in 2022",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    xaxis_title_text='Downloads per month in 2022',
)
downloads_per_month_chart.update_traces(opacity=0.3)
downloads_per_month_chart.show()