# TODO
1. replicate all the other report graphs
    - graphs to create: 
        - revision.png [x]
        - time_at_editor.png [x]
        - main_issues.png [x]
        - download_histograms.png
        - downloads.png
2. create a dashboard containing all the graphs

In [1]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from datetime import datetime

## Revision.png & time_at_editor.png

In [81]:
analysis_df = pd.read_stata('data/git-events.dta')



One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.



In [60]:
analysis_df = analysis_df.sort_values(by=['MS', 'numeric_date'])
analysis_df['spell'] = (analysis_df['numeric_date']-analysis_df['numeric_date'].shift(1))/3600/24
analysis_df['matching'] = (analysis_df['MS'] == analysis_df['MS'].shift(1))
make_missing = analysis_df.index[analysis_df['matching']==False]
analysis_df.loc[make_missing, 'spell'] = np.NaN
analysis_df['at_editor'] = (analysis_df['branch_imputed'].shift(1) == analysis_df['branch_imputed']) | (analysis_df['branch_imputed'].shift(1) == 'author')
analysis_df['change'] = -(analysis_df['at_editor'].shift(1)==analysis_df['at_editor'])
analysis_df['spell_id'] = analysis_df.groupby('MS')['change'].transform(pd.Series.cumsum)
analysis_df['year'] = analysis_df['date'].apply(lambda x:x.split('-')[0])

In [58]:
collapsed = analysis_df[['MS', 'accepted_at', 'spell_id', 'at_editor', 'spell']].groupby(by=['MS', 'accepted_at', 'spell_id', 'at_editor']).sum().reset_index()
collapsed = collapsed[['MS', 'accepted_at', 'spell_id', 'at_editor', 'spell']]
collapsed = collapsed[collapsed['spell_id'] != 1]
collapsed['spell_id'] = np.floor((collapsed['spell_id']-1)/2)
collapsed['spell_id'] = collapsed['spell_id'].astype(int)


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [59]:
collapsed

Unnamed: 0,MS,accepted_at,spell_id,at_editor,spell
1,17262,1.637655e+09,0,True,20.385208
2,17262,1.637655e+09,1,False,37.604502
3,17262,1.637655e+09,1,True,0.007130
4,17262,1.637655e+09,2,False,10.851644
5,17262,1.637655e+09,2,True,0.003032
...,...,...,...,...,...
726,31561,1.679924e+09,1,True,0.008021
728,31601,1.693241e+09,0,True,23.143287
729,31601,1.693241e+09,1,False,37.988819
730,31601,1.693241e+09,1,True,0.003981


In [56]:
collapsed = analysis_df.groupby(by=['MS', 'accepted_at', 'spell_id', 'at_editor']).sum().reset_index()
collapsed = collapsed[['MS', 'accepted_at', 'spell_id', 'at_editor', 'spell']]
collapsed = collapsed[collapsed['spell_id'] != 1]
collapsed['spell_id'] = np.floor((collapsed['spell_id']-1)/2)
collapsed['spell_id'] = collapsed['spell_id'].astype(int)
# collapsed.loc[collapsed['spell_id']==-1,'spell_id'] = 0
# collapsed.drop_duplicates(['MS', 'spell_id'],inplace=True)
collapsed = collapsed.pivot(index=['MS', 'spell_id'], columns='at_editor', values=['spell', 'accepted_at']).reset_index()
collapsed.columns = ['MS', 'revision','time_at_author', 'time_at_editor', 'accepted_at0', 'accepted_at']
collapsed.drop('accepted_at0',axis=1,inplace=True)
collapsed['max_revision'] = collapsed.groupby('MS')['revision'].transform(pd.Series.max)


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [89]:
collapsed = pd.read_stata('temp/collapsed_accepted_at.dta')

In [91]:
time_at_editor_chart = go.Figure()
time_at_editor_chart.add_trace(go.Histogram(
        x=collapsed[collapsed['revision']==0]['time_at_editor'],
        hovertemplate="%{y}",
        name='First submission',
        )
    )
time_at_editor_chart.add_trace(go.Histogram(
        x=collapsed[collapsed['revision']>0]['time_at_editor'],
        hovertemplate="%{y}",
        name='Revision',
    )
)
time_at_editor_chart.update_layout(
    title={
            "text": f"Time at editor",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    barmode='overlay',
    xaxis_title_text='Days at editor',
    yaxis_title_text='Frequency',
)
time_at_editor_chart.update_traces(opacity=0.5)
time_at_editor_chart.show()

In [92]:
revisions_chart = go.Figure(data = go.Histogram(
            x=collapsed[collapsed['revision']==0]['max_revision'],
            hovertemplate="%{y}<extra></extra>"
        )
    )
revisions_chart.update_layout(
    title={
            "text": f"Number of revision round for accepted packages",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    barmode='overlay',
    xaxis_title_text='Accepted revison',
    yaxis_title_text='Frequency',
)
revisions_chart.update_traces(opacity=0.5)
revisions_chart.show()

## Number of revisions needed per year

In [20]:
collapsed_year = analysis_df.groupby(by=['MS', 'year', 'spell_id', 'at_editor']).sum().reset_index()
collapsed_year = collapsed_year[['MS', 'year', 'spell_id', 'at_editor', 'spell']]
collapsed_year = collapsed_year[collapsed_year['spell_id'] != 1]
collapsed_year['spell_id'] = np.floor((collapsed_year['spell_id']-1)/2)
collapsed_year['spell_id'] = collapsed_year['spell_id'].astype(int)
collapsed_year = collapsed_year.pivot(index=['MS', 'spell_id','year'], columns='at_editor', values=['spell']).reset_index()
collapsed_year.columns = ['MS', 'revision','year', 'time_at_author', 'time_at_editor']
collapsed_year['max_revision'] = collapsed_year.groupby('MS')['revision'].transform(pd.Series.max)


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [82]:
collapsed_year = pd.read_stata('temp/collapsed_year.dta')

In [86]:
collapsed_year = collapsed_year[['MS', 'year', 'max_revision']]
revisions = collapsed_year.groupby(['MS', 'year']).mean().reset_index()
revisions['max_revision'] = revisions['max_revision'].apply(lambda x:int(x))
revisions = revisions.groupby(['year','max_revision']).count().reset_index()
revisions.loc[revisions['year']=='2021','percent'] = revisions.loc[revisions['year']=='2021', 'MS']/37
revisions.loc[revisions['year']=='2022','percent'] = revisions.loc[revisions['year']=='2022', 'MS']/96
revisions.loc[revisions['year']=='2023','percent'] = revisions.loc[revisions['year']=='2023', 'MS']/64

In [88]:
revision_years_chart = go.Figure()
revision_years_chart.add_trace(go.Bar(
        x=revisions.loc[revisions['year']=='2021','max_revision'],
        y=revisions.loc[revisions['year']=='2021','percent'],
        hovertemplate="%{y:.1%}",
        name='2021',
        )
    )
revision_years_chart.add_trace(go.Bar(
        x=revisions.loc[revisions['year']=='2022','max_revision'],
        y=revisions.loc[revisions['year']=='2022','percent'],
        hovertemplate="%{y:.1%}",
        name='2022',
    )
)
revision_years_chart.add_trace(go.Bar(
        x=revisions.loc[revisions['year']=='2023','max_revision'],
        y=revisions.loc[revisions['year']=='2023','percent'],
        hovertemplate="%{y:.1%}",
        name='2023',
    )
)
revision_years_chart.update_layout(
    title={
            "text": f"Number of revisions needed in 2021-23",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    barmode='group',
    xaxis_title_text='Revisions needed',
    yaxis_title_text='Percentage',
)
revision_years_chart.update_traces(opacity=0.5)
revision_years_chart.show()

## main_issues.png

In [157]:
issues = pd.read_stata("data/issues.dta")
issues.drop_duplicates(['MS'],inplace=True)

In [158]:
issues

Unnamed: 0,MS,cite_data,DAS,confidential_data,save_output,relative_path,include_data,stata_packages,matlab_toolboxes,requirements
0,31799,0,0,0,0,0,0,0,0,1
1,26607,1,0,0,0,0,0,0,0,0
2,31601,1,1,0,0,0,0,0,0,1
3,32201,0,1,0,0,0,0,0,0,1
4,23287,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
267,23976,0,0,0,0,0,0,0,0,0
268,24500,0,0,0,0,0,0,0,0,0
269,23269,0,0,0,0,0,0,0,0,0
270,25632,0,0,0,0,0,0,0,0,0


In [159]:
ms = analysis_df[['MS']].drop_duplicates(['MS'])

In [160]:
ms

Unnamed: 0,MS
0,17262
8,22602
13,24015
18,24242
20,24355
...,...
1188,31360
1190,31409
1195,31561
1200,31601


In [163]:
issues = ms.merge(issues, how='left',on='MS')
issues.drop('MS',axis=1,inplace=True)

In [174]:
mean_issues = dict()
for col in issues.columns:
    mean_issues[col] = issues[col].mean()

mean_issues = pd.DataFrame(mean_issues,index=[0]).melt()
mean_issues.columns = ['issue', 'value']
mean_issues['value'] = mean_issues['value'] * 100 
mean_issues.loc[0,'issue'] = "Cite data"
mean_issues.loc[2,'issue'] = "Confidential data"
mean_issues.loc[3,'issue'] = "Save output"
mean_issues.loc[4,'issue'] = "Relative path"
mean_issues.loc[5,'issue'] = "Include data"
mean_issues.loc[6,'issue'] = "Stata packages"
mean_issues.loc[7,'issue'] = "Matlab toolboxes"
mean_issues.loc[8,'issue'] = "Requirements"
mean_issues = mean_issues.sort_values('value').reset_index(drop=True)
mean_issues.drop(0,inplace=True)
mean_issues.reset_index(drop=True,inplace=True)

In [175]:
mean_issues

Unnamed: 0,issue,value
0,Include data,3.592814
1,Matlab toolboxes,3.592814
2,Confidential data,6.586826
3,Requirements,8.982036
4,Stata packages,12.57485
5,Save output,23.952096
6,DAS,43.712575
7,Cite data,53.293413


In [176]:
main_issues_chart = go.Figure(data=go.Scatter(
        x=mean_issues['value'],
        y=mean_issues['issue'],
        hovertemplate="%{x:.1f}<extra></extra>",
        orientation='h',
        mode='markers'
    )
)
for i in range(mean_issues.shape[0]):
    main_issues_chart.add_shape(
        type='line',
        x0=0, y0=mean_issues.loc[i,'issue'], 
        x1=mean_issues.loc[i,'value'], y1=mean_issues.loc[i,'issue'], 
        line_color="#cccccc"
    )

main_issues_chart.update_layout(
    title={
            "text": f"Main issues during revision of packages",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    xaxis_title_text='Percent',
)
main_issues_chart.update_traces(marker_size=10)
main_issues_chart.show()

## Downloads.png & Downloads_histogram.png

In [107]:
zenodo23 = pd.read_csv('zenodo/zenodo_data.csv')
zenodo23['created_at'] = zenodo23['created'].apply(lambda x: datetime.strptime(x.split('T')[0],'%Y-%m-%d'))
zenodo23.drop(['downloads', 'views', 'revision', 'created', 'update_time'], axis=1, inplace=True)
zenodo23.columns = ['id', 'downloads', 'views', 'created_at']
zenodo23['year'] = '2023'
zenodo23.drop_duplicates(['id'], inplace=True)
creation_time23 = zenodo23[['id','created_at']]


In [105]:
zenodo22 = pd.read_csv('zenodo/zenodo_data_2022.csv')
zenodo22['created_at'] = zenodo22['created'].apply(lambda x: datetime.strptime(x.split('T')[0],'%Y-%m-%d'))
zenodo22.drop(['downloads', 'views', 'revision', 'created'], axis=1, inplace=True)
zenodo22.columns = ['id', 'downloads', 'views', 'created_at']
zenodo22['year'] = '2022'
creation_time22 = zenodo22[['id', 'created_at']]

In [119]:
zenodo21 = pd.read_csv('zenodo/zenodo_data_2021.csv')
zenodo21.drop(['downloads', 'views', 'revisions'], axis=1, inplace=True)
zenodo21.columns = ['id', 'downloads', 'views']
zenodo21['year'] = '2021'
zenodo21 = zenodo21.merge(creation_time22,how='left', on='id')
zenodo21 = zenodo21.merge(creation_time23,how='left', on='id')
zenodo21 = zenodo21[['id','downloads','views','year','created_at_x']]
zenodo21.columns = ['id','downloads','views','year','created_at']

In [120]:
zenodo21

Unnamed: 0,id,downloads,views,year,created_at
0,5338362,5.0,8.0,2021,2021-08-30
1,5338362,5.0,8.0,2021,2021-08-30
2,5235355,6.0,19.0,2021,2021-08-23
3,5235355,6.0,19.0,2021,2021-08-23
4,5221647,4.0,15.0,2021,2021-08-19
...,...,...,...,...,...
155,3923957,41.0,116.0,2021,2020-06-30
156,3908270,21.0,53.0,2021,2020-06-25
157,3908270,21.0,53.0,2021,2020-06-25
158,3366100,22.0,105.0,2021,2019-08-12


In [121]:
zenodo = pd.concat((zenodo21,zenodo22))
zenodo = pd.concat((zenodo, zenodo23))

In [123]:
zenodo['starts_at'] = "2021-09-14"
zenodo.loc[zenodo['year']=='2022','starts_at'] = "2022-09-07"
zenodo.loc[zenodo['year']=='2023','starts_at'] = "2023-09-07"
zenodo['starts_at'] = zenodo['starts_at'].apply(lambda x : datetime.strptime(x,'%Y-%m-%d'))

In [124]:
zenodo['since'] = 12 * (
    zenodo['starts_at'].dt.year-
    zenodo['created_at'].dt.year
    ) + (
    zenodo['starts_at'].dt.month-
    zenodo['created_at'].dt.month
    )
zenodo['downloads_per_month'] = zenodo['downloads']/zenodo['since']
zenodo.sort_values('id')

Unnamed: 0,id,downloads,views,year,created_at,starts_at,since,downloads_per_month
364,3366100,73.0,232.0,2023,2019-08-12,2023-09-07,49,1.489796
181,3366100,73.0,232.0,2023,2019-08-12,2023-09-07,49,1.489796
131,3366100,62.0,214.0,2022,2019-08-12,2022-09-07,37,1.675676
158,3366100,22.0,105.0,2021,2019-08-12,2021-09-14,25,0.880000
159,3366100,22.0,105.0,2021,2019-08-12,2021-09-14,25,0.880000
...,...,...,...,...,...,...,...,...
185,8192998,14.0,37.0,2023,2023-07-28,2023-09-07,2,7.000000
1,8212833,5.0,26.0,2023,2023-08-03,2023-09-07,1,5.000000
184,8212833,5.0,26.0,2023,2023-08-03,2023-09-07,1,5.000000
183,8226636,1.0,17.0,2023,2023-08-08,2023-09-07,1,1.000000


In [125]:
zenodo.drop(['created_at','starts_at','views'], axis=1,inplace=True)

In [126]:
zenodo21 = zenodo[zenodo.year=="2021"]
zenodo21.drop('year',axis=1,inplace=True)
zenodo21.columns = [col+'2021' if col != 'id' else col for col in zenodo21.columns]
zenodo22 = zenodo[zenodo.year=="2022"]
zenodo22.drop('year',axis=1,inplace=True)
zenodo22.columns = [col+'2022' if col != 'id' else col for col in zenodo22.columns]
zenodo23 = zenodo[zenodo.year=="2023"]
zenodo23.drop('year',axis=1,inplace=True)
zenodo23.columns = [col+'2023' if col != 'id' else col for col in zenodo23.columns]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [127]:
zenodo_wide = zenodo22.merge(zenodo21, how='left', on='id')
zenodo_wide = zenodo_wide.merge(zenodo23,how='left', on='id')

In [145]:
zenodo_wide.loc[zenodo_wide['id'] == 5259883,'lbl'] = "Geography and Agricultural Productivity"
zenodo_wide.loc[zenodo_wide['id'] == 4619197,'lbl'] = "Quasi-Experimental Shift-Share Research Designs"
zenodo_wide.loc[zenodo_wide['id'] == 4448256,'lbl'] = "Identifying Shocks via Time-Varying Volatility"
zenodo_wide.loc[zenodo_wide['id'] == 4773516,'lbl'] = "Skill-Biased Structural Change"
zenodo_wide.loc[zenodo_wide['id'] == 3997900,'lbl'] = "Trade and Domestic Production Networks"
zenodo_wide.loc[zenodo_wide['id'] == 5104830,'lbl'] = "Measuring the Incentive to Collude"

In [177]:
zenodo_stata = pd.read_stata('temp/zenodo.dta')

In [182]:
len(zenodo_stata['zenodoid'].unique())

184

In [183]:
downloads_chart = go.Figure(data=go.Scatter(
        x=zenodo_stata['downloads_per_month2022'],
        y=zenodo_stata['downloads_per_month2021'],
        hovertemplate="2021-2022<extra></extra>",
        mode='markers',
        name='2021-2022'
    )
)

downloads_chart.add_trace(go.Scatter(
        x=zenodo_stata['downloads_per_month2023'],
        y=zenodo_stata['downloads_per_month2022'],
        hovertemplate="2022-2023<extra></extra>",
        mode='markers',
        name='2021-2022'
    )
)

downloads_chart.add_trace(go.Scatter(
        x=zenodo_stata['downloads_per_month2023'],
        y=zenodo_stata['downloads_per_month2021'],
        hovertemplate="2021-2023<extra></extra>",
        mode='markers',
        name='2021-2023'
    )
)

downloads_chart.update_layout(
    title={
            "text": f"Package Downloads by years",
            },
    font = dict(
            size = 14
            ),
    showlegend = False,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    xaxis_title_text='Downloads',
    yaxis_title_text='Downloads',
)
downloads_chart.update_traces(marker_size=10,opacity=0.5)
downloads_chart.show()

In [184]:
downloads_per_month_chart = go.Figure(data = go.Histogram(
            x=zenodo_stata['downloads_per_month2022'],
            hovertemplate="%{x} : %{y}<extra></extra>",
            xbins=dict(
                start=0,
                end=9,
                size=1
            ),
            name='2022'
        )
    )
downloads_per_month_chart.add_trace(go.Histogram(
            x=zenodo_stata['downloads_per_month2023'],
            hovertemplate="%{x} : %{y}<extra></extra>",
            xbins=dict(
                start=0,
                end=9,
                size=1
            ),
            name='2023'
        )
    )
downloads_per_month_chart.update_layout(
    title={
            "text": f"Downloads per months in 2022 & 2023",
            },
    font = dict(
            size = 14
            ),
    showlegend = True,
    hoverlabel = dict(
            font_size = 14,
            font_family = "Rockwell"
        ),
    xaxis_title_text='Downloads per month in 2022',
    barmode='overlay'
)
downloads_per_month_chart.update_traces(opacity=0.3)
downloads_per_month_chart.show()

# Top 5 tables

In [186]:
collapsed_year = pd.read_stata('temp/collapsed_year.dta')
time_tables = collapsed_year.groupby('MS').sum()[['time_at_author','time_at_editor']]
time_tables = time_tables.merge(collapsed_year[['MS','year']], how='left', on='MS')
time_tables = time_tables.drop_duplicates(['MS','year'])
time_tables['total_time'] = time_tables['time_at_author'] + time_tables['time_at_editor']
time_tables = time_tables[time_tables['year']=='2022']
time_tables['total_time'] = time_tables['total_time'].apply(lambda x: round(x))
time_tables['time_at_author'] = time_tables['time_at_author'].apply(lambda x: round(x))
time_tables['time_at_editor'] = time_tables['time_at_editor'].apply(lambda x: round(x))


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [187]:
total_tab = time_tables.sort_values(
                    'total_time',ascending=False
                ).drop(
                    'year',axis=1
                ).reset_index(
                    drop=True
                ).head()

author_tab = time_tables.sort_values(
                    'time_at_author',ascending=False
                ).drop(
                    'year',axis=1
                ).reset_index(
                    drop=True
                ).head()

editor_tab = time_tables.sort_values(
                    'time_at_editor',ascending=False
                ).drop(
                    'year',axis=1
                ).reset_index(
                    drop=True
                ).head()

In [188]:
odd_row='white'
even_row='lightgrey'
totals_table = go.Figure()
totals_table.add_trace(
    go.Table(
        header = dict(
                    values=list(total_tab.columns),
                    line_color='darkslategray',
                    fill_color='royalblue',
                    align=['left','center'],
                    font=dict(color='white', size=12),
                    height=40
                ),
        cells = dict(
                    values = [total_tab.loc[:,col] for col in total_tab.columns],
                    line_color='darkslategray',
                    fill_color = [[odd_row,even_row]*40],
                    align=['left', 'center'],
                    font_size=12,
                    height=30
                )
    )
)
totals_table.show()

In [189]:
odd_row='white'
even_row='lightgrey'
authors_table = go.Figure()
authors_table.add_trace(
    go.Table(
        header = dict(
                    values=list(author_tab.columns),
                    line_color='darkslategray',
                    fill_color='royalblue',
                    align=['left','center'],
                    font=dict(color='white', size=12),
                    height=40
                ),
        cells = dict(
                    values = [author_tab.loc[:,col] for col in author_tab.columns],
                    line_color='darkslategray',
                    fill_color = [[odd_row,even_row]*40],
                    align=['left', 'center'],
                    font_size=12,
                    height=30
                )
    )
)
authors_table.show()

In [190]:
odd_row='white'
even_row='lightgrey'
editors_table = go.Figure()
editors_table.add_trace(
    go.Table(
        header = dict(
                    values=list(editor_tab.columns),
                    line_color='darkslategray',
                    fill_color='royalblue',
                    align=['left','center'],
                    font=dict(color='white', size=12),
                    height=40
                ),
        cells = dict(
                    values = [editor_tab.loc[:,col] for col in editor_tab.columns],
                    line_color='darkslategray',
                    fill_color = [[odd_row,even_row]*40],
                    align=['left', 'center'],
                    font_size=12,
                    height=30
                )
    )
)
editors_table.show()

# Asserting dta-s and pandas frames are matching

In [2]:
import pandas as pd

In [3]:
# 1st run of csv_to_dta
git_data1 = pd.read_stata('data/git-events.dta')
issues_data1 = pd.read_stata('data/issues.dta')

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  git_data1 = pd.read_stata('data/git-events.dta')


In [5]:
# 2nd  run of csv_to_dta
git_data2 = pd.read_stata('data/git-events.dta')
issues_data2 = pd.read_stata('data/issues.dta')

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  git_data2 = pd.read_stata('data/git-events.dta')


In [8]:
assert git_data1.equals(git_data2)
assert issues_data1.equals(issues_data2)

In [31]:
test_durations = pd.read_stata('temp/test_durations.dta')
test_durations['max_revision'] = test_durations['max_revision'].apply(lambda x: int(x))

In [34]:
test_durations.to_csv('temp/test.csv')

In [52]:
collapsed.to_csv('temp/collapsed.csv')

In [48]:
collapsed

Unnamed: 0,MS,revision,time_at_author,time_at_editor,accepted_at,max_revision
0,17262,1,37.604502,20.385208,1.637655e+09,3
1,17262,2,10.851644,0.007130,1.637655e+09,3
2,17262,3,,0.003032,1.637655e+09,3
3,22602,1,5.060718,2.405174,1.672170e+09,2
4,22602,2,,0.001852,1.672170e+09,2
...,...,...,...,...,...,...
351,31561,1,35.252407,19.678229,1.679924e+09,2
352,31561,2,,0.008021,1.679924e+09,2
353,31601,1,37.988819,23.143287,1.693241e+09,2
354,31601,2,,0.003981,1.693241e+09,2


In [33]:
assert test_durations.equals(collapsed)

AssertionError: 