In [1]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

In [2]:
# Load the ICLR 2017 dataset from its JSON file (path is relative to the notebook's working directory).
df = pd.read_json(path_or_buf='datasets/iclr_2017.json')


In [7]:
# Collapse every paper's `recommendation` entry to its mean and store it as a column.
scores = [np.mean(recs) for recs in df['recommendation']]
df['scores'] = scores

# Accepted / rejected counts, reported BEFORE the quality filter below is applied.
print(len(df[df.accepted == True]))
print(len(df[df.accepted == False]))

# Set aside the papers whose mean score is below 4 and keep only the rest.
# `.copy()` detaches both results from the original frame: without it `df` is a
# slice, and re-running this cell (which assigns df['scores'] again) can raise
# pandas' SettingWithCopyWarning on versions without copy-on-write.
quality_drop = df[df['scores'] < 4].copy()
df = df[df['scores'] >= 4].copy()


172
215


In [8]:
# Partition the remaining papers by the value of their `accepted` flag.
accepted = df.loc[df['accepted'] == True]    # accepted papers
rejected = df.loc[df['accepted'] == False]   # rejected papers

In [9]:
def split_by_iqr(frame, column='citation', whisker=1.5):
    """Split ``frame`` into IQR inliers and outliers of ``column``.

    A row is an inlier when its value lies inside the closed interval
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]``; every other row is an outlier.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to split.
    column : str, default 'citation'
        Column the quartiles are computed on.
    whisker : float, default 1.5
        Multiple of the IQR added beyond each quartile.

    Returns
    -------
    tuple
        ``(q1, q3, iqr, inliers, outliers)``.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    # `between` is inclusive on both ends, matching `lower <= value <= upper`.
    within = frame[column].between(q1 - whisker * iqr, q3 + whisker * iqr)
    # Derive both halves from the same mask so the split is exact even when
    # index labels are not unique.
    return q1, q3, iqr, frame[within], frame[~within]


# Separate the citation outliers from the accepted papers with the IQR method ...
Q1_a, Q3_a, IQR_a, accepted_norm, accepted_outliers = split_by_iqr(accepted)

# ... and do the same for the rejected papers.
Q1_r, Q3_r, IQR_r, rejected_norm, rejected_outliers = split_by_iqr(rejected)


In [10]:
# Group sizes: IQR inliers on the first line, outliers on the second (accepted, then rejected).
print(*(len(group) for group in (accepted_norm, rejected_norm)))
print(*(len(group) for group in (accepted_outliers, rejected_outliers)))

152 184
20 31


In [11]:
# Mean of the `citation` column per group: inliers (accepted, rejected), then outliers (accepted, rejected).
citation_means = [
    group['citation'].mean()
    for group in (accepted_norm, rejected_norm, accepted_outliers, rejected_outliers)
]
print(*citation_means)

221.27631578947367 34.65760869565217 2159.1 947.4193548387096


In [37]:
# Bar plot of the SUMMED citations of the IQR inliers and of the outliers, for
# accepted and rejected papers. Each bar is labelled with the number of papers in its group.
fig = make_subplots(rows=1, cols=1)

group_labels = ['IQR Accepted', 'IQR Rejected', 'Outlier Accepted', 'Outlier Rejected ']
citation_sums = [accepted_norm['citation'].sum(), rejected_norm['citation'].sum(),
                 accepted_outliers['citation'].sum(), rejected_outliers['citation'].sum()]
bar_colors = ['#84c5fe', '#e32626', '#02273f', '#780448']

fig.add_trace(go.Bar(x=group_labels, y=citation_sums, marker=dict(color=bar_colors)),
              row=1, col=1)

# Text shown on each bar: how many papers the bar aggregates.
texts = [f'{len(accepted_norm)} Papers', f'{len(rejected_norm)} Papers',
         f'{len(accepted_outliers)} Papers', f'{len(rejected_outliers)} Papers']

# Outline the bars, make them semi-transparent, and print the labels in black above them.
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6,
                  texttemplate=texts, textposition="outside", textfont_color='black')

# Name the y axis after the plotted quantity.
fig.update_yaxes(title_text="Citation Sum", row=1, col=1)

# Light-gray plot background, no outer margins, fixed figure size.
fig.update_layout(plot_bgcolor='rgb(230, 230,230)',
                  margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
                  height=400, width=600)

# Export the figure as PDF, then render it inline.
pio.write_image(fig, 'figures/outliers.pdf', format='pdf')
fig.show()


## Accepted vs. rejected papers: counts, per-paper citations, and distributions

In [8]:
# Number of rows on each side of the `accepted` flag.
accept_no = int((df['accepted'] == True).sum())
reject_no = int((df['accepted'] == False).sum())

print('Accepted papers: ', accept_no)
print('Rejected papers: ', reject_no)

Accepted papers:  172
Rejected papers:  215


In [15]:
# Citation columns of the accepted and of the rejected papers.
cited_accept = df.loc[df.accepted == True, 'citation']
cited_reject = df.loc[df.accepted == False, 'citation']

# Average citations per paper = total citations / number of papers in the group.
len_cited_accept = len(cited_accept)
len_cited_reject = len(cited_reject)
per_cited_accept = sum(cited_accept) / len_cited_accept
per_cited_reject = sum(cited_reject) / len_cited_reject

print('Per paper citations of accepted papers: ', per_cited_accept)
print('Per paper  citations of rejected papers: ', per_cited_reject)


Per paper citations of accepted papers:  446.6046511627907
Per paper  citations of rejected papers:  166.26511627906976


In [14]:
# Same per-paper average, restricted to the IQR inliers.
# Note: this reassigns len_cited_accept / len_cited_reject to the inlier group sizes.
len_cited_accept, len_cited_reject = accepted_norm.shape[0], rejected_norm.shape[0]
normalized_cited_accept = sum(accepted_norm['citation']) / len_cited_accept
normalized_cited_reject = sum(rejected_norm['citation']) / len_cited_reject

print('Normalized per paper citations of accepted papers: ', normalized_cited_accept)
print('Normalized per paper citations of rejected papers: ', normalized_cited_reject)

Normalized per paper citations of accepted papers:  221.27631578947367
Normalized per paper citations of rejected papers:  34.65760869565217


In [17]:
# Two side-by-side violin panels for the IQR-filtered papers:
# citation counts on the left, recommendation scores on the right.
fig = make_subplots(rows=1, cols=2)

# Options shared by all four violins, plus one marker colour per decision.
violin_opts = dict(box_visible=True, meanline_visible=True, points='all')
accepted_marker = dict(color='#84c5fe')
rejected_marker = dict(color='#b94e4e')

# Score violins carry the legend entries.
trace1 = go.Violin(y=accepted_norm['scores'], name='IQR Accepted', marker=accepted_marker, **violin_opts)
trace2 = go.Violin(y=rejected_norm['scores'], name='IQR Rejected', marker=rejected_marker, **violin_opts)

# Citation violins use the same colours and are kept out of the legend.
trace5 = go.Violin(y=accepted_norm['citation'], marker=accepted_marker, showlegend=False, **violin_opts)
trace6 = go.Violin(y=rejected_norm['citation'], marker=rejected_marker, showlegend=False, **violin_opts)

# Left panel: citations; right panel: scores.
fig.add_trace(trace5, row=1, col=1)
fig.add_trace(trace6, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)
fig.add_trace(trace2, row=1, col=2)

# Hide the x tick labels on both panels and use the x-axis titles as panel captions.
fig.update_xaxes(showticklabels=False)
fig.update_xaxes(title_text="Number of Citations", row=1, col=1)
fig.update_xaxes(title_text="Recommendation Scores", row=1, col=2)

# Fixed y ranges for the citation panel and for the score panel.
fig.update_yaxes(range=[-150, 1400], row=1, col=1)
fig.update_yaxes(range=[3, 9], row=1, col=2)

# Figure size, no outer margins, light-gray plot background.
fig.update_layout(height=500, width=700,
                  margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
                  plot_bgcolor='rgb(230, 230,230)')

# The PDF export passes its own width/height, separate from the on-screen size set above.
pio.write_image(fig, 'figures/violin.pdf', format='pdf', width=900, height=400)

fig.show()

# Score vs Citation

In [23]:
# 2-D histograms of recommendation score (x) against citation count (y) for the
# IQR-filtered accepted (left) and rejected (right) papers.
fig = make_subplots(rows=1, cols=2, subplot_titles=("IQR Accepted", "IQR Rejected"))

# Both traces are bound to the shared layout colour axis, so the colour scale is
# defined once on the layout ('ice', below). A per-trace `colorscale` would be
# ignored and is therefore not passed.
trace1 = go.Histogram2d(
    x=accepted_norm.scores,
    y=accepted_norm.citation,
    coloraxis="coloraxis",
    nbinsx=8,
    nbinsy=15,
)

trace2 = go.Histogram2d(
    x=rejected_norm.scores,
    y=rejected_norm.citation,
    coloraxis="coloraxis",
    nbinsx=6,
    nbinsy=15,
)

# y ticks of the accepted panel: 0, 100, ..., 1000.
tick_values = [i*100 for i in range(11)]
tick_text = [str(i) for i in tick_values]

# y ticks of the rejected panel: 0, 20, ..., 200.
tick_values2 = [i*20 for i in range(11)]
tick_text2 = [str(i) for i in tick_values2]

# x ticks of the accepted panel: 4.5, 5.0, ..., 8.5.
x_values = [i*0.5 for i in range(9, 18)]
x_tick_text = [str(i) for i in x_values]

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)

# Axis titles (identical on both panels).
fig.update_xaxes(title_text="Recommendation Scores", row=1, col=1)
fig.update_xaxes(title_text="Recommendation Scores", row=1, col=2)
fig.update_yaxes(title_text="Number of Citations", row=1, col=1)
fig.update_yaxes(title_text="Number of Citations", row=1, col=2)

fig.update_layout(coloraxis=dict(colorscale='ice'), showlegend=False)
# Keep a small top margin: the subplot titles are drawn above the plot area and
# would be cut off with a zero top margin.
fig['layout'].update(margin={'l': 0, 'r': 0, 't': 30, 'b': 0})

# Explicit tick positions per panel.
fig.update_yaxes(tickvals=tick_values, ticktext=tick_text, row=1, col=1)
fig.update_xaxes(tickvals=x_values, ticktext=x_tick_text, row=1, col=1)
fig.update_yaxes(tickvals=tick_values2, ticktext=tick_text2, row=1, col=2)

# On-screen size; the PDF export below passes its own width/height.
fig['layout'].update(height=300, width=900)
fig.show()
pio.write_image(fig, 'figures/histogram.pdf', format='pdf', width=1200, height=400)

