## Import Libraries
In order to generate a bag of words representation we need the following libraries

In [1]:
import numpy as np
import json

## Import Data


In [2]:
# data_set = np.array([ [x['venue'], x['citations']] for x in json.load(open('train-1.json'))[:3] if x['venue']])
data_set = np.array([ [x['venue'], x['citations']] for x in json.load(open('train-1.json')) if x['venue']])

## Proportion analysis of journals/venues
Aim of the code is to divide the data in n equal portions, where the 'lowest' division accounts for the group of X that contributes the least to a given Y.

In [3]:
def divide_data_in_portions(XY, proportion=4):
    #split data
    X, _ = np.split(XY, 2, 1)

    # Retrieve Unique variables
    unique_X = np.unique(X,return_counts=True)

    a = np.zeros((3,len(unique_X[0])))
    
    # Merge arrays together
    merge = [unique_X[0], unique_X[1].astype(int), a[0].astype(int), a[1].astype(int), a[2].astype(float)]
    
    # Compute citations per venue
    for x in XY :
        merge[2][np.where(merge[0] == x[0])[0][0]] += int(x[1])

    # Compute citations per topic divided by the amount of articles
    for (i, j) in enumerate(merge[3]):
        merge[3][i] = (merge[2][i]/merge[1][i])

    # Compute new summed citations
    summed_citations = np.sum(merge[3])

    result = []
    # Compute percentage of new total sum
    for (i, j) in enumerate(merge[3]):
        merge[4][i] = 100/summed_citations*merge[3][i]
        result.append((merge[0][i],merge[1][i],merge[2][i],merge[3][i],merge[4][i]))

    return result
    

division = divide_data_in_portions(data_set)

## Plot the line
In order to make the data more acessible, the data can be plotted

In [7]:
%pylab inline --no-import-all 

division = divide_data_in_portions(data_set)
dtype = [('field_of_study', 'S10'), ('count', int), ('summed_citations', int), ('average_citations', float), ('contribution', float)]

# create a structured array
structured_array = np.array(division, dtype=dtype)
sorted = np.sort(structured_array, order='contribution')[::-1]

print(sorted)


Populating the interactive namespace from numpy and matplotlib
[(b'SSST@EMNLP',    8,   3659, 457., 8.17092795)
 (b'EMNLP 2017',    2,    387, 193., 3.450742  )
 (b'IJCAI',    1,    169, 169., 3.02163419)
 (b'EMNLP 2019',    1,    144, 144., 2.57464688)
 (b'ALW@ACL',    7,    803, 114., 2.03826211)
 (b'CL',  106,  11873, 112., 2.00250313)
 (b'BlackboxNL',   22,   2445, 111., 1.98462364)
 (b'NMT@ACL',    9,    977, 108., 1.93098516)
 (b'NAACL-HLT',    6,    636, 106., 1.89522618)
 (b'SIGHAN',    7,    703, 100., 1.78794922)
 (b'SEMITIC@AC',    5,    472,  94., 1.68067227)
 (b'HLT',    4,    368,  92., 1.64491328)
 (b'TACL',   44,   3992,  90., 1.6091543 )
 (b'SemEval@CO',    1,     83,  83., 1.48399785)
 (b'WaC@EACL',    1,     77,  77., 1.3767209 )
 (b'SENSEVAL@A',    1,     75,  75., 1.34096192)
 (b'Computatio',  113,   8414,  74., 1.32308242)
 (b'ACL', 1947, 141182,  72., 1.28732344)
 (b'ANLP',  105,   7234,  68., 1.21580547)
 (b'ACL 2002',   13,    894,  68., 1.21580547)
 (b'NAACL 2