In [1]:
import pandas as pd
import spacy

In [2]:
# Load both tab-separated datasets; row 0 of each file supplies the column names.
rw_data = pd.read_csv("clean_acl_related_work_data.tsv", sep="\t", header=0)
example_data = pd.read_csv("clean_acl_rw_example_sentences.tsv", sep="\t", header=0)

# Size of the related-work frame (rows, columns).
rw_instance_count = rw_data.shape[0]
rw_column_size = rw_data.shape[1]
rw_shape_message = 'There are {:d} rows, {:d} columns in related work data'
print(rw_shape_message.format(rw_instance_count, rw_column_size))

# Size of the example-sentence frame (rows, columns).
ex_instance_count = example_data.shape[0]
ex_column_size = example_data.shape[1]
ex_shape_message = 'There are {:d} rows, {:d} columns in example data'
print(ex_shape_message.format(ex_instance_count, ex_column_size))

# Column listing for each frame, each preceded by its heading.
for heading, frame in (
    ("Related Work Data Columns", rw_data),
    ("Examples Data Columns", example_data),
):
    print(heading)
    print(frame.columns)

There are 5971 rows, 27 columns in related work data
There are 73139 rows, 47 columns in example data
Related Work Data Columns
Index(['acl_id', 'abstract', 'corpus_paper_id', 'pdf_hash', 'numcitedby',
       'url', 'publisher', 'address', 'year', 'month', 'booktitle', 'author',
       'title', 'clean_title', 'pages', 'doi', 'number', 'volume', 'journal',
       'editor', 'isbn', 'paragraph_xml', 'paragraph', 'cited_paper_marks',
       'cited_paper_titles', 'cited_papers_acl_ids', 'cited_papers_abstracts'],
      dtype='object')
Examples Data Columns
Index(['example_id', 'sentence', 'paragraph_xml', 'paragraph', 'citation_mark',
       'cited_acl_id', 'cited_abstract', 'cited_corpus_paper_id',
       'cited_pdf_hash', 'cited_numcitedby', 'cited_url', 'cited_publisher',
       'cited_address', 'cited_year', 'cited_month', 'cited_booktitle',
       'cited_author', 'cited_title', 'cited_pages', 'cited_doi',
       'cited_number', 'cited_volume', 'cited_journal', 'cited_editor',
       'c

In [3]:
# Citation statistics for the related-work data.
# Each "cited_paper_titles" entry is one string holding the titles cited by
# that row, joined with the " %%% " separator.
cited_paper_titles = rw_data["cited_paper_titles"]

citation_counts = {}     # cited title -> number of occurrences over all rows
pg_citation_counts = []  # number of cited titles per row, in row order

for papers in cited_paper_titles:
    titles = papers.split(" %%% ")
    for title in titles:
        # dict.get supplies 0 for a title seen for the first time. This replaces
        # a bare `except:` that also swallowed unrelated errors (including
        # KeyboardInterrupt) while "handling" the missing key.
        citation_counts[title] = citation_counts.get(title, 0) + 1

    pg_citation_counts.append(len(titles))

pg_citation_counts_df = pd.Series(pg_citation_counts)
citation_counts_df = pd.Series(citation_counts)

print("For related work data:")

print("\tTotal citation counts: {:d}".format(citation_counts_df.sum()))
print("\tCitation count per paragraph: {:f}".format(citation_counts_df.sum()/len(rw_data)))

print("\tNumber of unique citing papers: {:d}".format(rw_data["acl_id"].nunique()))
print("\tNumber of unique cited papers: {:d}".format(len(citation_counts_df)))
print("\tAvg. occurrence of a cited paper: {:f}".format(citation_counts_df.sum()/len(citation_counts_df)))

# Distribution table: how many rows cite exactly k papers (cell's displayed output).
pg_citation_counts_df.value_counts().rename_axis('Citation count').to_frame('How many times occur')


For related work data:
	Total citation counts: 12950
	Citation count per paragraph: 2.168816
	Number of unique citing papers: 4605
	Number of unique cited papers: 6620
	Avg. occurrence of a cited paper: 1.956193


Unnamed: 0_level_0,How many times occur
Citation count,Unnamed: 1_level_1
1,2729
2,1488
3,822
4,435
5,231
6,131
7,63
8,34
9,20
10,9


In [4]:
# For every cited title collected in `citation_counts`, count the rows of
# `example_data` whose "cited_clean_title" equals that title.
#
# A single value_counts() pass tallies all titles at once; the previous version
# built a full boolean mask over example_data once per title. Titles with no
# matching row get 0, exactly as the mask-and-len approach produced.
examples_per_title = example_data["cited_clean_title"].value_counts().to_dict()

# Keep the key order of `citation_counts` so the resulting Series is ordered as before.
example_counts = {
    title: examples_per_title.get(title, 0)
    for title in citation_counts
}

example_counts_df = pd.Series(example_counts)

# .unique() keeps a missing value as its own entry, so this count is left as
# len(unique()) rather than nunique() to keep the reported number unchanged.
print("Unique citing paper count: {:d}".format(len(example_data["citing_clean_title"].unique())))
print("Non-zero exampled cited paper count: {:d}".format((example_counts_df != 0).sum()))


# Distribution table: how many cited titles have exactly k example rows (cell's displayed output).
example_counts_df.value_counts().rename_axis('Example count').to_frame('How many times occur')

Unique citing paper count: 16338
Non-zero exampled cited paper count: 6594


Unnamed: 0_level_0,How many times occur
Example count,Unnamed: 1_level_1
1,882
2,757
3,669
4,506
5,448
...,...
101,1
200,1
143,1
149,1


In [5]:
# Length statistics: sentences and whitespace-separated words per related-work
# paragraph, and words per example sentence.
nlp = spacy.load("en_core_sci_sm")

sentence_count_in_pg = []  # sentences found by the spaCy pipeline, one entry per rw_data row
word_count_in_pg = []      # whitespace-token count, one entry per rw_data row

# Iterate the column directly: iterrows() would build a Series for every row
# only to read this single field.
for paragraph in rw_data["paragraph"]:
    sentence_count_in_pg.append(len(list(nlp(paragraph).sents)))
    word_count_in_pg.append(len(paragraph.split()))

# Whitespace-token count for every example sentence, in row order.
word_count_examples = [len(sentence.split()) for sentence in example_data["sentence"]]


print("\tSentence count per paragraph: {:f}".format(sum(sentence_count_in_pg)/len(rw_data)))
print("\tWord count per paragraph: {:f}".format(sum(word_count_in_pg)/len(rw_data)))
print("\tWord count per sentence: {:f}".format(sum(word_count_examples)/len(example_data)))

	Sentence count per paragraph: 4.220399
	Word count per paragraph: 98.666053
	Word count per sentence: 35.301166
