# About this Notebook
This is a first run through the competition to try to understand the dataset and get a feel for the problem at hand.

In [None]:
# Aesthetics
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Basic
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import json
import os
import random
from tqdm.autonotebook import tqdm
import string
import re
from functools import partial

# Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")
from wordcloud import WordCloud, STOPWORDS

# NLP
import spacy
nlp = spacy.load('en_core_web_lg') # , disable=['parser', 'ner'])
nlp.max_length = 40000000
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Data Description

train.csv - labels and metadata for the training set
train/test directory - the full text of the training/test set's publications in JSON format, broken into sections with section titles
* `id` - publication id - note that there are multiple rows for some training documents, indicating multiple mentioned datasets.
* `pub_title` - title of the publication (a small number of publications have the same title).
* `dataset_title` - the title of the dataset that is mentioned within the publication.
* `dataset_label` - a portion of the text that indicates the dataset.
* `cleaned_label` - the dataset_label, as passed through the clean_text function from the [Evaluation page](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation).

sample_submission.csv - a sample submission file in the correct format.
* `Id` - publication id.
* `PredictionString` - To be filled with equivalent of `cleaned_label` of train data.

In [None]:
RANDOM_SEED = 42

In [None]:
def seed_everything(seed=RANDOM_SEED):
    '''
    Seeds the random number generators used in this notebook.
    seed - Integer seed, defaults to RANDOM_SEED
    '''
    # The three settings are independent of each other.
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): this only sets the environment variable; confirm whether it
    # has any effect on string hashing in the kernel that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
seed_everything()

In [None]:
# Load the spaCy model again and rebind the global name `nlp`; the cells below
# that reference `nlp` use this pipeline.
# NOTE(review): the import cell already loaded the same model, raised
# `nlp.max_length` to 40000000 and added a `sentencizer`; those settings were
# made on the previous object and are not re-applied here - confirm that this
# is intended.
nlp = spacy.load("en_core_web_lg")
# Merge noun phrases and entities for easier analysis
nlp.add_pipe(nlp.create_pipe('merge_entities'))
nlp.add_pipe(nlp.create_pipe('merge_noun_chunks'))

def find_conjunct_noun_chunks(text):
    '''
    Groups noun chunks that are coordinated with each other.
    text - Text to be parsed with the global `nlp` pipeline
    Returns a set of tuples; each tuple holds the sorted texts of one noun
    chunk together with its conjuncts. Chunks without conjuncts are skipped.
    '''
    parsed = nlp(text)
    return {
        tuple(sorted([np_chunk.text, *(conj.text for conj in np_chunk.conjuncts)]))
        for np_chunk in list(parsed.noun_chunks)
        if len(np_chunk.conjuncts) > 0
    }

sample_text = "A number of longitudinal epidemiologic studies, including the Baltimore Longitudinal Study of Aging, the New Mexico Aging Process Study, and the Massachusetts Male Aging Study, have demonstrated age-related increases in the likelihood of developing hypogonadism."
find_conjunct_noun_chunks(sample_text)

In [None]:
sample_text = "The index comprises two categories, respectively cognitive skill (the latest test results from the Progress in International Reading Literacy Study, PIRLS; the Trends in International Mathematics and Science Study, TIMSS; the Programme for International Student Assessment, PISA; the initial output from the Programme for the International Assessment of Adult Competencies, PIAAC) and educational attainment (the latest literacy rate and graduation rates at the upper secondary and tertiary level)."
find_conjunct_noun_chunks(sample_text)

In [None]:
def find_appos_groups(text):
    '''
    Collects pairs of noun chunks that stand in apposition.
    text - Text to be parsed with the global `nlp` pipeline
    Returns a set of 2-tuples; each tuple holds the sorted texts of a noun
    chunk whose root has the dependency label "appos" and of that root's head.
    '''
    parsed = nlp(text)
    pairs = set()
    for np_chunk in parsed.noun_chunks:
        root = np_chunk.root
        if root.dep_ != "appos":
            continue
        pairs.add(tuple(sorted((np_chunk.text, root.head.text))))
    return pairs

sample_text = "The index comprises two categories, respectively cognitive skill (the latest test results from the Progress in International Reading Literacy Study, PIRLS; the Trends in International Mathematics and Science Study, TIMSS; the Programme for International Student Assessment, PISA; the initial output from the Programme for the International Assessment of Adult Competencies, PIAAC) and educational attainment (the latest literacy rate and graduation rates at the upper secondary and tertiary level)."
find_appos_groups(sample_text)

In [None]:
#sample_text = "A number of longitudinal epidemiologic studies, including the Baltimore Longitudinal Study of Aging, the New Mexico Aging Process Study, and the Massachusetts Male Aging Study, have demonstrated age-related increases in the likelihood of developing hypogonadism."
from pprint import pprint

# Parse the current sample sentence and list its noun chunks.
doc = nlp(sample_text)
chunks = list(doc.noun_chunks)
print(sample_text)
pprint(chunks)

conjunct_groups = set()
appos_groups = set()

for chunk in chunks:
    root = chunk.root
    head = root.head
    # Dump everything that is known about the chunk and its place in the parse.
    print(type(root), root.i)
    print(type(head), head.i)
    print(chunk.text, list(chunk.noun_chunks), chunk.start, chunk.end, root.text, root.dep_,
          head.text, chunk.conjuncts)

    # Apposition: pair the chunk with the head of its root token.
    if root.dep_ == "appos":
        appos_group = tuple(sorted([chunk.text, head.text]))
        print(appos_group)
        appos_groups.add(appos_group)

    # Coordination: the chunk plus its conjuncts, kept only if there is at least one conjunct.
    members = [chunk.text, *(conj.text for conj in chunk.conjuncts)]
    if len(members) > 1:
        conjunct_groups.add(tuple(sorted(members)))

print(conjunct_groups)
print(appos_groups)

In [None]:
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
pprint(ents)

In [None]:
from spacy import displacy

doc = nlp(sample_text)
displacy.render(doc, style="dep")

In [None]:
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

Let's have a look at the training data csv file...

In [None]:
train_df.head(10)

Let's get the text data from json files and append them to the table.

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text', keep_list=False):
    '''
    Reads one publication's JSON file and returns its section titles and/or texts.
    filename - Publication id, i.e. the file name without the '.json' extension
    train_files_path - Directory that contains the JSON files
    output - 'text' returns the section texts, 'head' returns the section titles,
             any other value returns titles and texts interleaved
    keep_list - If True a list with one item per section is returned,
                otherwise the items are joined into a single string
    '''
    json_path = os.path.join(train_files_path, filename + '.json')
    # Explicit encoding so that the result does not depend on the platform's
    # default text encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    headings = []
    contents = []
    combined = []
    for section in sections:
        # A missing or null field becomes '' so that str.join below (and string
        # methods applied by callers to the list items) cannot fail on None.
        title = section.get('section_title') or ''
        body = section.get('text') or ''
        headings.append(title)
        contents.append(body)
        combined.extend((title, body))

    # Only the requested view is joined; the separators are the same as before.
    if output == 'text':
        selected, separator = contents, ' '
    elif output == 'head':
        selected, separator = headings, ' '
    else:
        selected, separator = combined, '\n\n '

    return selected if keep_list else separator.join(selected)

In [None]:
tqdm.pandas()
# One list item per section of the publication.
train_df['text_list'] = train_df['Id'].progress_apply(lambda fn: read_append_return(fn, keep_list=True))
# read_append_return(..., keep_list=False) joins exactly these section texts
# with a single space, so build the flat text from the list instead of reading
# and parsing every JSON file a second time.
train_df['text'] = train_df['text_list'].progress_apply(' '.join)

In [None]:
train_df.head(10)

In [None]:
def find_section_index_of_dataset(text_list, dataset_label):
    '''
    Finds the first section that mentions the dataset label (case-insensitive).
    text_list - List of section texts
    dataset_label - Label to look for
    Returns the index of the first matching section, or -10 if no section matches.
    '''
    hits = (
        position
        for position, section in enumerate(text_list)
        if dataset_label.lower() in section.lower()
    )
    # -10 is the "not found" marker.
    return next(hits, -10)

# find_section_index_of_dataset(["abc", "def", "123"], "ef")

section_indexes = train_df.progress_apply(lambda r: find_section_index_of_dataset(r['text_list'], r['dataset_label']), axis=1)

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded on every SciPy version.
# (numpy and matplotlib.pyplot are already imported in the first cell.)
import scipy.stats

# -10 is the value find_section_index_of_dataset returns when the label was
# not found in any section, so it shows up as a separate bar on the left.
print(scipy.stats.describe(section_indexes))
fig, ax = plt.subplots()
ax.hist(section_indexes, bins=100)
ax.set(
    title='Section in which the dataset label first appears',
    xlabel='Section index (-10 = label not found)',
    ylabel='Number of training rows',
);

In [None]:
print(sample_sub.shape)
sample_sub.head()

In [None]:
import glob
glob.glob(test_files_path + "/*")

In [None]:
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_files_path))

In [None]:
sample_sub.head()

In [None]:
r = sample_sub.iloc[2]
print(r.text)

Let's save the data now in case we need it later while creating the model.

In [None]:
train_df.to_csv('train_papers.csv')

# inspect examples

In [None]:
print(train_df.shape)
train_df.head()

In [None]:
print(train_df['pub_title'].nunique())  # number of papers: 14271

In [None]:
print(train_df['dataset_title'].nunique())  # there are only 45
train_df['dataset_title'].value_counts()

In [None]:
import html
import re

from IPython.display import HTML, display

r = train_df.iloc[1000]

dataset_label = r.dataset_label
text = r.text

print(r)

# Split on the label, ignoring case. Because the pattern has one capturing
# group, re.split returns non-matching and matching pieces alternately, with
# the matches at the odd positions.
pieces = re.split(f"({re.escape(dataset_label)})", text, flags=re.IGNORECASE)

# Escape every piece before it goes into the HTML so that characters such as
# '<' or '&' in the paper text are shown literally instead of being
# interpreted as markup; only the matches are wrapped in the highlight tag.
html_str = ''.join(
    f"<b style='color:navy'>{html.escape(piece)}</b>" if position % 2 else html.escape(piece)
    for position, piece in enumerate(pieces)
)
html_str = html_str.replace("\n", "<br>")

display(HTML(html_str))

# training examples

This study used data from the **National Education Longitudinal Study** (NELS:88) to examine the effects of dual enrollment programs for high school students on college degree attainment. 

Using the nationally representative, longitudinal **National Education Longitudinal Study** of 1988 (NELS-88) data set, a logistic regression model was used to examine the extent to which outcome variables were differentially associated with gender for students participating in special education. 


This responsiveness allowed phase alignment with tidal predictions for a **NOAA tidal station** (New-

(A slightly different mention found in another section)
All other cages were removed after 3 d of exposure, during which they were submerged approxiniately 34% of the time based on NOAA (1996) tidal predictions. 

(A model name?)
With P-Surge, thousands of **SLOSH model** runs are made, forced by hurricane model input parameters from normal distributions centered on the current NHC official forecast,


Data used in the preparation of this article were obtained from the **Alzheimers Disease Neuroimaging Initiative** (ADNI) database (adni.loni.usc.edu). 

(Pattern where the label includes the abbreviation)
We examined the relation between PMI and structural integrity of Purkinje cells in autopsy cases with accurate PMI documented from the **Baltimore Longitudinal Study of Aging (BLSA)**. 

(Pattern where the letter case does not match the label)
Here, we choose locations corresponding to four **NOAA tide gauge** stations near each study site, Stations 8452660, 8531680, 8534720, and 8638863 for Narragansett Bay, Jamaica Bay, Atlantic City, and Norfolk, respectively (Table 2) .

School characteristics used to compare CFST schools to other NC public schools at program inception come from the National Center for Education Statistics Common Core of Data Public School Universe (NCES-CCD) and the North Carolina Department of Public Instruction's School Report Card compiled by the North Carolina Education Research Data Center (NCERDC).

In [None]:
query = "NOAA Tide Gauge"
rows = train_df[train_df.dataset_label == query]

rows.head(50)

In [None]:
def clean_text(text):
    '''
    Lower-cases the text; nothing else is changed.
    text - String to be normalised
    NOTE: a later cell of this notebook defines `clean_text` again (that version
    also replaces non-alphanumeric characters); once that cell has run, the
    name refers to the later definition.
    '''
    lowered = text.lower()
    return lowered
def find_sents(text, query):
    '''
    Finds the sentences of a text that contain the query.
    text - Text to be searched; it is passed through clean_text and then split
           into sentences by the global `nlp` pipeline
    query - String to look for; it is passed through clean_text as well
    Returns the list of matching sentences (as produced by `nlp(...).sents`).
    '''
    needle = clean_text(query)
    haystack = clean_text(text)
    return [sentence for sentence in nlp(haystack).sents if needle in sentence.text]

# inspect test data

In [None]:
print(sample_sub.shape)
sample_sub.head()

In [None]:
sample_sub[sample_sub.text.str.lower().str.contains("test")]

In [None]:
import html
import re

from IPython.display import HTML, display

r = sample_sub.iloc[0]

text = r.text

print(r)

dataset_label = "covid"

# Split on the search term, ignoring case. Because the pattern has one
# capturing group, re.split returns non-matching and matching pieces
# alternately, with the matches at the odd positions.
pieces = re.split(f"({re.escape(dataset_label)})", text, flags=re.IGNORECASE)

# Escape every piece before it goes into the HTML so that characters such as
# '<' or '&' in the paper text are shown literally instead of being
# interpreted as markup; only the matches are wrapped in the highlight tag.
html_str = ''.join(
    f"<b style='color:navy'>{html.escape(piece)}</b>" if position % 2 else html.escape(piece)
    for position, piece in enumerate(pieces)
)
html_str = html_str.replace("\n", "<br>")

display(HTML(html_str))

# Data Cleaning

In [None]:
def text_cleaning(text):
    '''
    Converts all text to lower case, removes special characters, emojis and multiple spaces
    text - Sentence that needs to be cleaned (non-string values are converted with str())

    Punctuation characters are deleted rather than replaced by a space
    ("NELS:88" becomes "nels88"); every other run of characters outside
    A-Z, a-z, 0-9 becomes a single space. Leading/trailing spaces are removed.
    '''
    # Coerce first: the original only called str() after the value had already
    # been iterated character by character, so non-string input failed.
    text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # After this substitution only a-z, 0-9 and single spaces remain, so no
    # separate pass for emojis or repeated spaces is needed.
    text = re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()
    return text

In [None]:
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

In [None]:
# tqdm.pandas()
# sample_sub['text'] = sample_sub['text'].progress_apply(text_cleaning)

## Generating Word cloud

In [None]:
# Fixed random_state: the same 30% of the rows is drawn whenever this cell is
# re-run, independent of how often the global NumPy generator was used before.
text = ' '.join(train_df['text'].sample(frac=0.3, random_state=RANDOM_SEED))
wordcloud = WordCloud(background_color='white', stopwords=STOPWORDS, width=2560, height=1440).generate(text)

barplot_dim = (15, 15)
# plt.subplots returns a (figure, axes) pair; unpack it so that `ax` really is
# the axes and draw on it explicitly.
fig, ax = plt.subplots(figsize=barplot_dim, facecolor='w')
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

As we can see, 'et al' is mentioned quite significantly in the text of the papers, which is in fact related to citing other papers. This should be a significant factor in determining the citation titles. Let's hope so...

# Preparing text

In [None]:
def prepare_text(text, nlp=nlp):
    '''
    Drops the stop words of a text and replaces the remaining tokens by their lemmas.
    text - Sentence to be processed
    nlp - Spacy NLP model
    Returns the lemmas joined by single spaces.
    '''
    kept_lemmas = (tok.lemma_ for tok in nlp(text) if not tok.is_stop)
    return ' '.join(kept_lemmas)

In [None]:
# tqdm.pandas()
# train_df['text'] = train_df['text'].progress_apply(prepare_text)

# Model
This is a very naive model based on the assumption that texts containing a known label or dataset_title in their content are most probably citing the same sources.

In [None]:
def clean_text(txt):
    '''
    Lower-cases the value and replaces every run of characters outside
    A-Z, a-z, 0-9 by a single space; leading/trailing spaces are removed.
    txt - Value to be cleaned (converted with str() first)
    '''
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Every string by which a dataset is referred to in the training data, lower-cased.
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
# Idea below of also using the 'dataset_title' is borrowed from
# https://www.kaggle.com/josephassaker/coleridge-initiative-eda-na-ve-submission
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)
id_list = []
lables_list = []
for index, row in tqdm(sample_sub.iterrows(), total=len(sample_sub)):
    sample_text = row['text']
    row_id = row['Id']
    # Labels of the training rows whose (already cleaned) text is identical to
    # the cleaned text of this publication.
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]
    cleaned_labels = temp_df['cleaned_label'].to_list()
    # Lower-case the publication once instead of once per known label.
    sample_text_lower = sample_text.lower()
    for known_label in existing_labels:
        if known_label in sample_text_lower:
            cleaned_labels.append(clean_text(known_label))
    # De-duplicate, then sort: joining a bare set would let the order of the
    # labels inside PredictionString differ from run to run.
    cleaned_labels = sorted({clean_text(x) for x in cleaned_labels})
    lables_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)

In [None]:
# One row per test publication: its id and the '|'-separated predicted labels.
submission = pd.DataFrame({
    'Id': id_list,
    'PredictionString': lables_list,
})

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)

This notebook is a work in progress... This is just a first pass through the data to see what is the situation, along with a very Naive model. Even with that, trust me this used to have LB = 1.0 at some point in time. 😆😛  

**If you found this notebook useful and use parts of it in your work, please don't forget to show your appreciation by upvoting this kernel. That keeps me motivated and inspires me to write and share these public kernels.** 😊