# About
I added a *Naive CV* (**Local Validation**) so that you can track your improvement.

## References
Please check them out.
- [@prashansdixit](https://www.kaggle.com/prashansdixit)
 - [📝Coleridge Initiative-EDA📚 & Baseline Model🎯](https://www.kaggle.com/prashansdixit/coleridge-initiative-eda-baseline-model)
- [@mghfarahani](https://www.kaggle.com/mghfarahani)
 - [Coleridge Initiative - Analysis](https://www.kaggle.com/mghfarahani/coleridge-initiative-analysis)
- [@mlconsult](https://www.kaggle.com/mlconsult)
 - [score 57ish with additional govt datasets](https://www.kaggle.com/mlconsult/score-57ish-with-additional-govt-datasets)
 
## Process
- What
 - The objective of the competition is to identify the mention of datasets within scientific publications.
- How
 - By literally extracting the text and comparing it with the labels we collected (baseline).

# Setting

## Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os, re, json, glob
from collections import defaultdict
from textblob import TextBlob
from functools import partial

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
nlp.max_length = 4_000_000
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

from tqdm.autonotebook import tqdm
import string

%matplotlib inline

os.listdir('../input/coleridgeinitiative-show-us-the-data')

## Config

In [None]:
# Run local validation when True; switched off further down if sample_submission.csv has more than 4 rows.
COMPUTE_CV = True
# Selects which of the two text_cleaning() variants gets defined in the "Text Cleaning" section.
KEN_TEXT_CLEANING = False
# True: use NLTK's English stop-word list for the word-frequency plots; False: use the hand-written list.
NLTK_STOPWORDS = True

In [None]:
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# A sample submission with more than 4 rows turns local validation off.
if len(sample_sub) > 4:
    COMPUTE_CV = False

if not COMPUTE_CV:
    print('this submission notebook will only be used to submit result')
else:
    print('this submission notebook will compute CV score but commit notebook will not')

# Data Exploration

In [None]:
# Training labels and the folder holding one JSON file per training publication.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'

if COMPUTE_CV: 
    # Local validation: predictions are made for the training rows themselves.
    # NOTE: this binds sample_sub to the SAME DataFrame object as train_df (no copy),
    # so a column added through either name afterwards is visible through both.
    sample_sub = train_df
else:
    # Submission run: predict for the publications listed in sample_submission.csv,
    # whose JSON files live in the test folder.
    sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

train_df.head()

**Data Description**
- `id` - publication id - note that there are multiple rows for some training documents, indicating multiple mentioned datasets.
- `pub_title` - title of the publication (a small number of publications have the same title).
- `dataset_title` - the title of the dataset that is mentioned within the publication.
- `dataset_label` - a portion of the text that indicates the dataset.
- `cleaned_label` - the dataset_label, as passed through the clean_text function from the Evaluation page.

In [None]:
sample_sub.head()

**sample_submission.csv** - a sample submission file in the correct format.
- `Id` - publication id.
- `PredictionString` - To be filled with equivalent of cleaned_label of train data.

In [None]:
train_df.info()

## Unique Values

In [None]:
# Print the number of distinct values in every column of train_df.
# A plain loop is used because print() is a side effect: a list comprehension
# would build a throwaway list of None and echo it as the cell output.
for col in train_df.columns:
    print(f'{col}: {len( train_df[col].unique() )}')

## Contents

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read one publication's JSON file and return its content as a single string.

    The file is parsed as a list of sections, each a mapping read through its
    'section_title' and 'text' entries.

    filename - publication id; '<filename>.json' is opened inside train_files_path
    train_files_path - directory that contains the publication JSON files
    output - 'text' returns the section texts joined by spaces,
             'head' returns the section titles joined by spaces,
             any other value returns titles and texts interleaved, joined by '. '
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A missing or null field would make str.join raise TypeError, so it is
    # treated as an empty string instead.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title, text, title, text, ... in section order.
    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
%%time
# Register progress_apply on pandas objects so the reads below show a progress bar.
tqdm.pandas()

# Read every training publication and store its section texts in a new 'text' column.
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

if not COMPUTE_CV:
    # Submission run only: sample_sub is a separate frame, so read its
    # publications from test_files_path instead of the training folder.
    sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return,
                                                             train_files_path=test_files_path))

train_df.head()

# Text Cleaning

In [None]:
if KEN_TEXT_CLEANING:
    
    # from https://www.kaggle.com/mlconsult/score-57ish-with-additional-govt-datasets
    def text_cleaning(text):
        '''
        Lower-cases the text and replaces every run of characters other than
        A-Z, a-z and 0-9 (punctuation, emojis, whitespace) with a single space.
        text - Sentence that needs to be cleaned
        '''
        # This one substitution already collapses repeated spaces and drops
        # emojis, so separate passes for those could never match anything.
        return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
    
else:
    
    def text_cleaning(text):
        '''
        Deletes ASCII punctuation (joining the characters on either side of it),
        then lower-cases the text and replaces every remaining run of characters
        other than A-Z, a-z and 0-9 with a single space.
        text - Sentence that needs to be cleaned
        '''
        # Coerce to str before touching the characters so that a non-string
        # value (e.g. NaN) is cleaned instead of raising on iteration.
        text = str(text).translate(str.maketrans('', '', string.punctuation))
        return re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()

In [None]:
%%time
tqdm.pandas()

# Overwrite the raw publication text with its cleaned form.
# When COMPUTE_CV is set, sample_sub is the same DataFrame, so its 'text' is
# cleaned as well; otherwise sample_sub keeps the raw text.
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

# Visualization

In [None]:
words = list( train_df['cleaned_label'].values )

if NLTK_STOPWORDS:
    # Reach the corpus through the nltk package rather than the imported
    # `stopwords` name: that name is rebound to a plain list on this line, so
    # `stopwords.words(...)` would raise AttributeError if the cell is re-run.
    stopwords = nltk.corpus.stopwords.words('english')
else:
    stopwords = ['ourselves', 'hers', 'the', 'of', 'and', 'in', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than', '2', '19', 'dataset', 'c', 'database']

# Membership tests against a set are O(1); the list would be scanned per word.
stopword_set = set(stopwords)

# One list of non-stop-words per label.
split_words = [
    [w for w in str(word).split() if w not in stopword_set]
    for word in words
]

# All remaining words flattened into one list, ready for frequency counting.
allwords = [w for wordlist in split_words for w in wordlist]

## 100 Most Common Words
`cleaned_label` - WordCloud

In [None]:
# Count the word frequencies once and reuse them for both charts.
word_freq = FreqDist(allwords)

mostcommon = word_freq.most_common(100)
# Hand the (word, count) pairs to the cloud as frequencies. Rendering
# str(mostcommon) as text makes every word occur exactly once in the input,
# so the counts would not drive the word sizes.
# generate_from_frequencies does not apply the stop-word list itself, hence
# the explicit filter.
cloud_freqs = {word: count for word, count in mostcommon if word not in STOPWORDS}
wordcloud = WordCloud( width = 1600,
                      height = 800,
                      background_color = 'white',
                      stopwords = STOPWORDS ).generate_from_frequencies(cloud_freqs)
fig = plt.figure(figsize=(30, 10), facecolor='white')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top 100 Most Common Words in cleaned_label', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

# Bar chart of the 25 most frequent words and their counts.
mostcommon_small = word_freq.most_common(25)
x, y = zip(*mostcommon_small)
plt.figure(figsize=(50, 30))
plt.margins(0.02)
plt.bar(x, y)
plt.xlabel('words', fontsize=50)
plt.ylabel('Frequency of Words', fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
plt.title('Freq of 25 Most Common Words in cleaned_label', fontsize=60)
plt.show()

# Baseline model and Submission

In [None]:
def clean_text(txt):
    """
    Normalise a label: lower-case it, turn every run of characters other than
    A-Z, a-z and 0-9 into one space, and trim the ends.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [None]:
# Every known way a dataset is referred to in train.csv, lower-cased.
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

existing_labels = set(temp_1 + temp_2 + temp_3)

print(f'len(temp_1) = {len(temp_1)}')
print(f'len(temp_2) = {len(temp_2)}')
print(f'len(temp_3) = {len(temp_3)}')
print(f'len(existing_labels) = {len(existing_labels)}')

# Index the training labels by publication text once, so each row below is a
# dictionary lookup instead of a comparison against every row of train_df.
labels_by_text = train_df.groupby('text', sort=False)['cleaned_label'].agg(list).to_dict()

id_list = []
lables_list = []
for index, row in tqdm(sample_sub.iterrows(), total=len(sample_sub)):
    sample_text = row['text']
    row_id = row['Id']

    # Labels already recorded for a training publication with identical text.
    cleaned_labels = {clean_text(x)
                      for x in labels_by_text.get(text_cleaning(sample_text), [])}

    # Plus every known label that literally occurs in the text; the lower-cased
    # text is computed once per row rather than once per label.
    lowered_text = sample_text.lower()
    cleaned_labels.update(clean_text(known_label)
                          for known_label in existing_labels
                          if known_label in lowered_text)

    # Sorted so the '|'-joined string is the same on every run.
    lables_list.append('|'.join(sorted(cleaned_labels)))
    id_list.append(row_id)

In [None]:
# Attach the predictions to sample_sub in place; the "Compute CV" cell reads
# PredictionString back from this same frame.
sample_sub['Id'] = id_list
sample_sub['PredictionString'] = lables_list
# Only the Id and PredictionString columns are written to the submission file.
sample_sub[['Id', 'PredictionString']].to_csv('submission.csv', index=False)

sample_sub[['Id', 'PredictionString']].head()

# Compute CV

In [None]:
def getMetric(col):
    """
    Build a row-wise F1 scorer comparing 'PredictionString' with column `col`.

    col - name of the ground-truth column; each value may be a '|'-separated
          string of labels or an already-split sequence of labels

    Returns a function that takes one row (anything supporting
    row.PredictionString and row[col]) and returns
    2 * matches / (number of predictions + number of truths).
    """
    def f1score(row):
        predicted = row.PredictionString.split('|')
        truth = row[col]
        # A plain string holds '|'-separated labels, not a sequence of
        # characters: without this split, len() would count characters.
        if isinstance(truth, str):
            truth = truth.split('|')
        n = len(np.intersect1d(predicted, truth))
        return 2*n / (len(predicted) + len(truth))
    return f1score

def my_jaccard(strs):
    """
    Mean word-level Jaccard similarity between each predicted label and the truth.

    strs - pair (str1, str2): str1 holds '|'-separated predicted labels,
           str2 is the ground-truth label

    Each predicted label is compared with str2 as a set of lower-cased,
    whitespace-separated words, and the per-label scores are averaged.
    A comparison with no words on either side scores 0.0.
    """
    str1, str2 = strs
    truth_words = set(str2.lower().split())
    scores = []
    for sentence in str1.lower().split('|'):
        # Score this single label, not the whole '|'-joined prediction string.
        label_words = set(sentence.split())
        shared = label_words & truth_words
        union_size = len(label_words) + len(truth_words) - len(shared)
        # Both sides empty: avoid dividing by zero.
        scores.append(len(shared) / union_size if union_size else 0.0)
    return sum(scores) / len(scores)

In [None]:
if COMPUTE_CV:
    # Score each row's PredictionString against its cleaned_label with both
    # metrics and report the mean over all rows.
    getMetric_score = sample_sub.apply(getMetric('cleaned_label'), axis=1)
    print('getMetric_score =', getMetric_score.mean())
    my_jaccard_score = sample_sub[['PredictionString', 'cleaned_label']].apply(my_jaccard, axis=1)
    print('my_jaccard_score =', my_jaccard_score.mean())
    
# Echo the configuration flags used for this run.
print(f'COMPUTE_CV = {COMPUTE_CV}')
print(f'KEN_TEXT_CLEANING = {KEN_TEXT_CLEANING}')
print(f'NLTK_STOPWORDS = {NLTK_STOPWORDS}')

|   | CV | LB |
| --- | --- | --- |
| KEN + MY_SW | 0.705 | 0.534 |
| MY_SW | 0.695 | 0.534 |
| NLTK_SW | 0.700 | 0.534 |
| KEN + NLTK_SW | 0.701 | 0.534 |