In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, glob, pickle, time, gc, copy, sys
import warnings
from tqdm import tqdm
import re
from concurrent.futures import ThreadPoolExecutor

tqdm.pandas()
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100) # 表示できる列数

sys.path.append('../src')
from utils import ri, pickle_load, pickle_save
import nltk

In [2]:
# config
params = {}
### params directory
params['INPUT_DIR'] = "../input/orig"
params['OUTPUT_DIR'] = "../output"

In [3]:
df_train = pd.read_csv(params['INPUT_DIR']+"/train.csv")
df_train.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [4]:
df_train_reduced = pickle_load(params['OUTPUT_DIR']+"/df_train_reduced.pkl")
df_train_reduced.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,clean_text
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,This study used data from the National Educati...,this study used data from the national educati...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Dropping out of high school is not necessarily...,dropping out of high school is not necessarily...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,", stress satisfactory outcomes for all youth,...",stress satisfactory outcomes for all youth inc...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Federal Reserve Bank of Richmond S1. Accountin...,federal reserve bank of richmond s1 accounting...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,This article investigates an important factor ...,this article investigates an important factor ...


# Train label

In [5]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if len(a)==0 or len(b)==0:
        return 0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

In [6]:
df_label = df_train['cleaned_label'].value_counts().reset_index()
df_label.columns = ['cleaned_label', 'count']
df_label['-count'] = -df_label['count']
df_label = df_label.sort_values(['-count', 'cleaned_label']).reset_index(drop=True)
df_label['target'] = np.arange(len(df_label))
df_label['cleaned_label'] = df_label['cleaned_label'].apply(clean_text)

df_tmp1 = df_train[['dataset_label', 'cleaned_label', 'dataset_title']]
df_tmp2 = df_train[['dataset_title', 'cleaned_label', 'dataset_title']]
df_tmp3 = df_train[['cleaned_label', 'cleaned_label', 'dataset_title']]
df_tmp1.columns = ['label', 'cleaned_label', 'dataset_title']
df_tmp2.columns = ['label', 'cleaned_label', 'dataset_title']
df_tmp3.columns = ['label', 'cleaned_label', 'dataset_title']
df_label2 = pd.concat([df_tmp1, df_tmp2, df_tmp3])

df_label2['label'] = df_label2['label'].apply(lambda x: x.lower())
df_label2['cleaned_label_my'] = df_label2['label'].apply(clean_text)
df_label2['cleaned_label'] = df_label2['cleaned_label'].apply(clean_text)
df_label2 = df_label2[df_label2['label'].duplicated()==False].reset_index(drop=True)
df_label2 = pd.merge(df_label2, df_label, on='cleaned_label', how='left')
print(df_label2.shape)
print(np.sum(pd.isna(df_label2['target'])))
# print(df_label2[df_label2['target']==1])
df_label2 = df_label2.sort_values(['-count', 'label']).reset_index(drop=True)
df_label2['target_relabeled'] = df_label2['target']
df_label2['target_relabeled'][df_label2['cleaned_label']!=df_label2['cleaned_label_my']] =\
    np.arange(np.sum(df_label2['cleaned_label']!=df_label2['cleaned_label_my'])) + df_label2['target'].max()+1
print("not match: ", np.sum(df_label2['cleaned_label']!=df_label2['cleaned_label_my']))
print("unique cleaned_label_my: ", len(df_label2['cleaned_label_my'].unique()))
print(df_label2['target'].max())
print(df_label2['target_relabeled'].max())

df_label_train = df_label2.copy()
print(df_label_train.shape)
df_label_train.head()

(180, 7)
0
not match:  3
unique cleaned_label_my:  133
129
132
(180, 8)


Unnamed: 0,label,cleaned_label,dataset_title,cleaned_label_my,count,-count,target,target_relabeled
0,adni,adni,Alzheimer's Disease Neuroimaging Initiative (A...,adni,3673,-3673,0,0
1,alzheimer s disease neuroimaging initiative adni,alzheimer s disease neuroimaging initiative adni,Alzheimer's Disease Neuroimaging Initiative (A...,alzheimer s disease neuroimaging initiative adni,2400,-2400,1,1
2,alzheimer's disease neuroimaging initiative (a...,alzheimer s disease neuroimaging initiative adni,Alzheimer's Disease Neuroimaging Initiative (A...,alzheimer s disease neuroimaging initiative adni,2400,-2400,1,1
3,trends in international mathematics and scienc...,trends in international mathematics and scienc...,Trends in International Mathematics and Scienc...,trends in international mathematics and scienc...,1163,-1163,2,2
4,baltimore longitudinal study of aging,baltimore longitudinal study of aging,Baltimore Longitudinal Study of Aging (BLSA),baltimore longitudinal study of aging,1156,-1156,3,3


# train acronym

In [7]:

acronym_list = [
    ['National Education Longitudinal Study', 'nels'],
#     ['NOAA Tide Gauge', 'nan'],
    ['Sea, Lake, and Overland Surges from Hurricanes', 'slosh'],
    ['Coastal Change Analysis Program', 'c-cap'],
    ['Aging Integrated Database (AGID)', 'agid'],
    ["Alzheimer's Disease Neuroimaging Initiative (ADNI)", 'adni'],
    ['Baltimore Longitudinal Study of Aging (BLSA)', 'blsa'],
    ['Agricultural Resource Management Survey', 'arms'],
    ['Beginning Postsecondary Student', 'bps'],
    ["The National Institute on Aging Genetics of Alzheimer's Disease Data Storage Site (NIAGADS)", 'niagads'],
    ['Common Core of Data', 'ccd'],
#     ['Survey of Industrial Research and Development', 'nan'],
    ['Baccalaureate and Beyond', 'b&b'],
    ['International Best Track Archive for Climate Stewardship', 'IBTrACS'],
    ['National Teacher and Principal Survey', 'ntps'],
    ['Higher Education Research and Development Survey', 'herd'],
    ['Survey of Earned Doctorates', 'sed'],
    ['School Survey on Crime and Safety', 'ssocs'],
    ['World Ocean Database', 'wod'],
    ['Program for the International Assessment of Adult Competencies', 'piaac'],
    ['Early Childhood Longitudinal Study', 'ecls'],
#     ['Survey of Graduate Students and Postdoctorates in Science and Engineering', 'nan'],
    ['Trends in International Mathematics and Science Study', 'timss'],
    ['Education Longitudinal Study', 'els'],
    ['Optimum Interpolation Sea Surface Temperature', 'oisst'],
    ['National Assessment of Education Progress', 'naep'],
    ['High School Longitudinal Study', 'hsls'],
    ['Survey of Doctorate Recipients', 'sdr'],
    ['Rural-Urban Continuum Codes', 'rucc'],
#     ['Survey of Science and Engineering Research Facilities', 'nan'],
#     ['FFRDC Research and Development Survey', 'nan'],
#     ['Survey of State Government Research and Development', 'nan'],
    ['Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat)', 'comcat'],
#     ['Census of Agriculture', 'nan'],
    ['North American Breeding Bird Survey (BBS)', 'bbs'],
    ['COVID-19 Open Research Dataset (CORD-19)', 'cord-19'],
    ['Complexity Science Hub COVID-19 Control Strategies List (CCCSL)', 'cccsl'],
#     ['Our World in Data COVID-19 dataset', 'nan'],
    ['COVID-19 Precision Medicine Analytics Platform Registry (JH-CROWN)', 'jh-crown'],
    ['Characterizing Health Associated Risks, and Your Baseline Disease In SARS-COV-2 (CHARYBDIS)', 'charybdis'],
#     ['COVID-19 Deaths data', 'nan'],
#     ['SARS-CoV-2 genome sequence', 'nan'],
#     ['COVID-19 Image Data Collection', 'nan'],
    ['RSNA International COVID-19 Open Radiology Database (RICORD)', 'ricord'],
#     ['CAS COVID-19 antiviral candidate compounds dataset', 'nan'],
]
df_label_train_acronym = pd.DataFrame(acronym_list, columns=['dataset_title', 'acronym'])
df_label_train_acronym['acronym_clean'] = df_label_train_acronym['acronym'].apply(clean_text)
df_label_train_acronym

Unnamed: 0,dataset_title,acronym,acronym_clean
0,National Education Longitudinal Study,nels,nels
1,"Sea, Lake, and Overland Surges from Hurricanes",slosh,slosh
2,Coastal Change Analysis Program,c-cap,c cap
3,Aging Integrated Database (AGID),agid,agid
4,Alzheimer's Disease Neuroimaging Initiative (A...,adni,adni
5,Baltimore Longitudinal Study of Aging (BLSA),blsa,blsa
6,Agricultural Resource Management Survey,arms,arms
7,Beginning Postsecondary Student,bps,bps
8,The National Institute on Aging Genetics of Al...,niagads,niagads
9,Common Core of Data,ccd,ccd


# Detect acronym ver1

In [22]:
def detect_keywords(text, keywords):
    for keyword in keywords:
        if keyword in text:
            return True
    return False

def det_acronym_ver1(text, keywords, TH_LEN_CHAR = 3):
    ans = []
    # text = re.sub("-", " ", text)
    words = text.split()
    for i, word in enumerate(words):
        if word[0]!='(' or word[-1]!=')': continue # (XXX)の形でなければスルー
        acronym_cand = word[1:-1]
        if acronym_cand.lower()==acronym_cand: continue # 大文字が一つもないならスルー
        len_acronym_cand = len(acronym_cand)
        if len_acronym_cand<TH_LEN_CHAR: continue # 3文字以下ならスルー
        acronym_cand_lower = acronym_cand.lower()
        acronym_cand_reverse = acronym_cand_lower[::-1]
        words_cand = words[np.clip(i-len_acronym_cand*3, 0, i):i] # (XXX)の前の数単語を抽出
#         print("len_acronym_cand", len_acronym_cand)
#         print("i", i)
#         print(words_cand, 1)
        words_cand = ' '.join(words_cand).strip()
# #         print(words_cand, 1)
        words_cand = re.sub('[^A-Za-z0-9]+', ' ', words_cand).strip()
#         print(words_cand, 1)
        words_cand = words_cand.split(" ")
        words_cand_reverse = words_cand[::-1]
        idx = 0 # acronym_candの文字index
#         print("words_cand_reverse[0]", words_cand_reverse[0])
        for j, word in enumerate(words_cand_reverse):
            if idx==len_acronym_cand:
                break
            if len(word)==0:
                continue
            if word[0].lower()==acronym_cand_reverse[idx]:
                if idx==len_acronym_cand-1: # 1文字目を検出
                    idx_start = j # dataset名の1単語目のindex
                idx += 1
        if idx==len_acronym_cand:
            words_reverse = words_cand_reverse[:idx_start+1]
            dataset = ' '.join(words_reverse[::-1]).strip().lower()
            if detect_keywords(dataset, keywords):
                acronym = acronym_cand
#                 print("Acronym: {}".format(acronym))
#                 print("Dataset: {}".format(dataset))
                ans.append([acronym_cand, dataset])
        return ans
    
keywords = [
    'study',
    'studies',
    'data',
    'survey',
    'panel',
    'census',
    'cohort',
    'longitudinal',
    'registry',
]    
df_train_reduced['det_acronym'] = df_train_reduced['text'].progress_apply(lambda x: det_acronym_ver1(x, keywords))
df_train_reduced.head()

100%|██████████| 14316/14316 [00:10<00:00, 1423.15it/s]


Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,clean_text,det_acronym
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,This study used data from the National Educati...,this study used data from the national educati...,[]
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Dropping out of high school is not necessarily...,dropping out of high school is not necessarily...,[]
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,", stress satisfactory outcomes for all youth,...",stress satisfactory outcomes for all youth inc...,[]
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,Federal Reserve Bank of Richmond S1. Accountin...,federal reserve bank of richmond s1 accounting...,[]
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,This article investigates an important factor ...,this article investigates an important factor ...,"[[NELS, national education longitudinal study]]"


In [24]:
tmp = []
# ri(df_traintest)なしだとここでエラー
for i in range(len(df_train_reduced)):
    if df_train_reduced['det_acronym'][i] is not None:
        tmp += df_train_reduced['det_acronym'][i]
df_train_acronym = pd.DataFrame(tmp)
df_train_acronym.columns = ['acronym', 'base']
df_train_acronym['base_acronym'] = df_train_acronym['base']+"|"+df_train_acronym['acronym']
df_train_acronym_tmp = df_train_acronym[df_train_acronym['base_acronym'].duplicated()==False]


df_agg = df_train_acronym.groupby('base_acronym')['base'].agg(len).reset_index()
cols_tmp = df_agg.columns.tolist()
cols_tmp[-1] = 'count_BA'
df_agg.columns = cols_tmp
df_train_acronym = df_train_acronym[df_train_acronym['base_acronym'].duplicated()==False]
df_train_acronym = pd.merge(df_train_acronym_tmp, df_agg, on='base_acronym', how='left')

df_train_acronym['train'] = True
print(df_train_acronym.shape)
# acronym unique =256
# BA unique 325
df_train_acronym.head(30)

(325, 5)


Unnamed: 0,acronym,base,base_acronym,count_BA,train
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True
3,NELS,national educational longitudinal survey of 1988,national educational longitudinal survey of 19...,2,True
4,HSB,high school and beyond longitudinal dataset,high school and beyond longitudinal dataset|HSB,1,True
5,NELS,national education longitudinal study of 1988,national education longitudinal study of 1988|...,4,True
6,NHES,national household education survey,national household education survey|NHES,4,True
7,NLSY,national longitudinal survey of youth,national longitudinal survey of youth|NLSY,4,True
8,ATUS,american time use survey,american time use survey|ATUS,1,True
9,TIMSS,trends in international mathematics and scienc...,trends in international mathematics and scienc...,194,True


In [25]:
df_label_acronym_ver1_all = df_train_acronym.copy()
print(df_label_acronym_ver1_all.shape)
df_label_acronym_ver1_all.head()

(325, 5)


Unnamed: 0,acronym,base,base_acronym,count_BA,train
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True
3,NELS,national educational longitudinal survey of 1988,national educational longitudinal survey of 19...,2,True
4,HSB,high school and beyond longitudinal dataset,high school and beyond longitudinal dataset|HSB,1,True


In [26]:
def get_match(x, ref_labels, threshold=0.5):
    for label in ref_labels:
        if jaccard(x, label)>=threshold:
            return label
    return 'no_match'

df_label_acronym_ver1_all['match'] = df_label_acronym_ver1_all['base'].apply(get_match, ref_labels=df_label2['cleaned_label_my'])
print((df_label_acronym_ver1_all['match']=='no_match').sum())
print(df_label_acronym_ver1_all.shape)
df_label_acronym_ver1_all.head()

270
(325, 6)


Unnamed: 0,acronym,base,base_acronym,count_BA,train,match
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True,education longitudinal study
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True,education longitudinal study
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True,beginning postsecondary students longitudinal ...
3,NELS,national educational longitudinal survey of 1988,national educational longitudinal survey of 19...,2,True,no_match
4,HSB,high school and beyond longitudinal dataset,high school and beyond longitudinal dataset|HSB,1,True,no_match


In [27]:
df_label_acronym_ver1_all[(df_label_acronym_ver1_all['match']!='no_match')&(df_label_acronym_ver1_all['match'].duplicated())].head(30)

Unnamed: 0,acronym,base,base_acronym,count_BA,train,match
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True,education longitudinal study
5,NELS,national education longitudinal study of 1988,national education longitudinal study of 1988|...,4,True,education longitudinal study
11,WLS,wisconsin longitudinal study,wisconsin longitudinal study|WLS,1,True,education longitudinal study
13,NELS,national education longitudinal study 1988 stu...,national education longitudinal study 1988 stu...,1,True,education longitudinal study
59,ARMS,and resource management survey,and resource management survey|ARMS,4,True,agricultural resource management survey
65,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,6,True,beginning postsecondary students
66,NLTS2,national longitudinal transition study 2,national longitudinal transition study 2|NLTS2,2,True,national education longitudinal study
67,TIMSS,third international mathematics and science study,third international mathematics and science st...,6,True,trends in international mathematics and scienc...
81,TLSA,taiwan longitudinal study of aging,taiwan longitudinal study of aging|TLSA,1,True,baltimore longitudinal study of aging
105,TIMSS,trends in mathematics and science study,trends in mathematics and science study|TIMSS,2,True,trends in international mathematics and scienc...


In [28]:
def delete_keywords(x, keywords):
    x_split = x.split()
    x_new = []
    for word in x_split:
        if word not in keywords:
            x_new.append(word)
    x_new = ' '.join(x_new)
    return x_new

keywords2 = keywords + ['of', 'the', 'national', 'education']
df_label2['label-keywords'] = df_label2['cleaned_label_my'].apply(lambda x: delete_keywords(x, keywords2))
df_label2.head()

Unnamed: 0,label,cleaned_label,dataset_title,cleaned_label_my,count,-count,target,target_relabeled,label-keywords
0,adni,adni,Alzheimer's Disease Neuroimaging Initiative (A...,adni,3673,-3673,0,0,adni
1,alzheimer s disease neuroimaging initiative adni,alzheimer s disease neuroimaging initiative adni,Alzheimer's Disease Neuroimaging Initiative (A...,alzheimer s disease neuroimaging initiative adni,2400,-2400,1,1,alzheimer s disease neuroimaging initiative adni
2,alzheimer's disease neuroimaging initiative (a...,alzheimer s disease neuroimaging initiative adni,Alzheimer's Disease Neuroimaging Initiative (A...,alzheimer s disease neuroimaging initiative adni,2400,-2400,1,1,alzheimer s disease neuroimaging initiative adni
3,trends in international mathematics and scienc...,trends in international mathematics and scienc...,Trends in International Mathematics and Scienc...,trends in international mathematics and scienc...,1163,-1163,2,2,trends in international mathematics and science
4,baltimore longitudinal study of aging,baltimore longitudinal study of aging,Baltimore Longitudinal Study of Aging (BLSA),baltimore longitudinal study of aging,1156,-1156,3,3,baltimore aging


In [29]:
df_label_acronym_ver1_all['label-keywords'] = df_label_acronym_ver1_all['base'].apply(lambda x: delete_keywords(x, keywords))
df_label_acronym_ver1_all['len_label-keywords'] = df_label_acronym_ver1_all['label-keywords'].apply(lambda x: len(x.split()))
df_label_acronym_ver1_all.head()

Unnamed: 0,acronym,base,base_acronym,count_BA,train,match,label-keywords,len_label-keywords
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True,education longitudinal study,national education,2
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True,education longitudinal study,national,1
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True,beginning postsecondary students longitudinal ...,beginning postsecondary student,3
3,NELS,national educational longitudinal survey of 1988,national educational longitudinal survey of 19...,2,True,no_match,national educational of 1988,4
4,HSB,high school and beyond longitudinal dataset,high school and beyond longitudinal dataset|HSB,1,True,no_match,high school and beyond dataset,5


In [30]:
df_label_acronym_ver1_all['match_train-keywords'] = df_label_acronym_ver1_all['label-keywords'].apply(lambda x:
    get_match(x, ref_labels=df_label2['label-keywords'], threshold=0.75))
df_label_acronym_ver1_all.head()

Unnamed: 0,acronym,base,base_acronym,count_BA,train,match,label-keywords,len_label-keywords,match_train-keywords
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True,education longitudinal study,national education,2,no_match
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True,education longitudinal study,national,1,no_match
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True,beginning postsecondary students longitudinal ...,beginning postsecondary student,3,beginning postsecondary student
3,NELS,national educational longitudinal survey of 1988,national educational longitudinal survey of 19...,2,True,no_match,national educational of 1988,4,no_match
4,HSB,high school and beyond longitudinal dataset,high school and beyond longitudinal dataset|HSB,1,True,no_match,high school and beyond dataset,5,no_match


In [31]:
df_label_acronym_ver1_all[
    (df_label_acronym_ver1_all['match_train-keywords']!='no_match')
    &(df_label_acronym_ver1_all['match_train-keywords'].duplicated())].head(30)

Unnamed: 0,acronym,base,base_acronym,count_BA,train,match,label-keywords,len_label-keywords,match_train-keywords
65,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,6,True,beginning postsecondary students,beginning postsecondary students,3,beginning postsecondary students
105,TIMSS,trends in mathematics and science study,trends in mathematics and science study|TIMSS,2,True,trends in international mathematics and scienc...,trends in mathematics and science,5,trends in international mathematics and science
163,TIMSS,trends in international mathematics and scienc...,trends in international mathematics and scienc...,1,True,trends in international mathematics and scienc...,trends in international mathematics and scienc...,7,trends in international mathematics and science
222,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,1,True,beginning postsecondary students,beginning postsecondary students,3,beginning postsecondary students
238,ECLS,early childhood longitudinal studies,early childhood longitudinal studies|ECLS,1,True,early childhood longitudinal study,early childhood,2,early childhood
285,ARMS,agricultural resource management survey data,agricultural resource management survey data|ARMS,1,True,agricultural resource management survey,agricultural resource management,3,agricultural resource management
287,ARMS,agricultural resource and management survey,agricultural resource and management survey|ARMS,1,True,agricultural resource management survey,agricultural resource and management,4,agricultural resource management
289,ARMS,agricultural resource management surveys,agricultural resource management surveys|ARMS,4,True,agricultural resource management survey,agricultural resource management surveys,4,agricultural resource management
291,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,1,True,beginning postsecondary students,beginning postsecondary students 2,4,beginning postsecondary students


In [32]:
idx_tmp = df_label_acronym_ver1_all['match_train-keywords']=='no_match'
print(df_label_acronym_ver1_all[idx_tmp].shape)
# high school effectiveness studyのみ誤削除
df_label_acronym_ver1_all[idx_tmp==False]

(303, 9)


Unnamed: 0,acronym,base,base_acronym,count_BA,train,match,label-keywords,len_label-keywords,match_train-keywords
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True,beginning postsecondary students longitudinal ...,beginning postsecondary student,3,beginning postsecondary student
9,TIMSS,trends in international mathematics and scienc...,trends in international mathematics and scienc...,194,True,trends in international mathematics and scienc...,trends in international mathematics and science,6,trends in international mathematics and science
14,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,1,True,beginning postsecondary students,beginning postsecondary students,3,beginning postsecondary students
56,ARMS,agricultural resource management survey,agricultural resource management survey|ARMS,128,True,agricultural resource management survey,agricultural resource management,3,agricultural resource management
64,SSOCS,school survey on crime and safety,school survey on crime and safety|SSOCS,5,True,school survey on crime and safety,school on crime and safety,5,school on crime and safety
65,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,6,True,beginning postsecondary students,beginning postsecondary students,3,beginning postsecondary students
93,ECLS,early childhood longitudinal study,early childhood longitudinal study|ECLS,10,True,early childhood longitudinal study,early childhood,2,early childhood
105,TIMSS,trends in mathematics and science study,trends in mathematics and science study|TIMSS,2,True,trends in international mathematics and scienc...,trends in mathematics and science,5,trends in international mathematics and science
163,TIMSS,trends in international mathematics and scienc...,trends in international mathematics and scienc...,1,True,trends in international mathematics and scienc...,trends in international mathematics and scienc...,7,trends in international mathematics and science
194,WOD,world ocean database,world ocean database|WOD,3,True,world ocean database,world ocean database,3,world ocean database


In [33]:
idx_tmp = df_label_acronym_ver1_all['match']=='no_match'
print(df_label_acronym_ver1_all[idx_tmp].shape)
df_label_acronym_ver1_all[idx_tmp==False]

(270, 9)


Unnamed: 0,acronym,base,base_acronym,count_BA,train,match,label-keywords,len_label-keywords,match_train-keywords
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True,education longitudinal study,national education,2,no_match
1,NLS,national longitudinal study,national longitudinal study|NLS,1,True,education longitudinal study,national,1,no_match
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True,beginning postsecondary students longitudinal ...,beginning postsecondary student,3,beginning postsecondary student
5,NELS,national education longitudinal study of 1988,national education longitudinal study of 1988|...,4,True,education longitudinal study,national education of 1988,4,no_match
9,TIMSS,trends in international mathematics and scienc...,trends in international mathematics and scienc...,194,True,trends in international mathematics and scienc...,trends in international mathematics and science,6,trends in international mathematics and science
11,WLS,wisconsin longitudinal study,wisconsin longitudinal study|WLS,1,True,education longitudinal study,wisconsin,1,no_match
13,NELS,national education longitudinal study 1988 stu...,national education longitudinal study 1988 stu...,1,True,education longitudinal study,national education 1988 1994,4,no_match
14,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,1,True,beginning postsecondary students,beginning postsecondary students,3,beginning postsecondary students
21,NELS,national education longitudinal survey,national education longitudinal survey|NELS,3,True,national education longitudinal study,national education,2,no_match
22,HSES,high school effectiveness study,high school effectiveness study|HSES,1,True,high school longitudinal study,high school effectiveness,3,no_match


In [34]:
df_label_acronym_ver1_all['acronym_clean'] = df_label_acronym_ver1_all['acronym'].apply(clean_text)
df_label_acronym_ver1_all['match_train_acronym'] = df_label_acronym_ver1_all['acronym_clean'].apply(lambda x:
    x in df_label_train_acronym['acronym_clean'].tolist()
                                                                                                   )
df_label_acronym_ver1_all[df_label_acronym_ver1_all['match_train_acronym']]

Unnamed: 0,acronym,base,base_acronym,count_BA,train,match,label-keywords,len_label-keywords,match_train-keywords,acronym_clean,match_train_acronym
0,NELS,national education longitudinal study,national education longitudinal study|NELS,24,True,education longitudinal study,national education,2,no_match,nels,True
2,BPS,beginning postsecondary student longitudinal s...,beginning postsecondary student longitudinal s...,2,True,beginning postsecondary students longitudinal ...,beginning postsecondary student,3,beginning postsecondary student,bps,True
3,NELS,national educational longitudinal survey of 1988,national educational longitudinal survey of 19...,2,True,no_match,national educational of 1988,4,no_match,nels,True
5,NELS,national education longitudinal study of 1988,national education longitudinal study of 1988|...,4,True,education longitudinal study,national education of 1988,4,no_match,nels,True
9,TIMSS,trends in international mathematics and scienc...,trends in international mathematics and scienc...,194,True,trends in international mathematics and scienc...,trends in international mathematics and science,6,trends in international mathematics and science,timss,True
13,NELS,national education longitudinal study 1988 stu...,national education longitudinal study 1988 stu...,1,True,education longitudinal study,national education 1988 1994,4,no_match,nels,True
14,BPS,beginning postsecondary students longitudinal ...,beginning postsecondary students longitudinal ...,1,True,beginning postsecondary students,beginning postsecondary students,3,beginning postsecondary students,bps,True
15,ECLS,early childhood longitudinal study kindergarte...,early childhood longitudinal study kindergarte...,1,True,no_match,early childhood kindergarten class of 1998 99,7,no_match,ecls,True
21,NELS,national education longitudinal survey,national education longitudinal survey|NELS,3,True,national education longitudinal study,national education,2,no_match,nels,True
36,BLSA,baltimore longitudinal study of aging,baltimore longitudinal study of aging|BLSA,58,True,baltimore longitudinal study of aging,baltimore of aging,3,no_match,blsa,True


In [36]:
# NLTS2, ECLSKが問題か
print(df_label_acronym_ver1_all.shape)
df_label_acronym_ver1_all.to_csv(params['OUTPUT_DIR']+"/df_label_acronym_ver1_all_210619_02.csv", index=None)

(325, 11)
