# データ読み込み、シリアライズ
> train_data, test_dataをシリアライズし、pandas.DataFrame型に置き換える

In [1]:
import pandas as pd
import scipy as sc
import numpy as np
import sklearn
import pickle
import pathlib as Path
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
sb.set(font='IPAexGothic')

import multiprocessing
import itertools
import collections
import datetime
import gc

# tqdm._tqdm_notebook is a deprecated private path; tqdm.notebook is the public module for the same name
from tqdm.notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Project directories, given as relative paths.
# `Path` is the alias bound to the pathlib module by `import pathlib as Path`, hence Path.Path.
data_path = Path.Path("../data")
img_path = Path.Path("../img")
result_path = Path.Path("../result")

# NOTE(review): presumably a tag for artifacts produced by this notebook; not used in the visible cells - confirm
prefix = 'ana001'

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`


## データ読み込み
> 今回のjsonファイルは1行1データ  
> 1行の文字列をdictに直すのは json.loads() を使用する  
> versions列のみ、list[dict{}, ...] 形式となっており、そのままDF化すると複数行定義となってしまう為、暫定的にlistで囲っておく

In [49]:
import json

# Generator that reads a line-delimited JSON file one line at a time
def get_data_iter(fpath):
    """Lazily yield the non-blank lines of the text file at ``fpath``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, so non-ASCII text is read the same way on
    every machine.

    Parameters
    ----------
    fpath : str or os.PathLike
        Path of the file to read.

    Yields
    ------
    str
        Each line exactly as read (its trailing newline, if any, is kept).
        Lines containing only whitespace are skipped, because they carry no
        record and would make a JSON parser raise.
    """
    # The file stays open only while the generator is being consumed
    with open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines, e.g. an empty line at the end of the file
            if not line.strip():
                continue
            yield line
            
# Nest multi-valued list fields so they occupy a single cell
def modify_data(data):
    """Wrap the 'versions' value of ``data`` in a one-element list.

    ``data`` is modified in place and the same object is returned. When the
    'versions' key is absent, or its value is None, ``data`` is returned
    untouched.
    """
    versions = data.get('versions')

    # Nothing to nest: key missing or explicitly None
    if versions is None:
        return data

    # Nest the original value so the outer list has length 1
    data['versions'] = [versions]
    return data

# Build a DataFrame from a line-delimited JSON file
def json_to_df(path):
    """Read the file at ``path`` line by line and return it as a DataFrame.

    Each line from ``get_data_iter`` is parsed with ``json.loads``, passed
    through ``modify_data``, and collected; the collected records become the
    rows of the returned ``pandas.DataFrame``.
    """
    records = []
    for raw_line in get_data_iter(path):
        parsed = json.loads(raw_line)
        records.append(modify_data(parsed))

    return pd.DataFrame(records)

In [51]:
# Load the training data
train_df = json_to_df(data_path / 'train_data.json')

# Save a pickled copy of the frame under data_path
train_df.to_pickle(data_path / 'train_df.pkl')

train_df

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,doi_cites,cites
0,hep-ph/9902295,Michael Kraemer,Mark E. Hayes (University College London) and ...,Heavy-Flavour Production at HERA,"LaTeX, 21 pages, 13 Postscript figures. Summar...","J.Phys.G25:1477-1493,1999",10.1088/0954-3899/25/7/332,"CERN-TH/99-30, UCL/HEP 99-03",hep-ph hep-ex,,We review the theoretical and experimental s...,"[[{'version': 'v1', 'created': 'Wed, 10 Feb 19...",2008-11-26,"[[Hayes, Mark E., , University College London]...",1,
1,1403.7138,Aigen Li,"Qi Li, S.L. Liang, Aigen Li (University of Mis...",Spectropolarimetric Constraints on the Nature ...,"5 pages, 2 figures; accepted for publication i...",,10.1093/mnrasl/slu021,,astro-ph.GA,http://arxiv.org/licenses/nonexclusive-distrib...,While it is well recognized that interstella...,"[[{'version': 'v1', 'created': 'Thu, 27 Mar 20...",2015-06-19,"[[Li, Qi, , University of Missouri], [Liang, S...",8,7.0
2,1405.5857,Michael Mortonson,"Michael J. Mortonson, Uro\v{s} Seljak",A joint analysis of Planck and BICEP2 B modes ...,"13 pages, 4 figures; submitted to JCAP; refere...",JCAP10(2014)035,10.1088/1475-7516/2014/10/035,,astro-ph.CO gr-qc hep-ph hep-th,http://arxiv.org/licenses/nonexclusive-distrib...,We analyze BICEP2 and Planck data using a mo...,"[[{'version': 'v1', 'created': 'Thu, 22 May 20...",2014-10-17,"[[Mortonson, Michael J., ], [Seljak, Uroš, ]]",122,188.0
3,1807.01034,Evangelos Thomas Karamatskos,"Evangelos T. Karamatskos, Sebastian Raabe, Ter...",Molecular movie of ultrafast coherent rotation...,9 Figures,"Nat Commun 10, 3364 (2019)",10.1038/s41467-019-11122-y,,physics.chem-ph physics.atom-ph quant-ph,http://arxiv.org/licenses/nonexclusive-distrib...,Recording molecular movies on ultrafast time...,"[[{'version': 'v1', 'created': 'Tue, 3 Jul 201...",2020-05-19,"[[Karamatskos, Evangelos T., ], [Raabe, Sebast...",6,8.0
4,1905.05921,Juanjuan Gu,Juanjuan Gu and Yun Jing,A Modified Mixed Domain Method for Modeling Ac...,,,10.1121/10.0001454,,physics.med-ph physics.comp-ph,http://arxiv.org/licenses/nonexclusive-distrib...,"In this paper, phase correction and amplitud...","[[{'version': 'v1', 'created': 'Wed, 15 May 20...",2020-07-15,"[[Gu, Juanjuan, ], [Jing, Yun, ]]",0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851519,1708.06097,Haim Diamant,Rony Granek and Haim Diamant,Membrane undulations in a structured fluid: Un...,10 pages,"Eur. Phys. J. E 41, 1 (2018)",10.1140/epje/i2018-11607-x,,cond-mat.soft,http://arxiv.org/licenses/nonexclusive-distrib...,The dynamics of membrane undulations inside ...,"[[{'version': 'v1', 'created': 'Mon, 21 Aug 20...",2018-02-20,"[[Granek, Rony, ], [Diamant, Haim, ]]",0,
851520,1904.07627,Chonglong Liu,"C. L. Liu, Xiao-Dong Yu, D. M. Tong",Flag Additivity in Quantum Resource Theories,6 pages,"Phys. Rev. A 99, 042322(2019)",10.1103/PhysRevA.99.042322,,quant-ph,http://arxiv.org/licenses/nonexclusive-distrib...,Quantum resource theories offer a powerful f...,"[[{'version': 'v1', 'created': 'Tue, 16 Apr 20...",2019-04-17,"[[Liu, C. L., ], [Yu, Xiao-Dong, ], [Tong, D. ...",2,
851521,1507.01140,Othman Benomar,"O. Benomar, M. Takata, H. Shibahashi, T. Ceill...",Nearly-uniform internal rotation of solar-like...,Accepted to MNRAS,,10.1093/mnras/stv1493,,astro-ph.SR,http://arxiv.org/licenses/nonexclusive-distrib...,The rotation rates in the deep interior and ...,"[[{'version': 'v1', 'created': 'Sat, 4 Jul 201...",2015-08-06,"[[Benomar, O., ], [Takata, M., ], [Shibahashi,...",42,
851522,2006.03674,Sukriti Manna,"Troy D Loeffler, Sukriti Manna, Tarak K Patra,...",Active Learning A Neural Network Model For Gol...,,,10.1002/cctc.202000774,,physics.comp-ph cond-mat.mtrl-sci,http://arxiv.org/licenses/nonexclusive-distrib...,Small metal clusters are of fundamental scie...,"[[{'version': 'v1', 'created': 'Fri, 5 Jun 202...",2020-07-21,"[[Loeffler, Troy D, ], [Manna, Sukriti, ], [Pa...",0,


In [52]:
# Load the test data
test_df = json_to_df(data_path / 'test_data.json')

# Save a pickled copy of the frame under data_path
test_df.to_pickle(data_path / 'test_df.pkl')

test_df

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,doi_cites
0,1605.00995,Simonetta Abenda,Simonetta Abenda,On a family of KP multi-line solitons associat...,"48 pages, 5 figures. Revised manuscript. Added...","J. Geom. Phys. 119 (2017), 112-138",10.1016/j.geomphys.2017.04.005,,math-ph math.MP,http://arxiv.org/licenses/nonexclusive-distrib...,We classify the soliton data in the totally ...,"[[{'version': 'v1', 'created': 'Tue, 3 May 201...",2019-06-27,"[[Abenda, Simonetta, ]]",5
1,1206.6911,Hanqing Zheng,"L. Y. Dai, Meng Shi, Guang-Yi Tang, H. Q. Zheng",On the Nature of X(4260),Refined analysis with new experimental data in...,"Phys. Rev. D 92, 014020 (2015)",10.1103/PhysRevD.92.014020,,hep-ph hep-ex,http://arxiv.org/licenses/nonexclusive-distrib...,We study the property of $X(4260)$ resonance...,"[[{'version': 'v1', 'created': 'Thu, 28 Jun 20...",2015-07-22,"[[Dai, L. Y., ], [Shi, Meng, ], [Tang, Guang-Y...",23
2,cond-mat/0504055,Haim Diamant,"B. Lin, M. Meron, B. Cui, S. A. Rice, H. Diamant",From random walk to single-file diffusion,"4 pages, 4 figures","Phys Rev Lett 94, 216001 (2005)",10.1103/PhysRevLett.94.216001,,cond-mat.soft cond-mat.mtrl-sci physics.chem-ph,,We report an experimental study of diffusion...,"[[{'version': 'v1', 'created': 'Sun, 3 Apr 200...",2007-05-23,"[[Lin, B., ], [Meron, M., ], [Cui, B., ], [Ric...",93
3,astro-ph/9907297,Tod E. Strohmayer,Tod E. Strohmayer,Spin Down of Pulsations in the Cooling Tail of...,"16 pages, AASTEX preprint with 7 embedded figu...",,10.1086/312258,,astro-ph,,We report the discovery with the proportiona...,"[[{'version': 'v1', 'created': 'Wed, 21 Jul 19...",2009-10-31,"[[Strohmayer, Tod E., ]]",24
4,1104.5407,Lie-Wen Chen,"Lie-Wen Chen, Jian-Zhong Gu",Correlations between the nuclear breathing mod...,"9 pages, 6 figures. Discussions and references...","J.Phys.G39:035104,2012",10.1088/0954-3899/39/3/035104,,nucl-th astro-ph.SR nucl-ex,http://arxiv.org/licenses/nonexclusive-distrib...,Based on microscopic Hartree-Fock + random p...,"[[{'version': 'v1', 'created': 'Thu, 28 Apr 20...",2012-03-27,"[[Chen, Lie-Wen, ], [Gu, Jian-Zhong, ]]",12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59079,1210.4112,Hamish Gordon,"LHCb collaboration: R. Aaij, C. Abellan Beteta...",Measurement of the D+/- production asymmetry i...,"12 pages, 5 figures",Phys. Lett. B. 718 (2013) 902-907,10.1016/j.physletb.2012.11.038,LHCb-PAPER-2012-026; CERN-PH-EP-2012-305,hep-ex,http://creativecommons.org/licenses/by/3.0/,The asymmetry in the production cross-sectio...,"[[{'version': 'v1', 'created': 'Mon, 15 Oct 20...",2012-12-20,"[[LHCb collaboration, , ], [Aaij, R., ], [Bete...",22
59080,1701.03465,In Sung Jang,"Myung Gyoon Lee, In Sung Jang, Rachael Beaton,...",The Carnegie-Chicago Hubble Program: Discovery...,"6 pages, 4 figures, 1 table, accepted for publ...",,10.3847/2041-8213/835/2/L27,,astro-ph.GA,http://arxiv.org/licenses/nonexclusive-distrib...,Ultra-faint dwarf galaxies (UFDs) are the fa...,"[[{'version': 'v1', 'created': 'Thu, 12 Jan 20...",2017-02-08,"[[Lee, Myung Gyoon, ], [Jang, In Sung, ], [Bea...",5
59081,1709.10428,Vincent Beaud,Vincent Beaud and Simone Warzel,Bounds on the entanglement entropy of droplet ...,15 pages,,10.1063/1.5007035,,math-ph math.MP,http://arxiv.org/licenses/nonexclusive-distrib...,We consider a class of one-dimensional quant...,"[[{'version': 'v1', 'created': 'Fri, 29 Sep 20...",2018-02-14,"[[Beaud, Vincent, ], [Warzel, Simone, ]]",6
59082,gr-qc/9803020,Kengo Maeda,"Kengo Maeda, Akihiro Ishibashi, and Makoto Narita",Chronology Protection and Non-Naked Singularity,17 pages including 3 eps figures. Accepted for...,"Class.Quant.Grav.15:1637-1651,1998",10.1088/0264-9381/15/6/018,TIT/HEP-377/COSMO-87,gr-qc,,We test the chronology protection conjecture...,"[[{'version': 'v1', 'created': 'Thu, 5 Mar 199...",2011-07-19,"[[Maeda, Kengo, ], [Ishibashi, Akihiro, ], [Na...",5


- memo
> 各所にNone文字列があり、nanと同義  
> idは統一的でない -> indexを新たにidとしなおしても良い  
> authorsはlist化されていない  
> commentsから数値を抜き出せる  
> journal-ref(掲載誌)から出版年等を割り出せる  
> doiは別IDのようなので、必要がない?  
> report-noは"大学のプレプリントサーバー等による識別子"とある、サーバーによって被引用数に違いがある?  
> categoriesはサーバーのシステム上のカテゴリ  
> licenseによって被引用数に違いがある？  
> abstが一番のキモ?  
> update_dateは更新日時  
> authors_parsedには著者名がlistのlistになっている  
> doi_citesは低精度被引用数、citesがない場合は考慮しなければならない