# Getting back to 100,000
We'd like our sample to be as close as possible to 100,000 records. Because we've lost a considerable number of records in the first few processing steps, we'll fetch more using the same method as in **01_sample_generating**. We will then run the records through the same processes as above and randomly select records to drop until the sample is an even 100,000.

In [1]:
import pandas as pd
from pathlib import Path
import time
from habanero import Crossref, WorksContainer
# Directory layout: input/ and output/ both sit under ../data.
data_dir = Path('../data')
input_dir, output_dir = data_dir / 'input', data_dir / 'output'

# Load the phase-2 sample, promote the 'Unnamed: 0' column to the index,
# and label that index 'Index'.
df = (
    pd.read_csv(input_dir / 'phase2_sample.csv')
    .set_index('Unnamed: 0')
    .rename_axis('Index')
)

In [2]:
# Keep sampling until the frame holds at least this many distinct DOIs. The
# surplus over 100,000 is presumably headroom for the records that the later
# processing steps will drop -- confirm the margin is still adequate.
TARGET_UNIQUE_DOIS = 106000
SAMPLE_SIZE = 100     # records requested per Crossref call
PAUSE_SECONDS = 5     # pause between calls so we do not hammer the API

cr = Crossref(mailto='dennis.donathanii@protonmail.com')

# Track distinct DOIs incrementally instead of rebuilding the set from the
# whole frame on every pass, and collect the fetched batches in a list so the
# large frame is concatenated once rather than re-copied on each iteration.
seen_dois = set(df['DOI'])
new_batches = []
try:
    while len(seen_dois) < TARGET_UNIQUE_DOIS:
        search = cr.works(filter={'type': 'journal-article'}, sample=SAMPLE_SIZE)
        batch = pd.DataFrame(WorksContainer(search).works)
        new_batches.append(batch)
        # An empty response yields a frame without a 'DOI' column; skip it
        # rather than raising, and simply try again on the next pass.
        if 'DOI' in batch.columns:
            seen_dois.update(batch['DOI'])
        time.sleep(PAUSE_SECONDS)
finally:
    # Runs even if a request fails part-way, so batches fetched so far are
    # still merged into df instead of being lost.
    if new_batches:
        df = pd.concat([df, *new_batches], ignore_index=True)
df

Unnamed: 0,indexed,reference-count,publisher,issue,license,content-domain,short-container-title,published-print,DOI,type,created,page,source,is-referenced-by-count,title,prefix,volume,author,member,reference,container-title,language,link,deposited,score,resource,issued,references-count,journal-issue,URL,ISSN,issn-type,subject,published,alternative-id,published-online,archive,update-policy,assertion,funder,article-number,accepted,abstract,original-title,subtitle,published-other,editor,relation,update-to,translator,clinical-trial-number
0,"{'date-parts': [[2022, 10, 7]], 'date-time': '...",14,Wiley,1,"[{'start': {'date-parts': [[2015, 9, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",['Syst. Dyn. Rev.'],{'date-parts': [[2000]]},10.1002/(sici)1099-1727(200021)16:1<27::aid-sd...,journal-article,"{'date-parts': [[2002, 9, 10]], 'date-time': '...",27-41,Crossref,57,['The validation of commercial system dynamics...,10.1002,16,"[{'given': 'Geoff', 'family': 'Coyle', 'sequen...",311.0,[{'key': '10.1002/(SICI)1099-1727(200021)16:1<...,['System Dynamics Review'],en,[{'URL': 'https://api.wiley.com/onlinelibrary/...,"{'date-parts': [[2021, 7, 1]], 'date-time': '2...",0.0,{'primary': {'URL': 'https://onlinelibrary.wil...,{'date-parts': [[2000]]},14,"{'issue': '1', 'published-print': {'date-parts...",http://dx.doi.org/10.1002/(sici)1099-1727(2000...,"['0883-7066', '1099-1727']","[{'value': '0883-7066', 'type': 'print'}, {'va...","['Management of Technology and Innovation', 'S...",{'date-parts': [[2000]]},,,,,,,,,,,,,,,,,
1,"{'date-parts': [[2022, 3, 29]], 'date-time': '...",12,Springer Science and Business Media LLC,1,"[{'start': {'date-parts': [[1979, 3, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",['MTB'],"{'date-parts': [[1979, 3]]}",10.1007/bf02653972,journal-article,"{'date-parts': [[2007, 7, 17]], 'date-time': '...",57-62,Crossref,20,['Effect of system geometry on the leaching be...,10.1007,10,"[{'given': 'C.', 'family': 'Vu', 'sequence': '...",297.0,"[{'key': 'BF02653972_CR1', 'volume-title': 'Ph...",['Metallurgical Transactions B'],en,[{'URL': 'http://link.springer.com/content/pdf...,"{'date-parts': [[2019, 5, 20]], 'date-time': '...",0.0,{'primary': {'URL': 'http://link.springer.com/...,"{'date-parts': [[1979, 3]]}",12,"{'issue': '1', 'published-print': {'date-parts...",http://dx.doi.org/10.1007/bf02653972,"['0360-2141', '1543-1916']","[{'value': '0360-2141', 'type': 'print'}, {'va...","['Materials Chemistry', 'Metals and Alloys', '...","{'date-parts': [[1979, 3]]}",['BF02653972'],,,,,,,,,,,,,,,,
2,"{'date-parts': [[2022, 3, 30]], 'date-time': '...",0,Wiley,3,"[{'start': {'date-parts': [[2017, 11, 1]], 'da...","{'domain': [], 'crossmark-restriction': False}",['RECIEL'],"{'date-parts': [[2017, 11]]}",10.1111/reel.12221,journal-article,"{'date-parts': [[2017, 12, 1]], 'date-time': '...",243-254,Crossref,2,['The international law on transboundary haze ...,10.1111,26,"[{'given': 'Shawkat', 'family': 'Alam', 'seque...",311.0,,"['Review of European, Comparative &amp; Intern...",en,[{'URL': 'https://api.wiley.com/onlinelibrary/...,"{'date-parts': [[2017, 12, 1]], 'date-time': '...",0.0,{'primary': {'URL': 'http://doi.wiley.com/10.1...,"{'date-parts': [[2017, 11]]}",0,"{'issue': '3', 'published-print': {'date-parts...",http://dx.doi.org/10.1111/reel.12221,['2050-0386'],"[{'value': '2050-0386', 'type': 'print'}]","['Law', 'Management, Monitoring, Policy and La...","{'date-parts': [[2017, 11]]}",,"{'date-parts': [[2017, 11, 28]]}",['Portico'],,,,,,,,,,,,,,
3,"{'date-parts': [[2022, 4, 3]], 'date-time': '2...",0,Crop Science Society of Japan,1-2,,"{'domain': [], 'crossmark-restriction': False}","['Japanese journal of crop science', 'Jpn. J. ...",{'date-parts': [[1951]]},10.1626/jcs.20.219,journal-article,"{'date-parts': [[2011, 9, 20]], 'date-time': '...",219-222,Crossref,0,['Studies on the influence of pruning on the v...,10.1626,20,"[{'given': 'C.', 'family': 'TSUDA', 'sequence'...",632.0,,['Japanese Journal of Crop Science'],en,[{'URL': 'http://www.jstage.jst.go.jp/article/...,"{'date-parts': [[2021, 4, 30]], 'date-time': '...",0.0,{'primary': {'URL': 'http://www.jstage.jst.go....,{'date-parts': [[1951]]},0,"{'issue': '1-2', 'published-print': {'date-par...",http://dx.doi.org/10.1626/jcs.20.219,"['0011-1848', '1349-0990']","[{'value': '0011-1848', 'type': 'print'}, {'va...","['Genetics', 'Agronomy and Crop Science', 'Foo...",{'date-parts': [[1951]]},,,,,,,,,,,,,,,,,
4,"{'date-parts': [[2022, 3, 31]], 'date-time': '...",60,Elsevier BV,6,"[{'start': {'date-parts': [[2018, 12, 1]], 'da...","{'domain': ['clinicalkey.fr', 'elsevier.com', ...",['Revue de Pneumologie Clinique'],"{'date-parts': [[2018, 12]]}",10.1016/j.pneumo.2018.09.002,journal-article,"{'date-parts': [[2018, 10, 10]], 'date-time': ...",391-399,Crossref,0,['Le tabagisme et l’aide à l’arrêt du tabac de...,10.1016,74,"[{'given': 'J.', 'family': 'Perriot', 'sequenc...",78.0,[{'key': '10.1016/j.pneumo.2018.09.002_bib0305...,['Revue de Pneumologie Clinique'],fr,[{'URL': 'https://api.elsevier.com/content/art...,"{'date-parts': [[2019, 10, 26]], 'date-time': ...",0.0,{'primary': {'URL': 'https://linkinghub.elsevi...,"{'date-parts': [[2018, 12]]}",60,"{'issue': '6', 'published-print': {'date-parts...",http://dx.doi.org/10.1016/j.pneumo.2018.09.002,['0761-8417'],"[{'value': '0761-8417', 'type': 'print'}]",['Pulmonary and Respiratory Medicine'],"{'date-parts': [[2018, 12]]}",['S0761841718301792'],,,http://dx.doi.org/10.1016/elsevier_cm_policy,"[{'value': 'Elsevier', 'name': 'publisher', 'l...",,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106102,"{'date-parts': [[2022, 3, 30]], 'date-time': '...",2,Elsevier BV,2,"[{'start': {'date-parts': [[1988, 4, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",[British Journal of Oral and Maxillofacial Sur...,"{'date-parts': [[1988, 4]]}",10.1016/0266-4356(88)90016-2,journal-article,"{'date-parts': [[2004, 4, 29]], 'date-time': '...",171-172,Crossref,1,[A champy plate template],10.1016,26,"[{'given': 'M.T.', 'family': 'Simpson', 'seque...",78,"[{'key': '10.1016/0266-4356(88)90016-2_BIB1', ...",[British Journal of Oral and Maxillofacial Sur...,en,[{'URL': 'https://api.elsevier.com/content/art...,"{'date-parts': [[2019, 2, 9]], 'date-time': '2...",0.0,{'primary': {'URL': 'https://linkinghub.elsevi...,"{'date-parts': [[1988, 4]]}",2,"{'issue': '2', 'published-print': {'date-parts...",http://dx.doi.org/10.1016/0266-4356(88)90016-2,[0266-4356],"[{'value': '0266-4356', 'type': 'print'}]","[Otorhinolaryngology, Oral Surgery, Surgery]","{'date-parts': [[1988, 4]]}",[0266435688900162],,,,,,,,,,,,,,,,
106103,"{'date-parts': [[2023, 1, 10]], 'date-time': '...",41,"Impact Journals, LLC",3,,"{'domain': [], 'crossmark-restriction': False}",[Oncotarget],"{'date-parts': [[2018, 1, 9]]}",10.18632/oncotarget.23280,journal-article,"{'date-parts': [[2017, 12, 15]], 'date-time': ...",3946-3955,Crossref,27,[Validation of a hypoxia related gene signatur...,10.18632,9,"[{'given': 'Lingjian', 'family': 'Yang', 'sequ...",7892,"[{'key': '1', 'doi-asserted-by': 'crossref', '...",[Oncotarget],en,[{'URL': 'https://www.oncotarget.com/lookup/do...,"{'date-parts': [[2020, 7, 15]], 'date-time': '...",0.0,{'primary': {'URL': 'https://www.oncotarget.co...,"{'date-parts': [[2017, 12, 12]]}",41,"{'issue': '3', 'published-print': {'date-parts...",http://dx.doi.org/10.18632/oncotarget.23280,[1949-2553],"[{'value': '1949-2553', 'type': 'electronic'}]",[Oncology],"{'date-parts': [[2017, 12, 12]]}","[23280, 29423096]","{'date-parts': [[2017, 12, 12]]}",,,,,,,,,,,,,,,
106104,"{'date-parts': [[2022, 10, 16]], 'date-time': ...",33,AIP Publishing,9,,"{'domain': ['aip.scitation.org'], 'crossmark-r...",,"{'date-parts': [[1996, 9]]}",10.1063/1.869021,journal-article,"{'date-parts': [[2002, 7, 26]], 'date-time': '...",2365-2374,Crossref,77,[An experimental study of deep water plunging ...,10.1063,8,"[{'given': 'Marc', 'family': 'Perlin', 'sequen...",317,"[{'key': '10.1063/1.869021_r1', 'doi-asserted-...",[Physics of Fluids],en,[{'URL': 'http://aip.scitation.org/doi/10.1063...,"{'date-parts': [[2017, 11, 20]], 'date-time': ...",0.0,{'primary': {'URL': 'http://aip.scitation.org/...,"{'date-parts': [[1996, 9]]}",33,"{'issue': '9', 'published-print': {'date-parts...",http://dx.doi.org/10.1063/1.869021,"[1070-6631, 1089-7666]","[{'value': '1070-6631', 'type': 'print'}, {'va...","[Condensed Matter Physics, Fluid Flow and Tran...","{'date-parts': [[1996, 9]]}",,,,http://dx.doi.org/10.1063/aip-crossmark-policy...,,,,,,,,,,,,,
106105,"{'date-parts': [[2022, 4, 5]], 'date-time': '2...",5,Oxford University Press (OUP),8,"[{'start': {'date-parts': [[1999, 8, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",,"{'date-parts': [[2002, 12, 12]]}",10.1002/bjs.1155,journal-article,"{'date-parts': [[2006, 6, 16]], 'date-time': '...",1099-1100,Crossref,0,[Authors' reply],10.1093,86,"[{'given': 'R E K', 'family': 'Marshall', 'seq...",286,"[{'key': '2021070922214553300_bib1', 'doi-asse...",[British Journal of Surgery],en,[{'URL': 'https://api.wiley.com/onlinelibrary/...,"{'date-parts': [[2021, 7, 10]], 'date-time': '...",0.0,{'primary': {'URL': 'https://academic.oup.com/...,"{'date-parts': [[1999, 8]]}",5,"{'issue': '8', 'published-online': {'date-part...",http://dx.doi.org/10.1002/bjs.1155,"[0007-1323, 1365-2168]","[{'value': '0007-1323', 'type': 'print'}, {'va...",[Surgery],"{'date-parts': [[1999, 8]]}",,"{'date-parts': [[2002, 12, 12]]}",,,,,,,,,,"{'date-parts': [[1999, 8]]}",,,,,


In [3]:
# Write the topped-up sample to disk as CSV (index included).
# NOTE(review): this lands in input_dir as '01_raw_data.csv' and will replace
# any existing file of that name -- confirm that is intended.
df.to_csv(input_dir.joinpath('01_raw_data.csv'))