# Getting back to 100,000
We'd like our sample to be as close as possible to 100,000 records. Because the first few processing steps removed a considerable number of records, we'll fetch more using the same method as in **01_sample_generating**, deliberately overshooting (to 105,000 unique DOIs) to leave headroom. We will then run the new records through the same processes as above and randomly drop records to bring the sample to an even 100,000.

In [1]:
import pandas as pd
from pathlib import Path
import time
from habanero import Crossref, WorksContainer
# Project data folders (relative paths under ../data)
data_dir = Path('../data')
input_dir, output_dir = data_dir / 'input', data_dir / 'output'

# Load the phase-2 sample. The CSV carries its saved row labels in the
# 'Unnamed: 0' column, so use that column as the index and name it 'Index'.
sample_path = input_dir / 'phase2_sample.csv'
df = pd.read_csv(sample_path)
df = df.set_index('Unnamed: 0')
df = df.rename_axis('Index')

In [2]:
# Top up the sample with random journal-article records from Crossref until
# the frame holds enough distinct DOIs to survive the later cleaning steps.
cr = Crossref(mailto='dennis.donathanii@protonmail.com')

TARGET_UNIQUE_DOIS = 105000  # overshoot 100,000 so later processing can trim down
BATCH_SIZE = 100             # records requested per call through `sample`
PAUSE_SECONDS = 5            # pause between consecutive API requests

# Track distinct DOIs incrementally instead of rebuilding the set from the
# whole frame on every pass, and collect the fetched batches in a list so the
# (large) frame is concatenated once rather than re-copied per iteration.
seen_dois = set(df['DOI'])
new_batches = []
while len(seen_dois) < TARGET_UNIQUE_DOIS:
    search = cr.works(filter={'type': 'journal-article'}, sample=BATCH_SIZE)
    batch = pd.DataFrame(WorksContainer(search).works)
    # An empty response yields a frame without a 'DOI' column; it adds nothing.
    if 'DOI' in batch.columns:
        seen_dois.update(batch['DOI'])
        new_batches.append(batch)
    time.sleep(PAUSE_SECONDS)

# Only rebuild the frame (and reset its index) when something was fetched,
# matching the original behaviour when the target was already met.
if new_batches:
    df = pd.concat([df, *new_batches], ignore_index=True)
df

Unnamed: 0,indexed,reference-count,publisher,issue,license,content-domain,short-container-title,published-print,DOI,type,created,page,source,is-referenced-by-count,title,prefix,volume,author,member,reference,container-title,language,link,deposited,score,resource,issued,references-count,journal-issue,URL,ISSN,issn-type,subject,published,alternative-id,published-online,archive,update-policy,assertion,funder,article-number,accepted,abstract,original-title,subtitle,published-other,editor,update-to,relation
0,"{'date-parts': [[2022, 10, 7]], 'date-time': '...",14,Wiley,1,"[{'start': {'date-parts': [[2015, 9, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",['Syst. Dyn. Rev.'],{'date-parts': [[2000]]},10.1002/(sici)1099-1727(200021)16:1<27::aid-sd...,journal-article,"{'date-parts': [[2002, 9, 10]], 'date-time': '...",27-41,Crossref,57,['The validation of commercial system dynamics...,10.1002,16,"[{'given': 'Geoff', 'family': 'Coyle', 'sequen...",311.0,[{'key': '10.1002/(SICI)1099-1727(200021)16:1<...,['System Dynamics Review'],en,[{'URL': 'https://api.wiley.com/onlinelibrary/...,"{'date-parts': [[2021, 7, 1]], 'date-time': '2...",0.0,{'primary': {'URL': 'https://onlinelibrary.wil...,{'date-parts': [[2000]]},14,"{'issue': '1', 'published-print': {'date-parts...",http://dx.doi.org/10.1002/(sici)1099-1727(2000...,"['0883-7066', '1099-1727']","[{'value': '0883-7066', 'type': 'print'}, {'va...","['Management of Technology and Innovation', 'S...",{'date-parts': [[2000]]},,,,,,,,,,,,,,,
1,"{'date-parts': [[2022, 3, 29]], 'date-time': '...",12,Springer Science and Business Media LLC,1,"[{'start': {'date-parts': [[1979, 3, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",['MTB'],"{'date-parts': [[1979, 3]]}",10.1007/bf02653972,journal-article,"{'date-parts': [[2007, 7, 17]], 'date-time': '...",57-62,Crossref,20,['Effect of system geometry on the leaching be...,10.1007,10,"[{'given': 'C.', 'family': 'Vu', 'sequence': '...",297.0,"[{'key': 'BF02653972_CR1', 'volume-title': 'Ph...",['Metallurgical Transactions B'],en,[{'URL': 'http://link.springer.com/content/pdf...,"{'date-parts': [[2019, 5, 20]], 'date-time': '...",0.0,{'primary': {'URL': 'http://link.springer.com/...,"{'date-parts': [[1979, 3]]}",12,"{'issue': '1', 'published-print': {'date-parts...",http://dx.doi.org/10.1007/bf02653972,"['0360-2141', '1543-1916']","[{'value': '0360-2141', 'type': 'print'}, {'va...","['Materials Chemistry', 'Metals and Alloys', '...","{'date-parts': [[1979, 3]]}",['BF02653972'],,,,,,,,,,,,,,
2,"{'date-parts': [[2022, 3, 30]], 'date-time': '...",0,Wiley,3,"[{'start': {'date-parts': [[2017, 11, 1]], 'da...","{'domain': [], 'crossmark-restriction': False}",['RECIEL'],"{'date-parts': [[2017, 11]]}",10.1111/reel.12221,journal-article,"{'date-parts': [[2017, 12, 1]], 'date-time': '...",243-254,Crossref,2,['The international law on transboundary haze ...,10.1111,26,"[{'given': 'Shawkat', 'family': 'Alam', 'seque...",311.0,,"['Review of European, Comparative &amp; Intern...",en,[{'URL': 'https://api.wiley.com/onlinelibrary/...,"{'date-parts': [[2017, 12, 1]], 'date-time': '...",0.0,{'primary': {'URL': 'http://doi.wiley.com/10.1...,"{'date-parts': [[2017, 11]]}",0,"{'issue': '3', 'published-print': {'date-parts...",http://dx.doi.org/10.1111/reel.12221,['2050-0386'],"[{'value': '2050-0386', 'type': 'print'}]","['Law', 'Management, Monitoring, Policy and La...","{'date-parts': [[2017, 11]]}",,"{'date-parts': [[2017, 11, 28]]}",['Portico'],,,,,,,,,,,,
3,"{'date-parts': [[2022, 4, 3]], 'date-time': '2...",0,Crop Science Society of Japan,1-2,,"{'domain': [], 'crossmark-restriction': False}","['Japanese journal of crop science', 'Jpn. J. ...",{'date-parts': [[1951]]},10.1626/jcs.20.219,journal-article,"{'date-parts': [[2011, 9, 20]], 'date-time': '...",219-222,Crossref,0,['Studies on the influence of pruning on the v...,10.1626,20,"[{'given': 'C.', 'family': 'TSUDA', 'sequence'...",632.0,,['Japanese Journal of Crop Science'],en,[{'URL': 'http://www.jstage.jst.go.jp/article/...,"{'date-parts': [[2021, 4, 30]], 'date-time': '...",0.0,{'primary': {'URL': 'http://www.jstage.jst.go....,{'date-parts': [[1951]]},0,"{'issue': '1-2', 'published-print': {'date-par...",http://dx.doi.org/10.1626/jcs.20.219,"['0011-1848', '1349-0990']","[{'value': '0011-1848', 'type': 'print'}, {'va...","['Genetics', 'Agronomy and Crop Science', 'Foo...",{'date-parts': [[1951]]},,,,,,,,,,,,,,,
4,"{'date-parts': [[2022, 3, 31]], 'date-time': '...",60,Elsevier BV,6,"[{'start': {'date-parts': [[2018, 12, 1]], 'da...","{'domain': ['clinicalkey.fr', 'elsevier.com', ...",['Revue de Pneumologie Clinique'],"{'date-parts': [[2018, 12]]}",10.1016/j.pneumo.2018.09.002,journal-article,"{'date-parts': [[2018, 10, 10]], 'date-time': ...",391-399,Crossref,0,['Le tabagisme et l’aide à l’arrêt du tabac de...,10.1016,74,"[{'given': 'J.', 'family': 'Perriot', 'sequenc...",78.0,[{'key': '10.1016/j.pneumo.2018.09.002_bib0305...,['Revue de Pneumologie Clinique'],fr,[{'URL': 'https://api.elsevier.com/content/art...,"{'date-parts': [[2019, 10, 26]], 'date-time': ...",0.0,{'primary': {'URL': 'https://linkinghub.elsevi...,"{'date-parts': [[2018, 12]]}",60,"{'issue': '6', 'published-print': {'date-parts...",http://dx.doi.org/10.1016/j.pneumo.2018.09.002,['0761-8417'],"[{'value': '0761-8417', 'type': 'print'}]",['Pulmonary and Respiratory Medicine'],"{'date-parts': [[2018, 12]]}",['S0761841718301792'],,,http://dx.doi.org/10.1016/elsevier_cm_policy,"[{'value': 'Elsevier', 'name': 'publisher', 'l...",,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105102,"{'date-parts': [[2022, 4, 6]], 'date-time': '2...",0,Springer Science and Business Media LLC,1,"[{'start': {'date-parts': [[1930, 12, 1]], 'da...","{'domain': [], 'crossmark-restriction': False}",[Neophilologus],"{'date-parts': [[1930, 12]]}",10.1007/bf01510212,journal-article,"{'date-parts': [[2005, 4, 18]], 'date-time': '...",248-249,Crossref,0,"[Filocolo, Filocopo, Filopono]",10.1007,15,"[{'given': 'G. A.', 'family': 'Nauta', 'sequen...",297,,[Neophilologus],en,[{'URL': 'http://link.springer.com/article/10....,"{'date-parts': [[2019, 5, 3]], 'date-time': '2...",0.0,{'primary': {'URL': 'http://link.springer.com/...,"{'date-parts': [[1930, 12]]}",0,"{'issue': '1', 'published-print': {'date-parts...",http://dx.doi.org/10.1007/bf01510212,"[0028-2677, 1572-8668]","[{'value': '0028-2677', 'type': 'print'}, {'va...","[Literature and Literary Theory, Linguistics a...","{'date-parts': [[1930, 12]]}",[BF01510212],,,,,,,,,,,,,,
105103,"{'date-parts': [[2022, 4, 2]], 'date-time': '2...",0,Elsevier BV,3,"[{'start': {'date-parts': [[2020, 2, 1]], 'dat...","{'domain': [], 'crossmark-restriction': False}",[Biophysical Journal],"{'date-parts': [[2020, 2]]}",10.1016/j.bpj.2019.11.2324,journal-article,"{'date-parts': [[2020, 2, 7]], 'date-time': '2...",411a,Crossref,0,[Contributions of the Transmembrane Domain to ...,10.1016,118,"[{'given': 'Aerial M.', 'family': 'Pratt', 'se...",78,,[Biophysical Journal],en,[{'URL': 'https://api.elsevier.com/content/art...,"{'date-parts': [[2021, 2, 7]], 'date-time': '2...",0.0,{'primary': {'URL': 'https://linkinghub.elsevi...,"{'date-parts': [[2020, 2]]}",0,"{'issue': '3', 'published-print': {'date-parts...",http://dx.doi.org/10.1016/j.bpj.2019.11.2324,[0006-3495],"[{'value': '0006-3495', 'type': 'print'}]",[Biophysics],"{'date-parts': [[2020, 2]]}",[S0006349519332576],,,,,,,,,,,,,,
105104,"{'date-parts': [[2022, 4, 4]], 'date-time': '2...",7,Elsevier BV,,"[{'start': {'date-parts': [[2016, 1, 1]], 'dat...","{'domain': ['elsevier.com', 'sciencedirect.com...",[Procedia Structural Integrity],{'date-parts': [[2016]]},10.1016/j.prostr.2016.06.306,journal-article,"{'date-parts': [[2016, 7, 22]], 'date-time': '...",2447-2455,Crossref,2,[A Probabilistic Fatigue Assessment Diagram To...,10.1016,2,"[{'given': 'S.', 'family': 'Jallouf', 'sequenc...",78,[{'key': '10.1016/j.prostr.2016.06.306_bib0001...,[Procedia Structural Integrity],en,[{'URL': 'https://api.elsevier.com/content/art...,"{'date-parts': [[2018, 9, 10]], 'date-time': '...",0.0,{'primary': {'URL': 'https://linkinghub.elsevi...,{'date-parts': [[2016]]},7,,http://dx.doi.org/10.1016/j.prostr.2016.06.306,[2452-3216],"[{'value': '2452-3216', 'type': 'print'}]",[General Medicine],{'date-parts': [[2016]]},[S2452321616303171],,,http://dx.doi.org/10.1016/elsevier_cm_policy,"[{'value': 'Elsevier', 'name': 'publisher', 'l...",,,,,,,,,,
105105,"{'date-parts': [[2022, 4, 7]], 'date-time': '2...",10,American Physical Society (APS),7,"[{'start': {'date-parts': [[1991, 10, 1]], 'da...","{'domain': [], 'crossmark-restriction': False}",[Phys. Rev. A],,10.1103/physreva.44.4757,journal-article,"{'date-parts': [[2002, 7, 27]], 'date-time': '...",4757-4760,Crossref,1,[Effect of squeezed light on the photon-number...,10.1103,44,"[{'given': 'Lu-Bi', 'family': 'Deng', 'sequenc...",16,"[{'key': 'PhysRevA.44.4757Cc1R1', 'doi-asserte...",[Physical Review A],en,[{'URL': 'http://link.aps.org/article/10.1103/...,"{'date-parts': [[2017, 6, 15]], 'date-time': '...",0.0,{'primary': {'URL': 'https://link.aps.org/doi/...,"{'date-parts': [[1991, 10, 1]]}",10,"{'issue': '7', 'published-print': {'date-parts...",http://dx.doi.org/10.1103/physreva.44.4757,"[1050-2947, 1094-1622]","[{'value': '1050-2947', 'type': 'print'}, {'va...","[Atomic and Molecular Physics, and Optics]","{'date-parts': [[1991, 10, 1]]}",,"{'date-parts': [[1991, 10, 1]]}",,,,,,,,,,,,,


In [3]:
# Write the enlarged sample (index included) to the input folder as CSV.
raw_data_path = input_dir / '01_raw_data.csv'
df.to_csv(raw_data_path)