# Homework №1 - Data collecting and cleansing

## Data preparing

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import requests
import time
from typing import List
from tqdm import tqdm_notebook

In [2]:
data = pd.read_csv('./data_2.csv', sep=',')

In [3]:
data

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/j.optlaseng.2008.09.005,1/14/2009,Optics and Lasers in Engineering,AHIGHPRECISIONMEASUREMENTTECHNIQUEFOREVALUATIN...,Alcohol,0.00000,,el_mylogic,CCO,1.34749,Refractive index (n)
1,10.1016/j.jiec.2018.12.038,12/29/2018,Journal of Industrial and Engineering Chemistry,IMPROVINGUNDERSTANDINGSOLVENTEFFECTSINTERMOLEC...,aceticacid,0.00000,,el_mylogic,CC(O)=O,1.7,n
2,10.1016/j.jlumin.2015.02.010,2/14/2015,Journal of Luminescence,SPECTROSCOPICPROPERTIESTELLURITEGLASSESCODOPED...,Phosphate,0.00000,600 nm,el_mylogic,[O-][P]([O-])([O-])=O,1.54,Refractive index (~600 nm)
3,10.1039/B717069F_© The Royal Society of Chemis...,,,,BK7,0.00000,,snowball,,1.518,refractive index
4,10.1529/biophysj.106.094946,10/23/2008,Biophysical Journal,QUANTITATIVECHARACTERIZATIONBIOLOGICALLIQUIDSF...,ethanol,0.00126,,el_cde_text,CCO,1.35434 ± 0.00126,refractive index
...,...,...,...,...,...,...,...,...,...,...,...
4995,10.1016/j.sse.2007.06.004,7/30/2007,Solid-State Electronics,RADIATIVELINEWIDTHASINGLEIMPURITYMOLECULEINABI...,Biphenyl,0.00000,,el_cde_tables,c1ccc(cc1)c2ccccc2,1.68,Refractive indices
4996,10.1039/B822982A,7/21/2009,Lab on a Chip,Tunable Liquid Gradient Refractive Index (L-GR...,CaCl2,0.00000,,rsc_cde_text,"[['Ca', 1.0], ['Cl', 2.0]]",∼1.41,nD
4997,10.1078/0030-4026-00175,11/5/2004,Optik,OPTIMIZATIONCONDUCTINGPOLYMERTHINFILMOPTICALEL...,He-Ne,0.00000,,el_cde_text,"[['He-', 1.0], ['Ne', 1.0]]",1.717,refractive index
4998,10.1016/j.optmat.2018.06.013,6/18/2018,Optical Materials,ENHANCEDPHOTOELECTRICPERFORMANCENANORODTIO2FIL...,ITO,0.00000,,el_cde_text,"[['IT', 1.0], ['O', 1.0]]",1.92,refractive index


Let's check how many gaps we have in the lines, try to fill them

In [4]:
columns = list(data.columns)

In [None]:
columns

In [5]:
print("Missing values distribution by column: ")
print(data.isnull().mean())
print("")

Missing values distribution by column: 
DOI                       0.0000
Date                      0.0842
Journal                   0.0842
Title                     0.0842
Name                      0.0010
measurement_error         0.0000
measurement_wavelength    0.8858
measurement_method        0.0000
normalised_name           0.4116
raw_value                 0.0000
specifier                 0.0000
dtype: float64



Now we can easily look at the distribution of missing values in the given dataset, for example: in the column 'measurement_wavelength' 88,58% of data is missing, in 'normalised_name' - 41,16%, which is sucks actually:( Good news: in columns 'DOI', 'measurement_error', 'measurement_method', 'raw_value' and 'specifier' there are no missing values at all! Cool!

In [6]:
print("Column datatypes: ")
print(data.dtypes)

Column datatypes: 
DOI                        object
Date                       object
Journal                    object
Title                      object
Name                       object
measurement_error         float64
measurement_wavelength     object
measurement_method         object
normalised_name            object
raw_value                  object
specifier                  object
dtype: object


Now let's see at the type of data we have. All the columns have object as their datatype aside from 'measurement_error'. In pandas, object means either string or mixed type (numerical and non-numerical type mixed).

Finally, let’s make sure we remove any trailing characters and whitespace using 'strip':

In [7]:
str_cols = list(data.columns)
str_cols.remove('measurement_error')

In [8]:
for i in str_cols:
    data[i] = data[i].str.strip()

In [9]:
data.head()

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/j.optlaseng.2008.09.005,1/14/2009,Optics and Lasers in Engineering,AHIGHPRECISIONMEASUREMENTTECHNIQUEFOREVALUATIN...,Alcohol,0.0,,el_mylogic,CCO,1.34749,Refractive index (n)
1,10.1016/j.jiec.2018.12.038,12/29/2018,Journal of Industrial and Engineering Chemistry,IMPROVINGUNDERSTANDINGSOLVENTEFFECTSINTERMOLEC...,aceticacid,0.0,,el_mylogic,CC(O)=O,1.7,n
2,10.1016/j.jlumin.2015.02.010,2/14/2015,Journal of Luminescence,SPECTROSCOPICPROPERTIESTELLURITEGLASSESCODOPED...,Phosphate,0.0,600 nm,el_mylogic,[O-][P]([O-])([O-])=O,1.54,Refractive index (~600 nm)
3,10.1039/B717069F_© The Royal Society of Chemis...,,,,BK7,0.0,,snowball,,1.518,refractive index
4,10.1529/biophysj.106.094946,10/23/2008,Biophysical Journal,QUANTITATIVECHARACTERIZATIONBIOLOGICALLIQUIDSF...,ethanol,0.00126,,el_cde_text,CCO,1.35434 ± 0.00126,refractive index


In [10]:
missing_by_row = data.isnull().sum(axis=1)
sorted_rows = data.loc[missing_by_row.sort_values(ascending=False).index]
print("Top 10 rows with the most missing values:")
print(sorted_rows.head(10))

Top 10 rows with the most missing values:
                                                    DOI Date Journal Title  \
3056  10.1016/S0963-9969(01)00105-3Food Research Int...  NaN     NaN   NaN   
2553                                 10.1039/C6AN00509H  NaN     NaN   NaN   
1481                                 10.1039/C7AN01576C  NaN     NaN   NaN   
2999                                 10.1039/C6TC02368A  NaN     NaN   NaN   
4034                                 10.1039/C0CP02270E  NaN     NaN   NaN   
2570       10.1016/j.renene.2018.02.018Renewable Energy  NaN     NaN   NaN   
1443                                 10.1039/C8SC04479A  NaN     NaN   NaN   
4052                                 10.1039/C2JM14369K  NaN     NaN   NaN   
4059  10.1016/S1044-5803(03)00075-5Materials Charact...  NaN     NaN   NaN   
4060                                   10.1039/B414064H  NaN     NaN   NaN   

                         Name  measurement_error measurement_wavelength  \
3056                 gly

Oops, it seems like in some cases in the column 'DOI' the Journal name sticks to the DOI, let's fix it

In [11]:
print(data.loc[3056, 'DOI'])

10.1016/S0963-9969(01)00105-3Food Research International


In [12]:
DOI_column = data['DOI'].tolist()

In [13]:
DOI_pattern = re.compile(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+(?=_)')

In [14]:
DOI_column[0:9]

['10.1016/j.optlaseng.2008.09.005',
 '10.1016/j.jiec.2018.12.038',
 '10.1016/j.jlumin.2015.02.010',
 '10.1039/B717069F_© The Royal Society of Chemistry 2008',
 '10.1529/biophysj.106.094946',
 '10.1039/B211264G',
 '10.1016/j.saa.2018.06.016',
 '10.1016/j.tsf.2018.01.058',
 '10.1016/S0921-5107(98)00209-8']

In [15]:
DOI_example = '10.1016/S0963-9969(01)00105-3Food Research International'
DOI_example

'10.1016/S0963-9969(01)00105-3Food Research International'

In [16]:
DOI_match_example = re.match(DOI_pattern, DOI_example)

In [17]:
DOI_match_example

In [21]:
def is_valid_doi(doi_str:str) -> bool:
    """
    Check if a DOI is valid and corresponds to an article on the internet.
    
    Args:
        doi_str (str): A string representing the DOI to be checked.
        
    Returns:
        bool: True if the DOI is valid and corresponds to an article with metadata available on the internet, False otherwise.
        
    Example Usage:
        >>> is_valid_doi('10.1016/j.jacc.2020.02.068')
        True
    """
    # Construct the API URL for the DOI
    url = f"https://api.crossref.org/works/{doi_str}"
    
    # Make an HTTP request to the API
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Extract the metadata from the response
        metadata = response.json()["message"]
        # Check if the metadata contains a title (i.e., the DOI is valid)
        if "title" in metadata:
            return True
    # If the request failed or the metadata does not contain a title, the DOI is invalid
    return False

In [22]:
def clean_DOI(initial_DOI_list: List[str]) -> List[str]:
    """
    Clean a list of DOIs by removing invalid DOIs and keeping only valid DOIs with metadata available on the internet.

    Args:
        initial_DOI_list (List[str]): A list of strings representing the DOIs to be cleaned.

    Returns:
        List[str]: A list of strings representing the cleaned DOIs. Each element of the returned list is either a valid DOI or the string 'invalid' if the DOI is not valid or does not have metadata available on the internet.

    Example Usage:
        >>> clean_DOI(['10.1016/j.jacc.2020.02.068', '10.3390/bs10010012', '10.1038/nature12373', '10.1162/REST_a_00136'])
        ['10.1016/j.jacc.2020.02.068', '10.3390/bs10010012', 'invalid', 'invalid']
    """
    DOI_pattern = re.compile(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+', flags=re.IGNORECASE)
    cleaned_DOI_list = []
    for DOI in tqdm(initial_DOI_list):
        DOI_match = re.findall(DOI_pattern, DOI)
        if DOI_match:
            DOI_cleaned = DOI_match[0].strip('_')
            if is_valid_doi(DOI_cleaned):
                cleaned_DOI_list.append(DOI_cleaned)
            else:
                cleaned_DOI_list.append('invalid')
        else:
            cleaned_DOI_list.append('invalid')
            print(DOI)
        # Sleep for 0.125 seconds
        time.sleep(0.125)
    return cleaned_DOI_list

In [23]:
DOI_new = clean_DOI(DOI_column)

100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [54:07<00:00,  1.54it/s]


In [24]:
print(len(DOI_column))
print(len(DOI_new))

5000
5000


In [25]:
DOI_new.count('invalid')

173

In [26]:
DOI_new[4990:]

['10.1016/j.optmat.2014.04.044',
 '10.1039/B413474E',
 '10.1016/j.molliq.2015.03.012',
 '10.1016/j.jphotochem.2017.08.050',
 '10.1016/j.molliq.2013.09.035',
 '10.1016/j.sse.2007.06.004',
 '10.1039/B822982A',
 '10.1078/0030-4026-00175',
 '10.1016/j.optmat.2018.06.013',
 '10.1016/S0042-207X(01)00296-2']

In [29]:
invalid_ids = [i for i, x in enumerate(DOI_new) if x == 'invalid']
print(invalid_ids)

[49, 71, 75, 86, 97, 108, 114, 131, 136, 221, 231, 389, 416, 485, 516, 632, 634, 637, 653, 657, 721, 731, 747, 812, 850, 904, 913, 921, 970, 973, 999, 1087, 1099, 1114, 1118, 1143, 1158, 1178, 1195, 1221, 1267, 1349, 1372, 1392, 1413, 1435, 1475, 1487, 1512, 1518, 1519, 1533, 1543, 1548, 1559, 1569, 1576, 1581, 1662, 1663, 1748, 1755, 1775, 1805, 1858, 1882, 1911, 1936, 1953, 1961, 1987, 2022, 2042, 2055, 2057, 2139, 2180, 2237, 2265, 2414, 2498, 2526, 2536, 2557, 2570, 2608, 2687, 2717, 2722, 2787, 2790, 2818, 2830, 2872, 2873, 2903, 3031, 3056, 3084, 3095, 3108, 3155, 3157, 3162, 3178, 3185, 3232, 3306, 3315, 3344, 3377, 3411, 3412, 3413, 3445, 3462, 3518, 3537, 3555, 3565, 3577, 3588, 3598, 3608, 3614, 3682, 3704, 3734, 3746, 3814, 3858, 3914, 3926, 3946, 3947, 3963, 4031, 4056, 4059, 4062, 4092, 4127, 4152, 4160, 4161, 4170, 4245, 4249, 4264, 4272, 4278, 4282, 4330, 4336, 4371, 4391, 4420, 4477, 4500, 4520, 4545, 4551, 4578, 4628, 4681, 4737, 4768, 4885, 4932, 4933, 4945, 4961, 497

In [35]:
invalid_elements = [DOI_column[i] for i in invalid_ids]
print(invalid_elements, invalid_ids)

['10.1016/j.jallcom.2017.03.270Journal of Alloys and Compounds', '10.1016/j.optcom.2015.04.046Optics Communications', '10.1016/j.tsf.2016.01.038Thin Solid Films', '10.1016/S0167-9317(02)01012-2Microelectronic Engineering', '10.1016/j.chroma.2015.07.062Journal of Chromatography A', '10.1016/j.talanta.2015.11.051Talanta', '10.1038/ncomms8', '10.1016/S0021-9673(00)00517-3Journal of Chromatography A', '10.1016/j.mee.2004.03.068Microelectronic Engineering', '10.1016/S0022-3093(02)00967-5Journal of Non-Crystalline Solids', '10.1016/j.ijleo.2018.04.126Optik', '10.1016/j.carbon.2018.01.009Carbon', '10.1016/j.optmat.2004.02.014Optical Materials', '10.1016/j.expthermflusci.2018.02.036Experimental Thermal and Fluid Science', '10.1016/j.bios.2016.05.082Biosensors and Bioelectronics', '10.1016/j.yofte.2018.07.003Optical Fiber Technology', '10.1016/S0168-9002(01)00872-5Nuclear Instruments and Methods in Physics Research Section A: Accelerators, Spectrometers, Detectors and Associated Equipment', '10

In [37]:
data.iloc[invalid_ids,:].to_csv("data_2_invalid.tsv", sep='\t')

In [38]:
pwd

'C:\\Users\\bocha\\Algorithms and Big Data in Chemistry and Materials'

In [44]:
DOI_no_invalid_elements = [re.findall(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+\d', invalid_element) for invalid_element in invalid_elements]

In [45]:
DOI_no_invalid_elements

[[],
 [],
 [],
 ['10.1016/S0167-9317(02)01012-2'],
 [],
 [],
 [],
 ['10.1016/S0021-9673(00)00517-3'],
 [],
 ['10.1016/S0022-3093(02)00967-5'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['10.1016/S0168-9002(01)00872-5'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['10.1016/S0927-0248(01)00058-7'],
 ['10.1016/S0009-2614(00)00534-0'],
 ['10.1016/S0040-6090(00)01205-0'],
 [],
 ['10.1163/15685520360685983'],
 ['10.1016/S0022-2313(01)00371-4'],
 [],
 [],
 [],
 ['10.1016/S0021-9797(03)00452-1'],
 ['10.1016/S0040-6090(00)00712-4'],
 [],
 ['10.1016/S0030-3992(02)00073-7'],
 [],
 ['10.1016/S1369-8001(00)00054-8'],
 [],
 ['10.1016/S1386-9477(02)01046-9'],
 [],
 [],
 ['10.1016/S0146-6410(02)00157-6'],
 ['10.1016/S0168-9002(01)01946-5'],
 ['10.1016/S0168-583X(01)01292-7'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['10.1016/S0032-5910(02)00212-7'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['10.1016/S0925-4005(03)00257-0'],
 [],
 ['10.1016/S0022-3093(02)00921-3'],
 [],
 ['10.1016/S00

In [58]:
status_DOI = [is_valid_doi(DOI_no_invalid_element[0]) for DOI_no_invalid_element in tqdm(DOI_no_invalid_elements) if DOI_no_invalid_element]




  0%|                                                                                          | 0/173 [00:00<?, ?it/s][A[A[A


  2%|█▉                                                                                | 4/173 [00:00<00:19,  8.81it/s][A[A[A


  5%|███▊                                                                              | 8/173 [00:00<00:18,  8.93it/s][A[A[A


  6%|████▋                                                                            | 10/173 [00:01<00:23,  6.94it/s][A[A[A


 10%|███████▉                                                                         | 17/173 [00:01<00:15, 10.34it/s][A[A[A


 14%|███████████▋                                                                     | 25/173 [00:02<00:11, 12.84it/s][A[A[A


 16%|████████████▋                                                                    | 27/173 [00:03<00:19,  7.39it/s][A[A[A


 17%|█████████████▌                                                             

In [59]:
status_DOI

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True]

In [60]:
DOI_pattern_2 = re.compile(r'^10\.\d{4,9}\/[-._;()\/:A-Z0-9]+\d', flags=re.IGNORECASE)

In [61]:
#DOI_new_backup = DOI_new.copy()

In [62]:
with open('DOI_new_backup.txt', 'w') as DOI_file:
    for DOI in DOI_new_backup:
        DOI_file.write(DOI+'\n')

In [75]:
yet_invalid_DOI = {}
for id_x,DOI in tqdm(enumerate(DOI_new)):
    if DOI == 'invalid':
        DOI_to_cor = DOI_column[id_x]
        DOI_pot_cor = re.findall(DOI_pattern_2, DOI_to_cor)
        if len(DOI_pot_cor)>0:
            if is_valid_doi(DOI_pot_cor[0]):
                DOI_new[id_x] = DOI_pot_cor[0]
            else:
                yet_invalid_DOI[id_x]=DOI_to_cor
        else:
            yet_invalid_DOI[id_x]=DOI_to_cor




0it [00:00, ?it/s][A[A[A


50it [00:00, 111.20it/s][A[A[A


72it [00:00, 76.63it/s] [A[A[A


80it [00:01, 48.40it/s][A[A[A


87it [00:01, 35.47it/s][A[A[A


98it [00:02, 31.24it/s][A[A[A


109it [00:02, 28.73it/s][A[A[A


115it [00:03, 23.80it/s][A[A[A


132it [00:03, 24.72it/s][A[A[A


137it [00:04, 20.99it/s][A[A[A


222it [00:04, 69.80it/s][A[A[A


232it [00:05, 55.48it/s][A[A[A


390it [00:05, 140.20it/s][A[A[A


417it [00:07, 69.74it/s] [A[A[A


486it [00:07, 86.57it/s][A[A[A


517it [00:08, 81.40it/s][A[A[A


633it [00:08, 121.95it/s][A[A[A


648it [00:09, 74.93it/s] [A[A[A


659it [00:10, 52.99it/s][A[A[A


722it [00:10, 69.96it/s][A[A[A


732it [00:11, 58.92it/s][A[A[A


748it [00:12, 48.06it/s][A[A[A


813it [00:12, 70.54it/s][A[A[A


851it [00:12, 73.71it/s][A[A[A


905it [00:13, 85.10it/s][A[A[A


915it [00:13, 68.21it/s][A[A[A


923it [00:14, 54.07it/s][A[A[A


971it [00:14, 68.08it/s][A[A

In [76]:
yet_invalid_DOI

{114: '10.1038/ncomms8',
 136: '10.1016/j.mee.2004.03.068Microelectronic Engineering',
 2057: '10.1016/S0026-2692(03)00137-XMicroelectronics Journal',
 2790: '10.1016/j.snb.2004.06.015',
 3095: '10.1016/S1350-4495(99)00047-XInfrared Physics & Technology',
 3178: '10.3389/fpls.2014.00',
 3555: '10.1002/jbio.201700',
 4420: '10.1016/S0925-4005(99)00427-XSensors and Actuators B: Chemical',
 4500: '10.1063/1.4765',
 4681: '10.1016/S0038-092X(00)00013-XSolar Energy'}

In [68]:
len(DOI_new)

5000

In [71]:
DOI_new.count('invalid')

173

In [74]:
DOI_new = DOI_new_backup.copy()

In [78]:
yet_invalid_DOI = {114: '10.1038/ncomms8',
 136: '10.1016/j.mee.2004.03.068Microelectronic Engineering',
 2057: '10.1016/S0026-2692(03)00137-XMicroelectronics Journal',
 2790: '10.1016/j.snb.2004.06.015',
 3095: '10.1016/S1350-4495(99)00047-XInfrared Physics & Technology',
 3178: '10.3389/fpls.2014.00',
 3555: '10.1002/jbio.201700',
 4420: '10.1016/S0925-4005(99)00427-XSensors and Actuators B: Chemical',
 4500: '10.1063/1.4765',
 4681: '10.1016/S0038-092X(00)00013-XSolar Energy'}

In [85]:
DOI_new[114] = 'NA'
DOI_new[136] = '10.1016/j.mee.2004.03.068'
DOI_new[2057] = '10.1016/S0026-2692(03)00137-X'
DOI_new[2790] = '10.1016/j.snb.2004.06.015'
DOI_new[3095] = '10.1016/S1350-4495(99)00047-X'
DOI_new[3178] = 'NA'
DOI_new[3555] = 'NA'
DOI_new[4420] = '10.1016/S0925-4005(99)00427-X'
DOI_new[4500] = 'NA'
DOI_new[4681] = '10.1016/S0038-092X(00)00013-X'

In [90]:
data['DOI'] = DOI_new

In [93]:
data.to_csv('data_2_DOI_new.tsv', sep='\t')

In [95]:
pip install crossref-commons

Collecting crossref-commons
  Downloading crossref_commons-0.0.7-py3-none-any.whl (14 kB)
Collecting ratelimit>=2.2.1
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: ratelimit
  Building wheel for ratelimit (setup.py): started
  Building wheel for ratelimit (setup.py): finished with status 'done'
  Created wheel for ratelimit: filename=ratelimit-2.2.1-py3-none-any.whl size=5944 sha256=efa5253c0898d32057e18d349b5489cc01cfee250cfee106fd3a93b43e5e88cb
  Stored in directory: c:\users\bocha\appdata\local\pip\cache\wheels\27\5f\ba\e972a56dcbf5de9f2b7d2b2a710113970bd173c4dcd3d2c902
Successfully built ratelimit
Installing collected packages: ratelimit, crossref-commons
Successfully installed crossref-commons-0.0.7 ratelimit-2.2.1
Note: you may need to restart the kernel to use updated packages.




In [96]:
import crossref_commons.retrieval

In [98]:
tmp = crossref_commons.retrieval.get_publication_as_json('10.1016/j.jallcom.2017.03.270')

In [99]:
for key,value in tmp.items():
    print(key)

indexed
reference-count
publisher
license
funder
content-domain
short-container-title
published-print
DOI
type
created
page
update-policy
source
is-referenced-by-count
title
prefix
volume
author
member
reference
container-title
original-title
language
link
deposited
score
resource
subtitle
short-title
issued
references-count
alternative-id
URL
relation
ISSN
issn-type
subject
published
assertion


In [100]:
tmp

{'indexed': {'date-parts': [[2022, 12, 31]],
  'date-time': '2022-12-31T11:11:56Z',
  'timestamp': 1672485116510},
 'reference-count': 22,
 'publisher': 'Elsevier BV',
 'license': [{'start': {'date-parts': [[2017, 7, 1]],
    'date-time': '2017-07-01T00:00:00Z',
    'timestamp': 1498867200000},
   'content-version': 'tdm',
   'delay-in-days': 0,
   'URL': 'https://www.elsevier.com/tdm/userlicense/1.0/'}],
 'funder': [{'DOI': '10.13039/501100004085',
   'name': 'Ministry of Education, Science and Technology',
   'doi-asserted-by': 'publisher',
   'award': ['NRF-2015R1A1A1A05027848']},
  {'DOI': '10.13039/501100003725',
   'name': 'National Research Foundation of Korea',
   'doi-asserted-by': 'publisher'}],
 'content-domain': {'domain': ['elsevier.com', 'sciencedirect.com'],
  'crossmark-restriction': True},
 'short-container-title': ['Journal of Alloys and Compounds'],
 'published-print': {'date-parts': [[2017, 7]]},
 'DOI': '10.1016/j.jallcom.2017.03.270',
 'type': 'journal-article',
 

In [103]:
tmp['indexed']['date-parts'][0]

[2022, 12, 31]

In [106]:
publish_date = f"{tmp['indexed']['date-parts'][0][1]}/{tmp['indexed']['date-parts'][0][2]}/{tmp['indexed']['date-parts'][0][0]}"

In [107]:
publish_date

'12/31/2022'

In [108]:
article_title = tmp['title'][0]

In [109]:
article_title

'Improving light extraction in light-emitting diodes using zinc-tin-oxide layers'

In [111]:
print(tmp['short-container-title'])
print(tmp['container-title'])
print(tmp['original-title'])

['Journal of Alloys and Compounds']
['Journal of Alloys and Compounds']
[]


In [112]:
journal_title = tmp['short-container-title'][0]

In [113]:
journal_title

'Journal of Alloys and Compounds'

In [None]:
#to do: сделать функцию
#проверка1: АПИ вернуло что-то (лен тмп больше 0)
#проверка2: indexed, title, short-container-title ЕСТЬ -> (try (выполнить) exept ('NA'))
#сделать словарь: ключ - DOI, значения - лист(indexed, title, short-container-title) -> pandas.df -> примёрджить к нашей data по DOI