In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Read the complete paper list from the CSV file located next to this notebook.
papers_csv_path = './papers.csv'
data_paper = pd.read_csv(papers_csv_path)

In [None]:
# Split the papers by title keyword: rows whose '文献标题' (paper title) contains
# any keyword below go to `data_paper_include_key`, all others go to
# `data_paper_dele_key`.
# Matching is a case-insensitive plain substring test, so e.g. 'view' also hits
# 'review' and 'model' also hits 'modeling'.
# dict.fromkeys drops entries that become duplicates once lower-cased
# (e.g. 'Simulation' / 'simulation') while keeping the original order.
keywords = list(dict.fromkeys(
    keyword.lower() for keyword in ['First-principles', 'first principle', 'DFT', 'Ab Initio', 'ab initio', 'initio', 'numerical',
                                    'Density functional theory', 'Simulation', 'model', 'review', 'perspective',
                                    'insight', 'challenge', 'Computational', 'calculations',
                                    'modeling', 'Recent progress', 'reactor', 'Development', 'simulation', 'prediction',
                                    'theoretical', 'view', 'Machine learning', 'storage system', 'Density-functional', 'electronic']
))


def _title_has_keyword(title, needles=tuple(keywords)):
    """Return True when `title` is a string containing any of `needles`.

    `needles` is bound at definition time so that later cells re-assigning
    `keywords` cannot change what this helper looks for.
    """
    if not isinstance(title, str):
        # A missing title is read from the CSV as NaN (a float), which has no
        # .lower(); treat it as "no keyword found" instead of raising.
        return False
    lowered = title.lower()  # lower-case once, not once per keyword
    return any(needle in lowered for needle in needles)


# astype(bool) keeps the mask a boolean indexer even when the frame is empty.
mask = data_paper['文献标题'].apply(_title_has_keyword).astype(bool)

# Papers whose title contains at least one keyword.
data_paper_include_key = data_paper[mask].reset_index(drop=True)

# Papers whose title contains none of the keywords (passed on to the next step).
data_paper_dele_key = data_paper[~mask].reset_index(drop=True)


In [None]:
# Keep only the papers whose title mentions one of these keywords
# (case-insensitive; rows with a missing title count as "no match").
keywords = ['hydrogen', 'hydrogenation', 'dehydrogenation']
title_regex = '|'.join(keywords)

titles = data_paper_dele_key['文献标题']
mask = titles.str.contains(title_regex, case=False, na=False)

# Matching papers go to df_include, the rest to df_exclude; both are re-indexed from 0.
df_include = data_paper_dele_key[mask].reset_index(drop=True)
df_exclude = data_paper_dele_key[~mask].reset_index(drop=True)

In [None]:
# Keep the papers that mention one of the elements listed below.
# Maps element symbol -> English element name.

metal_elements_full = {
    'Li': 'Lithium', 'Be': 'Beryllium', 'Na': 'Sodium', 'Mg': 'Magnesium', 'Al': 'Aluminum', 'K': 'Potassium', 
    'Ca': 'Calcium', 'Sc': 'Scandium', 'Ti': 'Titanium', 'V': 'Vanadium', 'Cr': 'Chromium', 'Mn': 'Manganese', 
    'Fe': 'Iron', 'Co': 'Cobalt', 'Ni': 'Nickel', 'Cu': 'Copper', 'Zn': 'Zinc', 'Ga': 'Gallium', 'Rb': 'Rubidium', 
    'Sr': 'Strontium', 'Y': 'Yttrium', 'Zr': 'Zirconium', 'Nb': 'Niobium', 'Mo': 'Molybdenum', 'Tc': 'Technetium', 
    'Ru': 'Ruthenium', 'Rh': 'Rhodium', 'Pd': 'Palladium', 'Ag': 'Silver', 'Cd': 'Cadmium', 'In': 'Indium', 
    'Sn': 'Tin', 'Sb': 'Antimony', 'Cs': 'Cesium', 'Ba': 'Barium', 'La': 'Lanthanum', 'Ce': 'Cerium', 
    'Pr': 'Praseodymium', 'Nd': 'Neodymium', 'Pm': 'Promethium', 'Sm': 'Samarium', 'Eu': 'Europium', 
    'Gd': 'Gadolinium', 'Tb': 'Terbium', 'Dy': 'Dysprosium', 'Ho': 'Holmium', 'Er': 'Erbium', 'Tm': 'Thulium', 
    'Yb': 'Ytterbium', 'Lu': 'Lutetium', 'Hf': 'Hafnium', 'Ta': 'Tantalum', 'W': 'Tungsten', 'Re': 'Rhenium', 
    'Os': 'Osmium', 'Ir': 'Iridium', 'Pt': 'Platinum', 'Au': 'Gold', 'Hg': 'Mercury', 'Tl': 'Thallium', 
    'Pb': 'Lead', 'Bi': 'Bismuth', 'Fr': 'Francium', 'Ra': 'Radium', 'Ac': 'Actinium', 'Th': 'Thorium', 
    'Pa': 'Protactinium', 'U': 'Uranium', 'Np': 'Neptunium', 'Pu': 'Plutonium', 'Am': 'Americium', 'Cm': 'Curium', 
    'Bk': 'Berkelium', 'Cf': 'Californium', 'Es': 'Einsteinium', 'Fm': 'Fermium', 'Md': 'Mendelevium', 
    'No': 'Nobelium', 'Lr': 'Lawrencium', 'Rf': 'Rutherfordium', 'Db': 'Dubnium', 'Sg': 'Seaborgium', 
    'Bh': 'Bohrium', 'Hs': 'Hassium', 'Mt': 'Meitnerium', 'Ds': 'Darmstadtium', 'Rg': 'Roentgenium', 
    'Cn': 'Copernicium', 'Nh': 'Nihonium', 'Fl': 'Flerovium', 'Mc': 'Moscovium', 'Lv': 'Livermorium', 
    'Ts': 'Tennessine', 'Og': 'Oganesson'
}

# Regex alternations for the three kinds of token we look for.
_multi_letter_symbols = '|'.join(re.escape(s) for s in metal_elements_full if len(s) > 1)
_single_letter_symbols = '|'.join(re.escape(s) for s in metal_elements_full if len(s) == 1)
_element_names = '|'.join(re.escape(name) for name in metal_elements_full.values())

# A plain r'\b(Mg|...)\b' cannot see a symbol inside a chemical formula: in
# 'MgH2' the character after 'Mg' is a word character, so there is no boundary.
# Instead:
#   * a symbol must not be preceded by a letter (it starts a word or formula);
#   * a multi-letter symbol (case-sensitive) must not be followed by a lowercase
#     letter, so 'MgH2', 'NaAlH4' and 'TiFe' match but 'Covalent' does not;
#   * a single-letter symbol (K, V, Y, W, U) must not be followed by any letter,
#     so 'V2O5' matches but acronyms such as 'UV' do not;
#   * full element names are matched as whole words, case-insensitively, so
#     'magnesium hydride' is found as well as 'Magnesium hydride'.
# Everything the old \b-based pattern matched is still matched.
# NOTE(review): ordinary words that coincide with a symbol or name ('In', 'No',
# 'Re', 'lead', 'tin', ...) still count as element mentions.
element_pattern = re.compile(
    r'(?<![A-Za-z])(?:'
    + r'(?:' + _multi_letter_symbols + r')(?![a-z])'
    + r'|(?:' + _single_letter_symbols + r')(?![A-Za-z])'
    + r')'
    + r'|\b(?i:' + _element_names + r')\b'
)


def _title_mentions_element(title):
    """Return True when `element_pattern` finds a match anywhere in `title`."""
    return element_pattern.search(title) is not None


mask = df_include['文献标题'].apply(_title_mentions_element)

# Papers whose title matches the element pattern, and the remainder;
# both frames are re-indexed from 0.
data_paper_with_elements = df_include[mask].reset_index(drop=True)
data_paper_without_elements = df_include[~mask].reset_index(drop=True)


In [None]:
# Specifically keep the papers that investigate MgH2 or MH4.
# The keywords are lower-cased because they are compared against the lower-cased
# title; the mixed-case originals ('MgH2', 'H4') could never be found in it, so
# the mask used to be False for every row.
keywords = [keyword.lower() for keyword in ['MgH2', 'H4']]


mask = data_paper_with_elements['文献标题'].apply(
    # Non-string titles (e.g. NaN) count as "no match" instead of raising.
    lambda title: isinstance(title, str) and any(keyword in title.lower() for keyword in keywords)
).astype(bool)  # keeps the mask a boolean indexer even when the frame is empty


# Split on the mask and re-index both halves from 0.
data_paper_waiting_download_exclude_H4 = data_paper_with_elements[~mask].reset_index(drop=True)
data_paper_waiting_download_include_H4 = data_paper_with_elements[mask].reset_index(drop=True)

# The original cell assigned the excluded papers to `..._exclude_MgH2_H4` but then
# read the undefined `..._exclude_H4` (NameError). Both names are kept and refer
# to the same frame.
data_paper_waiting_download_exclude_MgH2_H4 = data_paper_waiting_download_exclude_H4

# Output:
# data_paper_include_key: papers whose title contains one of the keywords
# data_paper_dele_key: papers whose title contains none of the keywords

