##### Literature verification script
*Created on January 30, 2023*
___
- Read the files from the *Studies list* folder and check that each of them has a matching label in the literature file.
- Check that all files marked as downloaded in the literature file are actually present in the *Studies list* folder.

In [5]:
import os

import pandas as pd


In [2]:
# Root folder of the thesis project (absolute Windows path on the author's machine)
BASE_PATH = r'C:\Users\hso20\OneDrive\Plocha\IES\Diploma-Thesis'
# Folder that holds the downloaded studies
STUDIES_PATH = rf'{BASE_PATH}\Studies list'
# Literature workbook, and the name of its sheet with the query data
LIT_FILE_QUERY_SHEET_NAME = 'Query literature'
LIT_FILE_PATH = rf'{BASE_PATH}\Literature\Literature.xlsx'

In [56]:
def readLitFileIntoList(lit_file_path, get_downloaded_info_too = False):
    """Read the study labels (and optionally the download flags) from the literature workbook.

    Args:
        lit_file_path: Path to the Excel literature file to read.
        get_downloaded_info_too: When True, also return the 'Downloaded' column.

    Returns:
        The 'Label' column as a list, or a ``(labels, downloaded)`` tuple of two
        lists when ``get_downloaded_info_too`` is True.
    """
    # Read from the path supplied by the caller rather than the module-level
    # constant, so that the argument is actually honoured.
    lit_data = pd.read_excel(lit_file_path, sheet_name=LIT_FILE_QUERY_SHEET_NAME)
    labels = list(lit_data['Label'])
    if get_downloaded_info_too:
        return labels, list(lit_data['Downloaded'])
    return labels

def readFilesIntoList(folder_path):
    """Return the names of the regular files in *folder_path*, without a trailing '.pdf'.

    Sub-directories are skipped. Only a '.pdf' at the very end of a name is
    removed; names of files with another extension are returned unchanged.

    Args:
        folder_path: Directory whose direct entries are listed.

    Returns:
        A list of file names, in the order reported by ``os.listdir``.
    """
    suffix = '.pdf'
    files = []
    for entry in os.listdir(folder_path):
        if not os.path.isfile(os.path.join(folder_path, entry)):
            continue
        # Strip the extension only, so a '.pdf' occurring inside the name is kept.
        files.append(entry[:-len(suffix)] if entry.endswith(suffix) else entry)
    return files

In [57]:
# Gather both inputs: the file names found in the studies folder,
# and the labels from the literature file together with their download flags
files = readFilesIntoList(STUDIES_PATH)
labels, downloaded = readLitFileIntoList(
    LIT_FILE_PATH,
    get_downloaded_info_too=True,
)

In [62]:
# Report the files in the studies folder whose name has no matching label in the literature file.
# Every label is considered (not only the first 100), and a set keeps each lookup cheap.
known_labels = set(labels)
missing_in_lit = [name for name in files if name not in known_labels]
if missing_in_lit:
    print('X These files are missing from the literature file, but are downloaded:', missing_in_lit)
else:
    print('✓ There are no unlabeled files.')

# Report the labels marked 'YES' in the Downloaded column whose PDF is absent from the studies folder.
# os.path.join builds the path without relying on the invalid '\{' escape sequence.
not_downloaded = [
    label
    for label, d in zip(labels, downloaded)
    if d == 'YES' and not os.path.exists(os.path.join(STUDIES_PATH, f'{label}.pdf'))
]
if not_downloaded:
    print('X These files are marked as downloaded in the literature file, but are not downloaded:', not_downloaded)
else:
    print('✓ All files marked as downloaded are indeed that.')

✓ There are no unlabeled files.
✓ All files marked as downloaded are indeed that.
