#### Literature helper
*Created on January 27, 2023*
___

This notebook transforms the semi-processed query output data and merges it with the externally obtained study citations.

The output is a new Excel file in which each study is paired with its citation.

In [1]:
import re

import pandas as pd
import numpy as np

In [2]:
# Static configuration
# NOTE: this is an absolute, machine-specific Windows path to the Scholar-bot data folder.
# Adjust it to the local location of that folder before running the notebook.
FOLDER_PATH = r'C:\Users\hso20\OneDrive\Plocha\IES\Diploma-Thesis\Tools and meta-analysis theory\Scholar-bot\data'


In [3]:
# Build the workbook path once. The raw f-string keeps the Windows backslash literal:
# in a plain f-string '\Q' is an invalid escape sequence (a warning on current Pythons,
# planned to become an error), while the resulting path text is exactly the same.
query_data_path = rf'{FOLDER_PATH}\Query_data_clean.xlsx'

# Open the workbook a single time and parse both sheets from the same handle
with pd.ExcelFile(query_data_path) as workbook:
    main_data = pd.read_excel(workbook, sheet_name='main')  # main query output
    citation_data = pd.read_excel(workbook, sheet_name='raw_citations')  # externally obtained citations


In [130]:
# Useful functions

def quoteAllCitations(df):
    '''Placeholder for quoting the study titles in the citation data frame.

    Intended contract: take the citation data frame (citations and study titles)
    and return a new data frame in which every study title is quoted, unless it
    is quoted already. Meant to be run right after reading the source citation
    data set.

    Not implemented yet - the argument is currently ignored and None is returned.
    '''
    return None

def extractStudyTitle(citation):
    '''Extract the study title from a citation string.

    The title is taken to be the text enclosed in double quotes. The match is
    greedy, so if the citation contains several quoted segments, everything
    between the first and the last double quote is returned.

    Args:
        citation: The citation text. Non-string values (e.g. None, or the NaN
            that pandas uses for empty cells) are treated as "no citation".

    Returns:
        The text between the double quotes, or None if the input is not a
        string or contains no quoted segment.
    '''
    # Empty spreadsheet cells arrive as NaN/None; re.search would raise TypeError on them
    if not isinstance(citation, str):
        return None
    rgx = r'"(.*)"' # Extract between double quotes # Alternative - r'"([^"]*)"'
    match = re.search(rgx, citation)
    if match is None:
        return None
    return match.group(1) # Title of searched study


def handleMissingTitles(df):
    '''Fill missing study titles with titles extracted from the citations.

    Rows that already have a study title keep it. For rows with a missing
    title, the title is extracted from the row's citation; if the citation is
    missing as well, or no title can be extracted from it, None is left in place.

    Args:
        df: Citation data frame with a 'study_title' column and, whenever any
            title is missing, a 'citation' column.

    Returns:
        A copy of ``df`` with the 'study_title' column filled in where
        possible. The input data frame is left unmodified.

    Raises:
        ValueError: If ``df`` has no 'study_title' column.
    '''
    if 'study_title' not in df.columns:
        raise ValueError('Incorrect data set.')

    titles = df['study_title'].tolist()
    title_missing = df['study_title'].isna().tolist()

    # Only touch the citations when there is actually something to fill in
    if any(title_missing):
        citations = df['citation'].tolist()
        titles = [
            # An empty citation cell is NaN/None, not text - there is nothing to extract from
            (extractStudyTitle(citation) if isinstance(citation, str) else None) if missing else title
            for title, missing, citation in zip(titles, title_missing, citations)
        ]

    # Assign on a copy so the caller's frame (often a column subset of a larger
    # frame) is never mutated and pandas has no reason to warn about a slice
    result = df.copy()
    result['study_title'] = titles
    return result

In [135]:
# Subset the citation data; copy so later column assignments never act on (or warn about)
# a slice of the frame that was read from Excel
citation_data = citation_data[['study_title', 'citation']].copy()

# Replace all missing titles with the title from their citation, if the citation is available
citation_data = handleMissingTitles(citation_data)

# Drop all rows without a study title - unidentifiable
missing_rows = int(citation_data['study_title'].isna().sum())
if missing_rows > 0:
    print(f'Dropping {missing_rows} rows from the dataset...')
    citation_data = citation_data.dropna(subset=['study_title'])

# Keep only the first citation for each study title (a no-op when titles are already unique)
citation_data = citation_data.drop_duplicates(subset=['study_title'], keep='first')

# Merge the data frames together; every row of the main data is kept, and the validation
# guarantees that each study is matched with at most one citation
merged_data = pd.merge(
    main_data,
    citation_data,
    how='left',
    on='study_title',
    validate='many_to_one',
)

# Output to excel
# NOTE(review): this path is relative to the current working directory, whereas the inputs
# are read from FOLDER_PATH - confirm that the notebook is run from the project folder.
merged_data.to_excel("data/merged_data.xlsx", index=False, sheet_name='merged')