In [1]:
import pandas as pd

In [15]:
def concatenate_author_info(start_year, stop_year, base_path='../database/'):
    """
    Load per-year author CSV files and combine them into a single DataFrame.

    For every year in the range, the file ``author_info_<year>.csv`` is read
    from ``base_path`` (its first column is used as the index), an
    'award_year' column holding that year is added, and the rows are appended
    to the result. The per-file indices are discarded in the combined frame.

    Parameters:
    - start_year: The starting year of the range (inclusive).
    - stop_year: The stopping year of the range (inclusive).
    - base_path: The directory where the CSV files are stored. A trailing
      path separator is optional.

    Returns:
    - A concatenated DataFrame containing all the author information across
      the specified years. If the range is empty (start_year > stop_year),
      an empty DataFrame is returned.

    Raises:
    - Any error raised by ``pandas.read_csv`` (e.g. a missing file for one of
      the years) is propagated unchanged.
    """
    import os

    yearly_frames = []

    for year in range(start_year, stop_year + 1):
        # os.path.join only inserts a separator when base_path lacks one, so
        # both '../database/' and '../database' resolve to the same file.
        file_path = os.path.join(base_path, f'author_info_{year}.csv')

        df = pd.read_csv(file_path, index_col=0)
        df['award_year'] = year
        yearly_frames.append(df)

    # pd.concat raises on an empty list, so keep the empty-range result explicit.
    if not yearly_frames:
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(yearly_frames, ignore_index=True)

In [60]:
def concatenate_publication_info(start_year, stop_year, base_path='../database/', window=3):
    """
    Load per-year publication CSV files and combine them into a single DataFrame.

    For every award year in the range, the file
    ``pub_info_<year - window>_<year + window>.csv`` is read from
    ``base_path`` (its first column is used as the index), an 'award_year'
    column holding that year is added, and the rows are appended to the
    result. The per-file indices are discarded in the combined frame.

    Parameters:
    - start_year: The starting year of the range (inclusive).
    - stop_year: The stopping year of the range (inclusive).
    - base_path: The directory where the CSV files are stored. A trailing
      path separator is optional.
    - window: Number of years before and after the award year that appear in
      the file name. Defaults to 3.

    Returns:
    - A concatenated DataFrame containing all the publication information
      across the specified years. If the range is empty
      (start_year > stop_year), an empty DataFrame is returned.

    Raises:
    - Any error raised by ``pandas.read_csv`` (e.g. a missing file for one of
      the years) is propagated unchanged.
    """
    import os

    yearly_frames = []

    for year in range(start_year, stop_year + 1):
        file_name = f'pub_info_{year - window}_{year + window}.csv'
        # os.path.join only inserts a separator when base_path lacks one.
        file_path = os.path.join(base_path, file_name)

        df = pd.read_csv(file_path, index_col=0)
        df['award_year'] = year
        yearly_frames.append(df)

    # pd.concat raises on an empty list, so keep the empty-range result explicit.
    if not yearly_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the accumulated rows
    # on every loop iteration.
    return pd.concat(yearly_frames, ignore_index=True)

In [13]:
def aggregate_info(base_path='../database/'):
    """
    Loads funding, author, and publication information from CSV files and merges them into a single DataFrame.

    The files ``funding_info.csv``, ``author_info.csv`` and
    ``publication_info.csv`` are read from ``base_path`` with their first
    column used as the index. Funding rows are inner-joined to author rows on
    name, email, institution and award year; the result is then inner-joined
    to publication rows on name, email and award year. Rows without a match
    in every table are dropped.

    Parameters:
    - base_path: The directory where the CSV files are stored. A trailing
      path separator is optional. Defaults to '../database/'.

    Returns:
    - A merged DataFrame containing all funding, author, and publication information.

    Raises:
    - Any error raised by ``pandas.read_csv`` (e.g. a missing file) or by
      ``pandas.merge`` (e.g. a missing key column) is propagated unchanged.
    """
    import os

    # Publications are matched without 'institution'; funding/author rows
    # additionally require the institution to agree.
    publication_keys = ['first_name', 'middle_name', 'last_name', 'email', 'award_year']
    author_keys = ['first_name', 'middle_name', 'last_name', 'email', 'institution', 'award_year']

    # Load csv files into DataFrames
    funding_info = pd.read_csv(os.path.join(base_path, 'funding_info.csv'), index_col=0)
    author_info = pd.read_csv(os.path.join(base_path, 'author_info.csv'), index_col=0)
    publication_info = pd.read_csv(os.path.join(base_path, 'publication_info.csv'), index_col=0)

    # Merge funding information with author information
    aggregated_info = pd.merge(funding_info, author_info, on=author_keys, how='inner')

    # Merge publication information with the aggregated information
    aggregated_info = pd.merge(aggregated_info, publication_info, on=publication_keys, how='inner')

    return aggregated_info

In [None]:
# Combine the per-year author files for award years 2011 through 2020
# and write the result to a single CSV.
author_info = concatenate_author_info(start_year=2011, stop_year=2020)
author_info.to_csv(path_or_buf='../database/author_info.csv')

In [None]:
# Combine the per-year publication files for award years 2011 through 2020
# and write the result to a single CSV.
publication_info = concatenate_publication_info(start_year=2011, stop_year=2020)
publication_info.to_csv(path_or_buf='../database/publication_info.csv')

In [None]:
# Merge the funding, author, and publication tables and write the merged
# table to CSV.
content_analysis = aggregate_info()
content_analysis.to_csv(path_or_buf='../database/content_analysis.csv')