In [23]:
import os
import pandas as pd

In [24]:
def process_arxiv_files(
    directory_path="./",
    starts_with="0_arxiv_",
    ends_with=".csv",
    output_file="0_arxiv.csv",
):
    """Merge matching CSV files from a directory into one de-duplicated CSV.

    Every entry of ``directory_path`` whose name starts with ``starts_with``
    and ends with ``ends_with`` is read with its first column as the index.
    The frames are concatenated (the per-file index is discarded), exact
    duplicate rows are removed, and the result is written to ``output_file``
    without an index column.

    Args:
        directory_path: Directory that is scanned for input files.
        starts_with: Required file-name prefix.
        ends_with: Required file-name suffix.
        output_file: Path of the merged CSV. It is used exactly as given,
            i.e. it is NOT joined with ``directory_path``.

    Returns:
        ``(initial_row_count, final_row_count, output_file)`` on success, or
        the string ``"No files found matching the pattern."`` when nothing in
        ``directory_path`` matches.
    """
    # os.listdir() yields names in arbitrary order; sorting makes the row
    # order of the merged file reproducible from run to run.
    matching_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.startswith(starts_with) and name.endswith(ends_with)
    )

    # Nothing to merge: report it instead of failing inside pd.concat
    if not matching_files:
        return "No files found matching the pattern."

    # The first column of each file is consumed as the index and is thrown
    # away by ignore_index=True below, so it never reaches the output.
    frames = [
        pd.read_csv(os.path.join(directory_path, name), index_col=0)
        for name in matching_files
    ]
    combined_df = pd.concat(frames, ignore_index=True)

    initial_row_count = len(combined_df)
    print(f"Initial number of rows: {initial_row_count}")

    # Only exact duplicate rows are removed; rows containing NaN are kept.
    combined_df = combined_df.drop_duplicates()

    final_row_count = len(combined_df)
    print(f"Final number of rows after cleaning: {final_row_count}")

    # Save the final DataFrame to a CSV file
    combined_df.to_csv(output_file, index=False)

    return initial_row_count, final_row_count, output_file


def process_all_files(directory_path="./data", file_list=None, output_file="0_all.csv"):
    """Merge an explicit list of CSV files into one de-duplicated CSV.

    Each name in ``file_list`` is read from ``directory_path``. A leading
    ``"Unnamed: 0"`` column (a previously saved index) is moved into the index
    so that it is discarded when the frames are concatenated. Exact duplicate
    rows are then removed and the result is written to
    ``os.path.join(directory_path, output_file)`` without an index column.

    Args:
        directory_path: Directory holding the input files; the output file is
            written here as well.
        file_list: Iterable of file names relative to ``directory_path``.
        output_file: File name of the merged CSV inside ``directory_path``.

    Returns:
        ``(initial_row_count, final_row_count, output_file)`` on success. Note
        that ``output_file`` is returned as passed in, not as the full path
        that was written. When ``file_list`` is ``None`` or empty, the string
        ``"No file list provided."`` is returned instead.
    """
    if file_list is None:
        return "No file list provided."

    # Materialise once so generators work too, and so an empty input is
    # reported here rather than crashing pd.concat with
    # "No objects to concatenate".
    file_list = list(file_list)
    if not file_list:
        return "No file list provided."

    frames = []
    for file in file_list:
        df = pd.read_csv(os.path.join(directory_path, file))

        # A leading "Unnamed: 0" column is a saved index; turning it into the
        # index lets ignore_index=True below drop it from the merged data.
        if df.columns[0] == "Unnamed: 0":
            df = df.set_index(df.columns[0])

        frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True)

    initial_row_count = len(combined_df)
    print(f"Initial number of rows: {initial_row_count}")

    # Only exact duplicate rows are removed; rows containing NaN are kept.
    combined_df = combined_df.drop_duplicates()

    final_row_count = len(combined_df)
    print(f"Final number of rows after cleaning: {final_row_count}")

    # Save the final DataFrame to a CSV file
    combined_df.to_csv(os.path.join(directory_path, output_file), index=False)

    return initial_row_count, final_row_count, output_file

Since arXiv limits each query to `1000` returned articles, you sometimes need to make the queries more specific (as implemented in `0_fetch_arxiv.py`). If not all of those queries were launched in a single run, the resulting files can be merged afterwards:

In [25]:
from utils_fetch import start_date, end_date, query_terms_list, year_list, DATA_DIR

In [26]:
# Display the data directory imported from utils_fetch
DATA_DIR

'data/'

In [27]:
# Merge every "0_arxiv_*.csv" file found in DATA_DIR into DATA_DIR/0_arxiv.csv
process_arxiv_files(
    directory_path=DATA_DIR,
    starts_with="0_arxiv_",
    ends_with=".csv",
    output_file=os.path.join(DATA_DIR, "0_arxiv.csv"),
)

Initial number of rows: 681453
Final number of rows after cleaning: 5972


(681453, 5972, 'data/0_arxiv.csv')

In [28]:
# Combine the listed per-source CSV files from DATA_DIR into one file
file_list = ["0_arxiv.csv", "0_bio_med.csv", "0_ncbi.csv"]
process_all_files(directory_path=DATA_DIR, file_list=file_list)

Initial number of rows: 57631
Final number of rows after cleaning: 57558


(57631, 57558, '0_all.csv')