In [None]:
import os
import pandas as pd

In [None]:
def process_arxiv_files(directory_path='./data', file_list=None, output_file='0_all.csv'):
    """Merge several CSV files into one de-duplicated CSV.

    Each name in ``file_list`` is read from ``directory_path``. A leading
    ``'Unnamed: 0'`` column (what pandas produces when a CSV was saved with
    its index) is discarded. The frames are stacked, exact duplicate rows are
    removed, and the result is written to ``directory_path/output_file``
    without an index column. Row counts before and after de-duplication are
    printed.

    Args:
        directory_path: Directory that holds the input files and receives
            the output file.
        file_list: Iterable of CSV file names relative to ``directory_path``.
        output_file: File name of the merged CSV, relative to
            ``directory_path``.

    Returns:
        ``(initial_row_count, final_row_count, output_file)`` on success,
        where ``output_file`` is the name as passed in (not joined with
        ``directory_path``). If ``file_list`` is ``None`` or yields no files,
        the string ``"No file list provided."`` is returned and nothing is
        written.

    Raises:
        Whatever ``pandas.read_csv`` / ``DataFrame.to_csv`` raise for missing
        or unreadable files.
    """
    if file_list is None:
        return "No file list provided."

    frames = []
    for file in file_list:
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path)

        # A CSV saved with its index comes back with an 'Unnamed: 0' first
        # column; it carries no data of its own, so drop it before merging.
        if df.columns[0] == 'Unnamed: 0':
            df = df.drop(columns=df.columns[0])

        frames.append(df)

    # pd.concat raises ValueError on an empty sequence, so treat an empty
    # file list the same way as a missing one.
    if not frames:
        return "No file list provided."

    combined_df = pd.concat(frames, ignore_index=True)

    initial_row_count = len(combined_df)
    print(f"Initial number of rows: {initial_row_count}")

    # Only exact duplicate rows are removed; rows containing NaN are kept.
    combined_df = combined_df.drop_duplicates()

    final_row_count = len(combined_df)
    print(f"Final number of rows after cleaning: {final_row_count}")

    combined_df.to_csv(os.path.join(directory_path, output_file), index=False)

    return initial_row_count, final_row_count, output_file

In [None]:
# Source CSVs to merge, given as file names inside ./data
file_list = ['0_arxiv_unique.csv', '0_bio_med.csv', '0_ncbi.csv']

# output_file is left at its default, so the merged CSV is written to ./data/0_all.csv
process_arxiv_files(file_list=file_list, directory_path='./data')