Thesis meta-database cleaning



In [9]:
import pandas as pd
import glob
import os

# Path to the directory containing your CSV files
csv_directory = "Exported CSVs/Zotero exports"

# Use glob to get all CSV files in the directory
csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))

# Initialize an empty list to store DataFrames
dfs = []

# Loop through each CSV file and append to the list
for csv_file in csv_files:
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Optional: Add a column indicating the source file
    file_name = os.path.basename(csv_file)
    df['source_file'] = file_name

    # Append to the list
    dfs.append(df)

    # Print status (optional)
    print(f"Loaded {file_name} with {len(df)} rows")

# Concatenate all DataFrames in the list
combined_df = pd.concat(dfs, ignore_index=True)

# Show the dimensions of the combined DataFrame
print(f"\nCombined DataFrame has {combined_df.shape[0]} rows and {combined_df.shape[1]} columns")

# Save the combined DataFrame to a new CSV file
combined_df.to_csv("combined_dataset.csv", index=False)
print("Combined data saved to combined_dataset.csv")

Loaded ACL.csv with 110 rows
Loaded NeurIPS.csv with 459 rows
Loaded CVPR.csv with 108 rows

Combined DataFrame has 677 rows and 88 columns
Combined data saved to combined_dataset.csv


In [10]:
combined_df.head()

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body,source_file
0,ZLSVM7UD,conferencePaper,2024.0,"Li, Haoran; Guo, Dadi; Li, Donghao; Fan, Wei; ...",PrivLM-Bench: A Multi-level Privacy Evaluation...,Proceedings of the 62nd Annual Meeting of the ...,,,10.18653/v1/2024.acl-long.4,https://aclanthology.org/2024.acl-long.4,...,,,,,,,,,,ACL.csv
1,WBN4DR9E,preprint,2024.0,"Hu, Yong; Meng, Fandong; Zhou, Jie",CSCD-NS: a Chinese Spelling Check Dataset for ...,,,,10.48550/arXiv.2211.08788,http://arxiv.org/abs/2211.08788,...,,,,,,,,,,ACL.csv
2,9XF59PKK,conferencePaper,2024.0,"Poesina, Eduard; Caragea, Cornelia; Ionescu, Radu",A Novel Cartography-Based Curriculum Learning ...,Proceedings of the 62nd Annual Meeting of the ...,,,10.18653/v1/2024.acl-long.15,https://aclanthology.org/2024.acl-long.15,...,,,,,,,,,,ACL.csv
3,IVP948X3,conferencePaper,2024.0,"Wang, Xiyao; Zhou, Yuhang; Liu, Xiaoyu; Lu, Ho...",Mementos: A Comprehensive Benchmark for Multim...,Proceedings of the 62nd Annual Meeting of the ...,,,10.18653/v1/2024.acl-long.25,https://aclanthology.org/2024.acl-long.25,...,,,,,,,,,,ACL.csv
4,9FJT7KT3,conferencePaper,2024.0,"Xia, Congying; Xing, Chen; Du, Jiangshu; Yang,...",FOFO: A Benchmark to Evaluate LLMs’ Format-Fol...,Proceedings of the 62nd Annual Meeting of the ...,,,10.18653/v1/2024.acl-long.40,https://aclanthology.org/2024.acl-long.40,...,,,,,,,,,,ACL.csv


In [11]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677 entries, 0 to 676
Data columns (total 88 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Key                   677 non-null    object 
 1   Item Type             677 non-null    object 
 2   Publication Year      220 non-null    float64
 3   Author                676 non-null    object 
 4   Title                 677 non-null    object 
 5   Publication Title     176 non-null    object 
 6   ISBN                  72 non-null     float64
 7   ISSN                  2 non-null      object 
 8   DOI                   221 non-null    object 
 9   Url                   221 non-null    object 
 10  Abstract Note         569 non-null    object 
 11  Date                  220 non-null    object 
 12  Date Added            677 non-null    object 
 13  Date Modified         677 non-null    object 
 14  Access Date           221 non-null    object 
 15  Pages                 1

In [12]:
combined_df.to_csv("combined_dataset_full_column.csv", index=False)

In [13]:
df_select_columns = combined_df[['Key', 'Item Type', 'Publication Year', 'Author', 'Title', 'Publication Title', 'DOI', 'Url', 'Abstract Note', 'Date', 'Conference Name', 'source_file']].copy()
df_select_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677 entries, 0 to 676
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Key                677 non-null    object 
 1   Item Type          677 non-null    object 
 2   Publication Year   220 non-null    float64
 3   Author             676 non-null    object 
 4   Title              677 non-null    object 
 5   Publication Title  176 non-null    object 
 6   DOI                221 non-null    object 
 7   Url                221 non-null    object 
 8   Abstract Note      569 non-null    object 
 9   Date               220 non-null    object 
 10  Conference Name    173 non-null    object 
 11  source_file        677 non-null    object 
dtypes: float64(1), object(11)
memory usage: 63.6+ KB


In [14]:
import numpy as np
df_select_columns['Field'] = np.nan
df_select_columns['Novel Database'] = np.nan
df_select_columns['Representativity Mentions'] = np.nan
df_select_columns['Similarity Mentions'] = np.nan
df_select_columns['Diversity Mentions'] = np.nan
df_select_columns['Notes'] = np.nan

df_select_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677 entries, 0 to 676
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Key                        677 non-null    object 
 1   Item Type                  677 non-null    object 
 2   Publication Year           220 non-null    float64
 3   Author                     676 non-null    object 
 4   Title                      677 non-null    object 
 5   Publication Title          176 non-null    object 
 6   DOI                        221 non-null    object 
 7   Url                        221 non-null    object 
 8   Abstract Note              569 non-null    object 
 9   Date                       220 non-null    object 
 10  Conference Name            173 non-null    object 
 11  source_file                677 non-null    object 
 12  Field                      0 non-null      float64
 13  Novel Database             0 non-null      float64

In [15]:
df_select_columns.rename(columns={'source_file': 'Source File',}, inplace=True)
df_select_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 677 entries, 0 to 676
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Key                        677 non-null    object 
 1   Item Type                  677 non-null    object 
 2   Publication Year           220 non-null    float64
 3   Author                     676 non-null    object 
 4   Title                      677 non-null    object 
 5   Publication Title          176 non-null    object 
 6   DOI                        221 non-null    object 
 7   Url                        221 non-null    object 
 8   Abstract Note              569 non-null    object 
 9   Date                       220 non-null    object 
 10  Conference Name            173 non-null    object 
 11  Source File                677 non-null    object 
 12  Field                      0 non-null      float64
 13  Novel Database             0 non-null      float64

In [16]:
df_select_columns.to_csv("cleaned_dataset_v1.csv", index=False)

In [18]:
neurips_df = df_select_columns[df_select_columns['Source File'] == 'NeurIPS.csv']
print(f"Total NeurIPS articles: {len(neurips_df)}")
neurips_df.head()

Total NeurIPS articles: 459


Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,DOI,Url,Abstract Note,Date,Conference Name,Source File,Field,Novel Database,Representativity Mentions,Similarity Mentions,Diversity Mentions,Notes
110,IHBLLIBV,dataset,2025.0,"Hauser, Jakob Elias",HiST-LLM,,10.5281/ZENODO.14671247,https://zenodo.org/doi/10.5281/zenodo.14671247,Large Language Models (LLMs) have the potentia...,2025-01-16,,NeurIPS.csv,,,,,,
111,58JV2V3K,journalArticle,,"Dong, Linfeng; Wang, Wei; Qiao, Yu; Sun, Xiao",LucidAction: A Hierarchical and Multi-model Da...,,,,,,,NeurIPS.csv,,,,,,
112,VRL2G4I9,journalArticle,,"Zhang, Hang; Sun, Jiawei; Chen, Renqi; Liu, We...",Empowering and Assessing the Utility of Large ...,,,,Large language models (LLMs) have demonstrated...,,,NeurIPS.csv,,,,,,
113,IZFMDATR,journalArticle,,"Luo, Jialin; Wang, Yuanzhi; Gu, Ziqi; Qiu, Yid...","MMM-RS: A Multi-modal, Multi-GSD, Multi-scene ...",,,,"Recently, the diffusion-based generative parad...",,,NeurIPS.csv,,,,,,
114,I249SJTS,journalArticle,,"Chen, Cheng; Zhu, Junchen; Luo, Xu; Shen, Heng...",CoIN: A Benchmark of Continual Instruction Tun...,,,,Instruction tuning demonstrates impressive per...,,,NeurIPS.csv,,,,,,


In [19]:
if len(neurips_df) >= 50:
    random_neurips = neurips_df.sample(n=50, random_state=42)  # random_state for reproducibility
else:
    print(f"Warning: Only {len(neurips_df)} NeurIPS articles available. Selecting all of them.")
    random_neurips = neurips_df

random_neurips.to_csv("random_neurips_selection.csv", index=False)
