In [37]:
# Shared result lists: the check/find functions defined below append to these.
unavailable_papers = []
duplicates = []
# Directory that is searched for the PDF files named in the CSV
papers_directory = "papers/Robotic planning - Evolutionary robotics"
# Initial CSV file listing the papers
initial_csv_file = "papers/Robotic planning - Evolutionary robotics.csv"
# Intermediate CSV file, written after removing unavailable papers
csv_file_after_removed_unavailable_papers = "temp.csv"
# Final CSV file, written after removing duplicate records
final_output_csv = "Robotic planning - Evolutionary robotics_final.csv"

In [38]:
import os
import csv

def check_unavailable_papers(initial_csv_file, papers_directory):
    """Report which papers listed in a CSV have no matching PDF on disk.

    The fourth column of each row is taken as the paper's file name. ".pdf" is
    appended when it is missing and every ":" in the name is replaced by "_"
    before the file is looked up inside ``papers_directory``. Names that are
    not found are stored in the module-level ``unavailable_papers`` list and
    printed.

    Args:
        initial_csv_file: Path of the CSV file listing the papers.
        papers_directory: Directory expected to contain the PDF files.

    Returns:
        None. The result is left in the module-level ``unavailable_papers``
        list, which is emptied at the start of every call.
    """
    # Reset in place so that re-running the check reflects only this call
    # instead of accumulating entries from earlier runs.
    unavailable_papers.clear()

    # Check if the papers directory exists
    if not os.path.exists(papers_directory):
        print(f"The directory '{papers_directory}' does not exist.")
        return

    # Read the CSV file with the appropriate encoding
    with open(initial_csv_file, 'r', encoding='utf-8-sig', errors='replace') as file:
        # No header row is skipped: a header, if present, is checked like any other row.
        for row in csv.reader(file):
            if len(row) < 4:
                # Blank or truncated line: there is no file name to check.
                continue
            file_name = row[3]  # Assuming the fourth column contains file names
            disk_name = file_name if file_name.endswith(".pdf") else file_name + ".pdf"
            # Sanitise only the file name. A colon in the directory part
            # (for example a Windows drive letter) must stay untouched.
            disk_name = disk_name.replace(":", "_")
            if not os.path.exists(os.path.join(papers_directory, disk_name)):
                unavailable_papers.append(file_name)

    if unavailable_papers:
        print("Unavailable papers:")
        for paper in unavailable_papers:
            print(paper)
    else:
        print("All papers are available.")



In [39]:

def remove_unavailable_papers(initial_csv_file, csv_file_after_removed_unavailable_papers):
    """Write a copy of the CSV without the rows of unavailable papers.

    A row is dropped when its fourth column is one of the names in the
    module-level ``unavailable_papers`` list. Rows with fewer than four
    columns (for example blank lines) carry no file name and are copied
    unchanged. When ``unavailable_papers`` is empty nothing is written.

    Args:
        initial_csv_file: Path of the CSV file to read.
        csv_file_after_removed_unavailable_papers: Path of the CSV file to write.

    Returns:
        None.
    """
    if not unavailable_papers:
        print("No unavailable papers to remove.")
        return

    # Set membership keeps the per-row lookup constant-time.
    names_to_drop = set(unavailable_papers)

    # Read the original CSV file and write a new one excluding unavailable papers
    with open(initial_csv_file, 'r', encoding='utf-8-sig', errors='replace') as infile, \
            open(csv_file_after_removed_unavailable_papers, 'w', newline='',
                 encoding='utf-8-sig', errors='replace') as outfile:
        writer = csv.writer(outfile)

        for row in csv.reader(infile):
            # Fourth column is assumed to hold the file name; shorter rows
            # cannot match an unavailable paper, so they are kept.
            if len(row) >= 4 and row[3] in names_to_drop:
                continue
            writer.writerow(row)

    print(f"Unavailable papers removed. Updated CSV saved as '{csv_file_after_removed_unavailable_papers}'.")




In [40]:
from collections import defaultdict


def find_duplicate_records(csv_file):
    """Find rows that occur more than once in a CSV file.

    Rows are compared as whole tuples of their fields. Each repeated row is
    recorded once, at its second occurrence, in the module-level
    ``duplicates`` list, which is emptied at the start of every call.

    Args:
        csv_file: Path of the CSV file to scan.

    Returns:
        The module-level ``duplicates`` list, holding one tuple per repeated row.
    """
    # Reset in place so repeated calls do not report the same rows again.
    duplicates.clear()
    records = defaultdict(int)

    # Read the CSV file with the appropriate encoding
    with open(csv_file, 'r', encoding='utf-8-sig', errors='replace') as file:
        # No header row is skipped: every line takes part in the comparison.
        for row in csv.reader(file):
            record = tuple(row)  # Convert the row to a tuple to make it hashable
            records[record] += 1
            if records[record] == 2:  # Report a repeated row only once
                duplicates.append(record)

    if duplicates:
        print("Duplicate records found:")
        for duplicate in duplicates:
            print(duplicate)
    else:
        print("No duplicate records found.")
    return duplicates


In [41]:
def remove_duplicate_records(csv_file_after_removed_duplicates, final_output_csv):
    """Copy a CSV file, keeping only the first occurrence of each row.

    Rows are compared as whole tuples of their fields. The number of dropped
    rows is printed together with the output path.

    Args:
        csv_file_after_removed_duplicates: Path of the CSV file to read.
        final_output_csv: Path of the de-duplicated CSV file to write.

    Returns:
        None.
    """
    already_written = set()
    duplicates_removed = 0

    # Stream rows from the source to the target, skipping repeats.
    with open(csv_file_after_removed_duplicates, 'r', encoding='utf-8-sig', errors='replace') as source:
        with open(final_output_csv, 'w', newline='', encoding='utf-8-sig', errors='replace') as target:
            emit_row = csv.writer(target).writerow
            for fields in csv.reader(source):
                key = tuple(fields)  # lists are unhashable, tuples are not
                if key in already_written:
                    duplicates_removed += 1
                    continue
                emit_row(fields)
                already_written.add(key)

    print(f"Duplicate records removed: {duplicates_removed}. Updated CSV saved as '{final_output_csv}'.")


In [42]:
# Check for unavailable papers (the result is stored in `unavailable_papers`)
check_unavailable_papers(initial_csv_file, papers_directory)

All papers are available.


In [43]:

# Write the CSV without the unavailable papers (nothing is written when none were found)
remove_unavailable_papers(initial_csv_file, csv_file_after_removed_unavailable_papers)

No unavailable papers to remove.


In [44]:
# Find duplicate records
# Use the filtered CSV only when this run actually found unavailable papers.
# remove_unavailable_papers writes nothing otherwise, so a file that merely
# exists may be a leftover from an earlier run.
if unavailable_papers and os.path.exists(csv_file_after_removed_unavailable_papers):
    find_duplicate_records(csv_file_after_removed_unavailable_papers)
else:
    find_duplicate_records(initial_csv_file)

No duplicate records found.


In [45]:
# Remove duplicate records
# Use the filtered CSV only when this run actually found unavailable papers.
# remove_unavailable_papers writes nothing otherwise, so a file that merely
# exists may be a leftover from an earlier run.
if unavailable_papers and os.path.exists(csv_file_after_removed_unavailable_papers):
    remove_duplicate_records(csv_file_after_removed_unavailable_papers, final_output_csv)
else:
    remove_duplicate_records(initial_csv_file, final_output_csv)


Duplicate records removed: 0. Updated CSV saved as 'Robotic planning - Evolutionary robotics_final.csv'.
