In [3]:
import csv

# Lookup tables keyed by the 'Page URL' column of the corpus:
#   document_claims    -> text of the 'Document claims' column
#   document_alignment -> value of the 'Page ID' column
# If the same URL appears in several rows, the last row wins.
document_claims, document_alignment = {}, {}

# newline='' hands line-ending handling to the csv module, which is required
# for quoted fields that contain embedded newlines to be parsed correctly.
with open('wikipedia_forgeries_corpus.csv', 'r', newline='', encoding='utf-8') as file:
    # DictReader uses the first row of the file as the column names
    reader = csv.DictReader(file)

    for row in reader:
        page_url = row['Page URL']
        document_claims[page_url] = row['Document claims']
        document_alignment[page_url] = row['Page ID']

In [30]:
import requests
from bs4 import BeautifulSoup

def get_references(url, timeout=30):
    """Fetch a page and extract the numbered entries of its first reference list.

    The page is downloaded, parsed with BeautifulSoup, and the first
    ``<div class="reflist">`` is scanned for ``<span class="reference-text">``
    elements. Each span is numbered in document order, starting at ``[1]``.

    For every span, each ``<a>`` inside it is checked: if its ``href`` (with
    ``#`` removed) matches the ``id`` of a ``<cite>`` element anywhere on the
    page, the text of that ``<cite>`` is used as the reference text. When
    several links match, the last match wins. When none match, the span's own
    text is used. Anchors without an ``href`` attribute are skipped.

    Args:
        url: Address of the page to fetch; surrounding whitespace is stripped.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        dict mapping ``"[n]"`` to reference text on success (possibly empty),
        the string ``"References section not found."`` when the page has no
        ``div.reflist``, or the string ``"Failed to fetch Wikipedia page."``
        when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url.strip(), timeout=timeout)
    except requests.RequestException:
        # Network errors and timeouts are reported the same way as a bad
        # status code, so one unreachable page does not abort the caller.
        print("Failed to fetch Wikipedia page.")
        return "Failed to fetch Wikipedia page."

    if response.status_code != 200:
        print("Failed to fetch Wikipedia page.")
        return "Failed to fetch Wikipedia page."

    soup = BeautifulSoup(response.text, 'html.parser')

    # find_all returns an empty list when nothing matches; check before
    # indexing so a page without a reference list is reported, not a crash.
    reflists = soup.find_all('div', {'class': 'reflist'})
    if not reflists:
        print("References section not found at", url)
        return "References section not found."

    references_section = reflists[0]
    references = {}

    ref_tags = references_section.find_all('span', class_='reference-text')
    for index, tag in enumerate(ref_tags, start=1):
        ref_number = f"[{index}]"

        # Default to the span's own text; replaced below if a link inside it
        # points at a <cite> element on the same page.
        resolved_text = tag.get_text(separator=' ', strip=True)

        for link in tag.find_all('a'):
            href = link.get('href')
            if not href:
                # Anchor without a target cannot point at a <cite>.
                continue

            bib = soup.find('cite', {'id': href.replace('#', '')})
            if bib is not None:
                resolved_text = bib.get_text(separator=' ', strip=True)

        references[ref_number] = resolved_text

    return references

In [33]:
import csv
import re

# Matches bracketed numeric citation markers such as "[12]".
pattern = r'\[\d+\]'

# One output row per distinct reference text cited by a document; citation
# numbers that resolve to the same text are joined with commas.
with open('references.tsv', 'w', newline='', encoding='utf-8') as tsvfile:

    # Tab-delimited writer with a header row
    writer = csv.DictWriter(tsvfile, delimiter='\t', fieldnames=['Page ID', 'Document URL', 'Reference number', 'Reference'])
    writer.writeheader()

    for url, text in document_claims.items():

        # Distinct citation markers that appear in this document's claims;
        # a set gives constant-time membership checks below.
        citations = set(re.findall(pattern, text))

        # Reference entries scraped from the page, keyed by "[n]"
        references = get_references(url)

        # Reference text -> list of citation markers that resolve to it
        ref_texts = {}

        # get_references returns a message string instead of a dict when the
        # page could not be fetched or has no reference list; skip those.
        if not isinstance(references, dict):
            continue

        if len(references) == 0:
            print(f'No references found in {url}')

        # Keep only references actually cited in the claims, grouping
        # markers that share the same reference text.
        for ref_number, ref_text in references.items():
            if ref_number in citations:
                ref_texts.setdefault(ref_text, []).append(ref_number)

        # Direct lookup of the page ID; nothing is written for a URL that
        # has no entry in document_alignment.
        if url in document_alignment:
            wiki_id = document_alignment[url]
            for ref_text, ref_numbers in ref_texts.items():
                writer.writerow({'Page ID': wiki_id, 'Document URL': url, 'Reference number': ','.join(ref_numbers), 'Reference': ref_text})