##### Process Overview
- Inputs: PDF-directory (holds PDF files)

- Outputs: clean_corpus.json file 

Step 1: Creating a "Raw" Corpus

In [3]:
import json
import os
import re

import fitz  # PyMuPDF

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, or an empty string when
        the file cannot be opened or read (the error is printed, not raised).
    """
    try:
        # Opening the document as a context manager closes it even when a
        # page fails to load, so no file handle is leaked per processed PDF.
        with fitz.open(pdf_path) as doc:
            page_texts = []
            for page_num in range(len(doc)):
                page_text = doc.load_page(page_num).get_text()
                print(f"Page {page_num} text length: {len(page_text)}")
                page_texts.append(page_text)
        return "".join(page_texts)
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole batch.
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

# Function to extract the date format
def extract_date_format(time_str):
    """Keep only the first three whitespace-separated tokens of a string.

    Args:
        time_str: Text such as "March 5, 2024 Tuesday".

    Returns:
        The first three tokens joined by single spaces (fewer if the input
        has fewer tokens; "" for an empty or whitespace-only input).
    """
    leading_tokens = time_str.split()[:3]
    return ' '.join(leading_tokens)

# A full English month name followed by a day and a four-digit year,
# e.g. "March 6, 2024". Groups: (month, day, year).
_STORY_DATE_PATTERN = re.compile(
    r"\b(January|February|March|April|May|June|July|August|September"
    r"|October|November|December)\s+(\d{1,2}),\s*(\d{4})\b"
)


# Function to extract metadata from a news story
def extract_meta_info(news_story):
    """Split one news story into its metadata fields and body text.

    The first line is taken as the header and the second as the media name.
    "Length:", "Byline:" and "Section:" values are read from the lines that
    carry those labels, and the body is everything after the first line
    containing "Body:" (the whole story when no such line exists).

    The date is only taken from a line that contains an actual
    "Month D, YYYY" date. A bare month-name substring is not enough, so
    lines such as "Mayor ..." or "Byline: April Jones" are no longer mistaken
    for the date. A "Load-Date:" line takes precedence; otherwise the first
    dated line is used.

    Args:
        news_story: Raw text of a single story.

    Returns:
        A tuple ``(header, media, time, section, length, byline, body)``.
        ``media`` and ``time`` default to "N/A", ``length`` and ``byline``
        to " ", and ``section`` to "".
    """
    lines = news_story.strip().split('\n')

    # str.split always yields at least one element, so lines[0] is safe.
    header = lines[0].strip()

    media = "N/A"
    if len(lines) > 1 and lines[1].strip() != "":
        media = lines[1].strip()

    length = " "
    byline = " "
    load_date = None   # date found on a "Load-Date:" line (last one wins)
    first_date = None  # first date found on any other line

    for line in lines:
        date_match = _STORY_DATE_PATTERN.search(line)
        if date_match:
            found_date = "{} {}, {}".format(*date_match.groups())
            if "Load-Date:" in line:
                load_date = found_date
            elif first_date is None:
                first_date = found_date

        # Checked independently of the date so that a label line mentioning
        # a month name (e.g. a byline "April Jones") is still recognised.
        if "Length:" in line:
            length = line.split("Length:")[1].strip()
        elif "Byline:" in line:
            byline = line.split("Byline:")[1].strip()

    time = load_date or first_date or "N/A"

    section = ""
    body_start_index = 0
    for i, line in enumerate(lines):
        if "Section:" in line:
            section = line.split("Section:")[1].strip()
        if "Body:" in line:
            body_start_index = i + 1
            break

    # Extract the body text starting from the "Body:" section
    body = '\n'.join(lines[body_start_index:]).strip()

    return header, media, time, section, length, byline, body

# Function to split the extracted text into individual stories
def split_stories(text):
    """Cut extracted PDF text into stories at each "End of Document" marker.

    Args:
        text: Full text extracted from one PDF.

    Returns:
        The list of text segments between markers. The marker itself is
        dropped, and the segment after the final marker is included even
        when it is empty.
    """
    story_delimiter = "End of Document"
    return text.split(story_delimiter)

# Clean time data and save the cleaned data to a JSON file
def extract_and_clean_data(pdf_directory, output_file_path):
    """Extract every story from the PDFs in a directory and save them as JSON.

    Each PDF is converted to text, split into stories, and each story is
    turned into a record with the keys ``file_name``, ``header``, ``media``,
    ``time``, ``section``, ``length``, ``byline`` and ``story_text``. A
    ``time`` value containing no month name is replaced by "N/A".

    Args:
        pdf_directory: Directory holding the PDF files.
        output_file_path: Path of the JSON file to write.

    Returns:
        None. The records are written to ``output_file_path`` and progress
        is printed.
    """
    # Every full month name contains its three-letter abbreviation, so
    # checking the abbreviations covers both spellings.
    month_markers = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    extracted_data = []

    # os.listdir returns names in arbitrary order; sorting makes the corpus
    # order reproducible. The extension check is case-insensitive so files
    # named "*.PDF" are not silently skipped, and it also filters out
    # non-PDF entries such as ".DS_Store".
    pdf_files = sorted(
        name for name in os.listdir(pdf_directory)
        if name.lower().endswith('.pdf')
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing file: {pdf_path}")
        pdf_text = extract_text_from_pdf(pdf_path)
        if not pdf_text:
            print(f"No text extracted from {pdf_file}")
            continue

        for story in split_stories(pdf_text):
            header, media, time, section, length, byline, body = extract_meta_info(story)
            extracted_data.append({
                "file_name": pdf_file,
                "header": header,
                "media": media,
                "time": time,
                "section": section,
                "length": length,
                "byline": byline,
                "story_text": body
            })
        print(f"Extracted text from {pdf_file}")

    cleaned_data = []
    for item in extracted_data:
        if not any(marker in item['time'] for marker in month_markers):
            item['time'] = "N/A"
        cleaned_data.append(item)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=4)
        print(f"Data written to {output_file_path}")

    print(f"Total stories extracted and saved: {len(cleaned_data)}")

# Example usage:
# Directory holding the PDF files. Set the NEXIS_PDF_DIR environment variable
# to use another location without editing this cell; the original path is
# kept as the default.
pdf_directory = os.environ.get(
    "NEXIS_PDF_DIR", "/Users/QuangAP/Quang_Apollo/PhD_Data/NexisUni"
)
output_file_path = 'raw_corpus.json'

extract_and_clean_data(pdf_directory, output_file_path)

Processing file: /Users/QuangAP/Quang_Apollo/PhD_Data/NexisUni/Files700_801.pdf
Page 0 text length: 3785
Page 1 text length: 117
Page 2 text length: 2521
Page 3 text length: 3331
Page 4 text length: 2856
Page 5 text length: 3703
Page 6 text length: 453
Page 7 text length: 2522
Page 8 text length: 2535
Page 9 text length: 1467
Page 10 text length: 2538
Page 11 text length: 2408
Page 12 text length: 2734
Page 13 text length: 935
Page 14 text length: 2527
Page 15 text length: 2653
Page 16 text length: 2153
Page 17 text length: 1063
Page 18 text length: 3803
Page 19 text length: 90
Page 20 text length: 2792
Page 21 text length: 2830
Page 22 text length: 2984
Page 23 text length: 1606
Page 24 text length: 2931
Page 25 text length: 4038
Page 26 text length: 1895
Page 27 text length: 3039
Page 28 text length: 2130
Page 29 text length: 101
Page 30 text length: 3410
Page 31 text length: 52
Page 32 text length: 2513
Page 33 text length: 3462
Page 34 text length: 1092
Page 35 text length: 2745
Pa

Step 2: Arrange the Corpus Chronologically, Rename Files, and Remove Empty Stories

In [4]:
import json
from datetime import datetime

def ordered_and_cleaned_corpus(input_file_path, output_file_path):
    """Sort a corpus by date, drop empty stories and rename the records.

    Records are sorted chronologically by their ``time`` field, with
    unparseable dates placed last. Records whose ``story_text`` is empty or
    whitespace are removed. Each record with a parseable date gets a
    ``file_name`` of the form ``<media>_<Mon><year>``; for the others a
    warning is printed and ``file_name`` is left unchanged.

    Args:
        input_file_path: JSON file holding a list of story records.
        output_file_path: JSON file to write the cleaned records to.

    Returns:
        None. The result is written to ``output_file_path``.
    """
    # Load the corpus from the input file
    with open(input_file_path, 'r', encoding='utf-8') as f:
        corpus = json.load(f)

    def try_parse_date(date_str):
        """Return the datetime for a 'Month D, YYYY' string, or None."""
        # Only the first three tokens are parsed, so a trailing token such
        # as a weekday ("March 5, 2024 Tuesday") does not invalidate the date.
        candidate = ' '.join(date_str.split()[:3])
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(candidate, fmt)
            except ValueError:
                continue
        return None

    def parse_date(date_str):
        """Sort key: the parsed date, or datetime.max for invalid dates."""
        parsed = try_parse_date(date_str)
        return datetime.max if parsed is None else parsed

    # Sort the corpus by the 'time' field and remove news stories with no content
    sorted_and_cleaned_corpus = [
        metadata
        for metadata in sorted(corpus, key=lambda x: parse_date(x['time']))
        if metadata.get('story_text', '').strip()
    ]

    def update_file_name(metadata):
        """Set 'file_name' from the media name and the parsed date."""
        parsed = try_parse_date(metadata["time"])
        if parsed is None:
            # If time field is not in the expected format, skip updating file name
            print(f"Warning: Invalid 'time' format for metadata: {metadata}")
            return metadata
        media = metadata["media"].replace(" ", "_")
        month = metadata["time"].split()[0][:3]  # first three letters of the month
        # The year comes from the parsed date rather than the last token, so
        # trailing text cannot end up in the file name.
        metadata["file_name"] = f"{media}_{month}{parsed.year}"
        return metadata

    # Update file names in metadata
    updated_corpus_metadata = [update_file_name(metadata) for metadata in sorted_and_cleaned_corpus]

    # Written as UTF-8 without ASCII escaping, matching how the raw corpus
    # and the filtered corpus are written elsewhere in this notebook.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(updated_corpus_metadata, f, ensure_ascii=False, indent=4)

    num_cleaned_stories = len(sorted_and_cleaned_corpus)
    print(f"{num_cleaned_stories} news stories have been cleaned and saved to {output_file_path}")

# Example usage: read raw_corpus.json and write the cleaned corpus.
input_file_path, output_file_path = 'raw_corpus.json', 'clean_corpus.json'
ordered_and_cleaned_corpus(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)


1608 news stories have been cleaned and saved to clean_corpus.json


##### Filtering the Corpus by Time (March 2024)

In [4]:
import json
from datetime import datetime

def def_filter(input_file_path, target_month_str, target_year, output_file_path=None):
    """Save the stories published in a given month and year to a JSON file.

    Args:
        input_file_path: JSON file holding a list of story records, each
            with a ``time`` field.
        target_month_str: Full month name to keep, e.g. "March".
        target_year: Year to keep, as an integer.
        output_file_path: Where to write the filtered records. When omitted,
            the name is derived from the target as ``<Mon><year>_corpus.json``
            (e.g. "Mar2024_corpus.json"), so filtering a different month no
            longer writes to a file named after March 2024.

    Returns:
        The path the filtered records were written to.
    """
    def parse_story_date(date_str):
        """Return the datetime for a 'Month D, YYYY' string, or None."""
        # Only the first three tokens are parsed, and both full and
        # abbreviated month names are accepted.
        candidate = ' '.join(date_str.split()[:3])
        for fmt in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(candidate, fmt)
            except ValueError:
                continue
        return None  # invalid date formats, including "N/A"

    if output_file_path is None:
        output_file_path = f"{target_month_str[:3]}{target_year}_corpus.json"

    # Load cleaned corpus from JSON file
    with open(input_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter stories released in the specified month and year
    filtered_stories = []
    for item in data:
        story_date = parse_story_date(item['time'])
        if story_date is None:
            continue
        if story_date.strftime('%B') == target_month_str and story_date.year == target_year:
            filtered_stories.append(item)

    # Save filtered stories to a new JSON file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(filtered_stories, f, ensure_ascii=False, indent=4)

    print(f"Filtered {len(filtered_stories)} stories released in {target_month_str} {target_year}.")

    return output_file_path

# Example usage: keep only the stories dated March 2024.
input_file_path = 'clean_corpus.json'
target_month_str, target_year = 'March', 2024
output_file_path = def_filter(
    input_file_path=input_file_path,
    target_month_str=target_month_str,
    target_year=target_year,
)
print(f"Filtered stories saved to: {output_file_path}")


Filtered 150 stories released in March 2024.
Filtered stories saved to: Mar2024_corpus.json
