##### Filtering Corpus by Agency

In [1]:
import json

# Function to filter news stories by media and save to a new file
def filter_by_media(input_file_path, media_name, output_file_path):
    """Write the stories published by one media outlet to a new JSON file.

    Args:
        input_file_path: Path to a UTF-8 JSON file containing a list of story
            dicts.
        media_name: Value a story's 'media' field must equal exactly
            (case-sensitive) for the story to be kept.
        output_file_path: Path of the JSON file to create (or overwrite) with
            the matching stories.

    Side effects:
        Writes ``output_file_path`` and prints a one-line summary.
    """
    # Load data from the input file
    with open(input_file_path, 'r', encoding='utf-8') as f:
        news_stories = json.load(f)

    # Records that lack a 'media' field are skipped instead of aborting the
    # whole run with a KeyError.
    filtered_stories = [
        story for story in news_stories
        if 'media' in story and story['media'] == media_name
    ]

    # Save the filtered stories to a new file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(filtered_stories, f, ensure_ascii=False, indent=4)

    num_filtered_stories = len(filtered_stories)

    print(f"{num_filtered_stories} news stories from {media_name} saved to {output_file_path}")

# Example usage: keep only the New York Times stories from the cleaned corpus.
media_name = "The New York Times"
input_file_path = 'clean_corpus.json'
output_file_path = 'NYTimes_stories.json'

filter_by_media(
    input_file_path=input_file_path,
    media_name=media_name,
    output_file_path=output_file_path,
)

563 news stories from The New York Times saved to NYTimes_stories.json


##### Filtering Corpus by a Keyword

In [2]:
import json

def def_keyword_filter(input_file_path, keyword, output_file_path='ChatGPT_corpus.json'):
    """Save every story whose text mentions ``keyword`` to a JSON file.

    Matching is a case-insensitive substring test against each story's
    'story_text' field. Stories whose 'story_text' is missing or is not a
    string are skipped.

    Args:
        input_file_path: Path to a UTF-8 JSON file containing a list of story
            dicts.
        keyword: Text to look for.
        output_file_path: Where to write the matching stories. Defaults to
            'ChatGPT_corpus.json', the name this function has always used, so
            existing two-argument calls behave as before; pass a different
            path when filtering on another keyword.

    Returns:
        The path the filtered stories were written to.
    """
    # Load cleaned corpus from JSON file
    with open(input_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    keyword_lower = keyword.lower()

    def mentions_keyword(item):
        # Guard against records with no usable text so that one malformed
        # story does not abort the whole filter.
        text = item.get('story_text')
        return isinstance(text, str) and keyword_lower in text.lower()

    filtered_stories = [item for item in data if mentions_keyword(item)]

    # Save the filtered stories
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(filtered_stories, f, ensure_ascii=False, indent=4)

    print(f"Filtered {len(filtered_stories)} stories containing the keyword '{keyword}'.")

    return output_file_path

# Example usage: collect every story in the cleaned corpus that mentions ChatGPT.
keyword = 'ChatGPT'
input_file_path = 'clean_corpus.json'
output_file_path = def_keyword_filter(input_file_path=input_file_path, keyword=keyword)
print(f"Filtered stories saved to: {output_file_path}")

Filtered 781 stories containing the keyword 'ChatGPT'.
Filtered stories saved to: ChatGPT_corpus.json


##### Filtering Corpus by Time

In [3]:
import json
from datetime import datetime

def def_filter(input_file_path, target_month_str, target_year, output_file_path='Mar2024_corpus.json'):
    """Save the stories released in a given month and year to a JSON file.

    A story's 'time' field is parsed with the format '%B %d, %Y'
    (e.g. 'March 5, 2024'). Stories whose 'time' is missing, not a string,
    equal to "N/A", or not in that format are skipped.

    Args:
        input_file_path: Path to a UTF-8 JSON file containing a list of story
            dicts.
        target_month_str: Full month name to keep, compared exactly against
            ``strftime('%B')`` of the parsed date (e.g. 'March').
        target_year: Four-digit year to keep, as an int.
        output_file_path: Where to write the matching stories. Defaults to
            'Mar2024_corpus.json', the name this function has always used, so
            existing three-argument calls behave as before; pass a different
            path when filtering on another month.

    Returns:
        The path the filtered stories were written to.
    """
    # Load cleaned corpus from JSON file
    with open(input_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_stories = []
    for item in data:
        raw_time = item.get('time')
        # A missing or non-string timestamp cannot be parsed; treat it like "N/A".
        if not isinstance(raw_time, str) or raw_time == "N/A":
            continue
        try:
            story_date = datetime.strptime(raw_time, '%B %d, %Y')
        except ValueError:
            # Timestamp is in some other format: skip this story.
            continue
        if story_date.strftime('%B') == target_month_str and story_date.year == target_year:
            filtered_stories.append(item)

    # Save the filtered stories
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(filtered_stories, f, ensure_ascii=False, indent=4)

    print(f"Filtered {len(filtered_stories)} stories released in {target_month_str} {target_year}.")

    return output_file_path

# Example usage: collect the stories released in March 2024.
target_year = 2024
target_month_str = 'March'
input_file_path = 'clean_corpus.json'
output_file_path = def_filter(
    input_file_path=input_file_path,
    target_month_str=target_month_str,
    target_year=target_year,
)
print(f"Filtered stories saved to: {output_file_path}")

Filtered 150 stories released in March 2024.
Filtered stories saved to: Mar2024_corpus.json


##### Merging All PDF Files into One PDF File

In [6]:
import os
import PyPDF2

def merge_pdf_files(directory_path, output_file_path):
    """Concatenate every PDF in a directory into a single PDF.

    Files are merged in name-sorted order, all pages of each file in turn.

    Args:
        directory_path: Directory to scan (not recursively) for PDF files.
        output_file_path: Path of the merged PDF to create or overwrite. It may
            live inside ``directory_path``; it is then left out of the inputs.
    """
    # Initialize PDF writer
    pdf_writer = PyPDF2.PdfWriter()

    # If the output lives in the scanned directory, a second run would
    # otherwise merge the previous result back into itself and double the corpus.
    output_abs_path = os.path.abspath(output_file_path)

    # List PDF files in the directory, sorted by name. The extension check is
    # case-insensitive so '.PDF' files are not silently dropped.
    pdf_files = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith('.pdf')
        and os.path.abspath(os.path.join(directory_path, name)) != output_abs_path
    )

    # Iterate over PDF files and merge all pages from each file
    for pdf_file in pdf_files:
        # Hand PdfReader the path, as separate_news_stories does, rather than a
        # file handle that is already closed by the time the writer runs.
        # NOTE(review): assumes PdfReader manages the file itself when given a
        # path - confirm against the installed PyPDF2 version.
        pdf_reader = PyPDF2.PdfReader(os.path.join(directory_path, pdf_file))
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)

    # Write the merged PDF to the output file
    with open(output_file_path, 'wb') as output_file:
        pdf_writer.write(output_file)

# Example usage
# Replace with the desired output PDF file path
output_file_path = '/Users/QuangAP/Quang_Apollo/Dissertation_Data/NexisUni/merged_corpus.pdf'
# Replace with the path to your directory containing PDF files
directory_path = '/Users/QuangAP/Quang_Apollo/Dissertation_Data/NexisUni'
merge_pdf_files(directory_path=directory_path, output_file_path=output_file_path)


##### Separating Corpus to Individual PDF Files

In [8]:
import os

import PyPDF2

def separate_news_stories(input_file_path, output_directory):
    """Split a merged PDF corpus into one PDF per news story.

    A story is taken to end on any page whose extracted text contains
    "End of Document". Stories are written as ``news_story_<n>.pdf`` with
    ``n`` starting at 1. Pages left over after the last marker are written
    as one final file.

    Args:
        input_file_path: Path to the merged PDF corpus.
        output_directory: Directory to receive the per-story PDFs; created if
            it does not exist.
    """
    # Create the target directory up front (as sampling_50 does) so the first
    # write does not fail on a fresh machine.
    os.makedirs(output_directory, exist_ok=True)

    # Initialize PDF reader
    pdf_reader = PyPDF2.PdfReader(input_file_path)

    # Get the total number of pages in the PDF
    total_pages = len(pdf_reader.pages)

    def write_story(first_page, last_page, number):
        # Copy pages first_page..last_page (inclusive) into their own PDF.
        pdf_writer = PyPDF2.PdfWriter()
        for i in range(first_page, last_page + 1):
            pdf_writer.add_page(pdf_reader.pages[i])
        output_file_path = os.path.join(output_directory, f'news_story_{number}.pdf')
        with open(output_file_path, 'wb') as output_file:
            pdf_writer.write(output_file)

    # Initialize variables for tracking news stories
    start_page = 0
    story_num = 1

    # Iterate through each page to identify and separate news stories
    for page_num in range(total_pages):
        # 'or ""' guards against a page for which no text comes back.
        page_text = pdf_reader.pages[page_num].extract_text() or ""
        if "End of Document" in page_text:
            write_story(start_page, page_num, story_num)
            # Update variables for the next news story
            start_page = page_num + 1
            story_num += 1

    # Check if there are any remaining pages after the last news story
    if start_page < total_pages:
        write_story(start_page, total_pages - 1, story_num)

# Example usage
# Replace with the directory to save separated news stories
output_directory = '/Users/QuangAP/Quang_Apollo/Dissertation_Data/NexisUni/separated_stories'
# Replace with the path to your PDF corpus
input_file_path = '/Users/QuangAP/Quang_Apollo/Dissertation_Data/NexisUni/merged_corpus.pdf'
separate_news_stories(input_file_path=input_file_path, output_directory=output_directory)


##### Sampling 50 Single Stories 

In [9]:
import os
import random
import shutil

def sampling_50(input_directory, sample_num, output_directory, seed=None):
    """Copy a random sample of PDF files from one directory into another.

    Args:
        input_directory: Directory holding the candidate '.pdf' files.
        sample_num: Number of files to sample. If fewer PDFs are available,
            all of them are copied.
        output_directory: Directory to copy the sampled files into; created
            (including parents) if it does not exist.
        seed: Optional seed. When given, the same seed and the same set of
            input files always yield the same sample, so re-running the cell
            does not add a different batch of files to ``output_directory``.
            When None (the default) the module-level ``random`` generator is
            used, as before.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Keep only PDF files. Sorting removes the dependence on os.listdir's
    # arbitrary ordering, which is what makes a seeded sample repeatable.
    pdf_files = sorted(name for name in os.listdir(input_directory) if name.endswith('.pdf'))

    # A private generator keeps a seeded call from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    sampled_files = rng.sample(pdf_files, min(sample_num, len(pdf_files)))

    # Copy the sampled files to the output directory
    for file_name in sampled_files:
        src_file_path = os.path.join(input_directory, file_name)
        dst_file_path = os.path.join(output_directory, file_name)
        shutil.copyfile(src_file_path, dst_file_path)

# Example usage
# Number of stories to sample
sample_num = 50
# Replace with the path to your separated stories directory
input_directory = '/Users/QuangAP/Quang_Apollo/Dissertation_Data/NexisUni/separated_stories'
# Replace with the path to your output directory
output_directory = '/Users/QuangAP/Quang_Apollo/Dissertation_Data/NexisUni/50sample_stories'
sampling_50(
    input_directory=input_directory,
    sample_num=sample_num,
    output_directory=output_directory,
)


##### Assigning a Frame to Each Story by Keyword Matching

In [3]:
import os
import json
from nltk.tokenize import word_tokenize

def determine_frame(input_file_path, frame_path, output_file_path):
    """Label each tokenized story with the frame whose keywords it hits most.

    Every '.txt' file in ``frame_path`` defines one frame: the file name
    (without extension) is the frame name and each non-blank line is a
    keyword, lower-cased on load. A story's score for a frame is the number
    of its tokens that appear in that frame's keyword set. The highest score
    wins; ties, including a story that matches nothing, go to the frame
    whose file name sorts first.

    Args:
        input_file_path: Path to a UTF-8 JSON file containing a list of
            documents, each a list of tokens. Tokens are compared as-is
            against the lower-cased keywords (presumably they are already
            lower-cased by preprocessing - verify against the producer).
        frame_path: Directory containing the frame keyword files.
        output_file_path: Path of the JSON file to create or overwrite with
            the list of frame names, one per document in input order.

    Raises:
        ValueError: If there are documents to label but ``frame_path``
            contains no '.txt' frame files.
    """
    # Function to read frames from files
    def load_frames(directory):
        frames = {}
        # Sorted so tie-breaking below does not depend on os.listdir's
        # arbitrary ordering.
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.txt'):
                frame_name = filename.rsplit('.', 1)[0]
                # Explicit encoding: the platform default is not UTF-8 everywhere.
                with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                    # A set gives constant-time membership tests per token.
                    frames[frame_name] = {line.strip().lower() for line in file if line.strip()}
        return frames

    # Function to determine the frame of a document
    def get_frame(tokenized_doc, frames):
        frame_scores = {frame: 0 for frame in frames}

        for word in tokenized_doc:
            # Non-string tokens can never equal a keyword; skipping them also
            # avoids hashing errors on unhashable values.
            if not isinstance(word, str):
                continue
            for frame, keywords in frames.items():
                if word in keywords:
                    frame_scores[frame] += 1

        # Return the frame with the highest score (first one wins on ties)
        return max(frame_scores, key=frame_scores.get)

    # Load frames from the directory
    frames = load_frames(frame_path)

    # Load tokenized stories from JSON file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        tokenized_stories = json.load(file)

    if tokenized_stories and not frames:
        raise ValueError(f"No .txt frame files found in {frame_path}")

    # Determine the frame of each document in the corpus
    document_frames = [get_frame(document, frames) for document in tokenized_stories]

    # Save the frames results into a new JSON file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(document_frames, file)

    print(f"Frames have been saved to {output_file_path}")

# Example usage: label the 200 preprocessed stories with their dominant frame.
frame_path = '/Users/QuangAP/Quang_Apollo/Frames'
input_file_path = 'preprocessed_200.json'
output_file_path = 'framed_200.json'

determine_frame(
    input_file_path=input_file_path,
    frame_path=frame_path,
    output_file_path=output_file_path,
)


Frames have been saved to framed_200.json
