In [10]:
import json
import re
from collections import Counter
import csv

def extract_sparql_queries(json_file_path):
    """
    Extracts SPARQL queries from a JSON file and returns them as a list.

    The file is expected to contain an object with a "questions" list whose
    entries look like {"query": {"sparql": ...}}. Entries that do not have
    that shape are skipped instead of aborting the whole extraction.

    Args:
        json_file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        A list with the value stored under question["query"]["sparql"] for
        every well-formed entry, in file order. The list is empty when the
        file cannot be read or parsed, or when it has no "questions" list.
    """
    # Only reading and parsing can fail here, so keep the try body minimal.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"An error occurred: {e}")
        return []

    # The top-level value may legitimately be something other than an object.
    questions = data.get("questions") if isinstance(data, dict) else None
    if not isinstance(questions, list):
        return []

    sparql_queries = []
    for question in questions:
        # Guard each level so a null entry or a plain-string "query" is
        # skipped rather than raising TypeError.
        query = question.get("query") if isinstance(question, dict) else None
        if isinstance(query, dict) and "sparql" in query:
            sparql_queries.append(query["sparql"])
    return sparql_queries

def extract_and_count_predicates(queries):
    """
    Tallies every orkgp:-prefixed predicate that appears in the given queries.

    Args:
        queries: Iterable of SPARQL query strings.

    Returns:
        A Counter mapping each matched token (prefix included, e.g.
        "orkgp:P31") to the number of times it occurs across all queries.
    """
    # "orkgp:" at a word boundary, followed by a run of letters, digits or
    # underscores.
    predicate_re = re.compile(r'\borkgp:[a-zA-Z0-9_]+\b')
    tally = Counter()
    for sparql in queries:
        # Feed each query's matches straight into the running tally.
        tally.update(predicate_re.findall(sparql))
    return tally

def save_to_csv(data, filename):
    """
    Writes predicate frequencies to a CSV file, most frequent first.

    Args:
        data: Mapping of predicate -> frequency (anything exposing .items()).
        filename: Destination path; the file is created or overwritten.

    The output starts with a "predicate,frequency" header row, followed by
    one row per entry in descending frequency order. Entries with equal
    frequency keep the order in which the mapping yields them.
    """
    columns = ['predicate', 'frequency']
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()

        # sorted() is stable, so ties stay in the mapping's own order.
        ranked = sorted(data.items(), key=lambda pair: pair[1], reverse=True)

        writer.writerows(
            {'predicate': name, 'frequency': freq} for name, freq in ranked
        )
# --- Script entry: extract queries, count predicates, write the report ---

# Input JSON file and the CSV file the frequency table is written to.
json_file_path = '/Users/sherrypan/GitHub/GAR_SKGQA/datasets/sciqa/all.json'
output_csv_filename = 'orkgp_predicate_frequency_all.csv'

# Step 1: collect the SPARQL queries stored in the JSON file.
sparql_queries = extract_sparql_queries(json_file_path)

if not sparql_queries:
    # Nothing to count: the list is empty, so no CSV is written.
    print("No queries were found or an error occurred during extraction. No CSV file was generated.")
else:
    # Step 2: count how often each orkgp predicate occurs.
    frequency_list = extract_and_count_predicates(sparql_queries)

    # Step 3: write the counts to the CSV file.
    save_to_csv(frequency_list, output_csv_filename)

    print(f"Successfully processed {len(sparql_queries)} queries.")
    print(f"Predicate frequency list has been saved to '{output_csv_filename}'.")

Successfully processed 2565 queries.
Predicate frequency list has been saved to 'orkgp_predicate_frequency_all.csv'.
