In [4]:
import os
import pandas as pd
# Resolve dataset locations relative to the directory the kernel was started in.
# NOTE: despite its name, `current_file` holds the current working *directory*.
current_file = os.getcwd()
parent_dir = os.path.dirname(current_file)

# The full dataset is read from the working directory; the reduced one from
# the "reduced" folder under the parent directory.
dataset_deep = os.path.join(current_file, "deep_distributed_graph_dataset.csv")
dataset_reduced = os.path.join(
    parent_dir,
    "reduced",
    "reduced_deep_distributed_graph_dataset.csv",
)

# Load both CSV files (deep first, then reduced).
deep_graph_df, reduced_graph_df = (
    pd.read_csv(csv_path) for csv_path in (dataset_deep, dataset_reduced)
)

In [7]:
from tabulate import tabulate

# Columns exported to the Markdown table, in output order.
COLUMNS_TO_INCLUDE = [
    "use_case",
    "retrieval_operation",
    "based_on_template",
    "updated template",
    "semi-typed"
]

# Fail early with a readable message if the dataset lacks an expected column
# (plain column selection would also raise KeyError, but less clearly).
missing_columns = [col for col in COLUMNS_TO_INCLUDE if col not in deep_graph_df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from deep_graph_df: {missing_columns}")
df_selected = deep_graph_df[COLUMNS_TO_INCLUDE]

# Convert the selected DataFrame to a Markdown table.
# The 'pipe' format produces the usual `| a | b |` Markdown table syntax.
markdown_table = tabulate(df_selected, headers='keys', tablefmt='pipe', showindex=False)

# Write with an explicit encoding so non-ASCII cell text does not depend on
# the platform's default encoding.
with open("output_table.md", "w", encoding="utf-8") as f:
    f.write(markdown_table)

# Report success only after the file has been closed (and therefore flushed).
print("Markdown table saved to output_table.md")

Markdown table saved to output_table.md


In [7]:

# Keep only the rows whose topic_entity_id differs from "R659055";
# rows carrying that id are dropped.
# NOTE(review): an earlier version of this comment could be read as the
# opposite selection (keep only "R659055") — confirm the intended direction.
filtered_deep_graph_df = deep_graph_df.loc[deep_graph_df["topic_entity_id"] != "R659055"]

# Write the filtered rows (without the index) into the "reduced" folder
# under the parent directory.
filtered_deep_graph_df.to_csv(
    os.path.join(parent_dir, "reduced", "topic_entity_test_dataset.csv"),
    index=False,
)


In [2]:
# Overlay the reduced dataset onto the deep dataset, matching rows by "uid".
# For every column that exists in both frames the value from reduced_graph_df
# overwrites the one in deep_graph_df; columns that exist only in
# deep_graph_df keep their values. deep_graph_df is modified in place.

# Columns that can be overwritten; computed once because it does not change
# while iterating.
shared_columns = [col for col in deep_graph_df.columns if col in reduced_graph_df.columns]

for _, reduced_row in reduced_graph_df.iterrows():
    # Build the row mask once per uid instead of once per column.
    uid_mask = deep_graph_df["uid"] == reduced_row["uid"]
    if not uid_mask.any():
        continue  # uid not present in the deep dataset
    for column in shared_columns:
        deep_graph_df.loc[uid_mask, column] = reduced_row[column]

# Mark every deep row whose uid appears in the reduced dataset.
# dropna() keeps the semantics of an `==` comparison, where a missing uid
# never matches anything (isin would otherwise match NaN with NaN).
deep_graph_df["used_in_reduced"] = deep_graph_df["uid"].isin(reduced_graph_df["uid"].dropna())

# Save the modified deep_graph_df to a new CSV file in the working directory.
output_file = os.path.join(current_file, "modified_deep_graph_dataset.csv")
deep_graph_df.to_csv(output_file, index=False)
print(f"Modified deep graph dataset saved to {output_file}")

Modified deep graph dataset saved to /home/marco/master_thesis_implementation/sqa-system/experiments/qa_datasets/qa_datasets/full/modified_deep_graph_dataset.csv


In [13]:
# Print the name of every column in `df`, one per line, under a heading.
# NOTE(review): `df` is not assigned in any of the cells shown here — it
# presumably comes from an earlier or removed cell; confirm this cell still
# works after Restart Kernel -> Run All.
columns = df.columns
print("Columns in the dataset:")
for column in columns:
    print(column)

Columns in the dataset:
uid
semi-typed
question
golden_answer
source_ids
golden_doc_chunks
golden_triples
is_generated_with
topic_entity_id
topic_entity_value
hops
based_on_template
updated template
use_case
retrieval_operation
graph_representation
answer_format
answer_type
condition_type


In [14]:
import ast
def parse_list_string(list_string):
    """Turn a stringified Python literal (e.g. ``"[('a', 'b')]"``) into an object.

    Args:
        list_string: The value to parse. Usually a string holding a list
            literal; may also be an already-parsed list or a missing value.

    Returns:
        - ``list_string`` unchanged if it is already a list, so applying the
          function twice to the same column is harmless;
        - the result of ``ast.literal_eval`` if it is a string that parses;
        - ``[]`` for any other input (``None``, NaN, numbers, ...) and for
          strings that cannot be parsed (a message is printed in that case).
    """
    # Must be checked first: the previous order returned [] for every
    # non-string before reaching this branch, and calling pd.isna on a list
    # yields an array whose truth value is ambiguous.
    if isinstance(list_string, list):
        return list_string
    # Missing values (None / NaN / pd.NA) are not strings, so this single
    # guard covers them as well as any other unexpected type.
    if not isinstance(list_string, str):
        return []
    try:
        return ast.literal_eval(list_string)
    except (ValueError, SyntaxError):
        print(f"Error parsing list string: {list_string}")
        return []

# Replace each 'golden_triples' cell with the result of parse_list_string.
# NOTE(review): `df` is not assigned in any of the cells shown here, and this
# statement overwrites the column in place — confirm where `df` comes from and
# that re-running the cell is safe.
df['golden_triples'] = df['golden_triples'].apply(parse_list_string)
