In [None]:
import json
import pandas as pd

def _extract_json_payload(text):
    """Return the JSON text embedded in a model response string.

    The response may be bare JSON, or JSON wrapped in a ```json ... ``` fence
    that is optionally preceded or followed by free-form prose. The fence is
    located anywhere in the text (not only at the very start), and only the
    text between the opening and closing fence is returned.

    Args:
        text: The raw "output" string of one JSONL record.

    Returns:
        The substring expected to hold the JSON document. When no ```json
        fence is present the input is returned unchanged.
    """
    fence_open = "```json"
    start = text.find(fence_open)
    if start == -1:
        # No fenced block: treat the whole response as the JSON document.
        return text
    body_start = start + len(fence_open)
    end = text.find("```", body_start)
    # Tolerate a missing closing fence; json.loads will report bad content.
    return text[body_start:] if end == -1 else text[body_start:end]


# Initialize an empty list to store data for each row
rows = []

# Read the JSONL file line by line (explicit UTF-8 so the result does not
# depend on the platform's default encoding)
with open("combined_results_2.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        # Blank lines carry no record; skip them instead of reporting an error
        if not line.strip():
            continue

        try:
            # Parse the JSON data from each line
            item = json.loads(line.strip())

            # Extract dataset_name and dataset_description
            dataset_name = item.get("input", {}).get("dataset_name", "Unknown")
            dataset_description = item.get("input", {}).get("dataset_description", "Unknown")

            # The output is a string that holds JSON, possibly inside a
            # ```json fence with explanatory prose around it
            modalities = _extract_json_payload(item.get("output", ""))
            modalities_data = json.loads(modalities)  # Parse the string as JSON

            # Collect all modality titles
            titles = [
                modality.get("title", "Unknown")
                for modality in modalities_data.get("modalities", [])
            ]

            # Add data to the row
            rows.append({
                "dataset_name": dataset_name,
                "dataset_description": dataset_description,
                "modalities_titles": ", ".join(titles)  # Join titles with a comma separator
            })

        except Exception as e:
            # Best-effort import: report the offending line and keep going
            print(f"Error processing entry, skipping this line: {line}, Error: {e}")

# Create a DataFrame
df = pd.DataFrame(rows)

# Save as a CSV file
df.to_csv("output_jiewen.csv", index=False)
print("Data has been successfully saved to output_jiewen.csv!")

Error processing entry, skipping this line: {"custom_id": "task-4818", "input": {"dataset_name": "Helix", "dataset_description": "See https://zenodo.org/record/5500215#.YUCgD51Kg2w"}, "output": "I'm unable to access external content such as the specific dataset description from Zenodo. However, I can provide a general structure for a multimodal dataset based on common modalities and data types. If you have specific details about the \"Helix\" dataset, please provide them, and I can tailor the response accordingly.\n\n```json\n{\n    \"modalities\": [\n        {\n            \"title\": \"text\",\n            \"explanation\": \"This modality includes any textual data that might be part of the dataset.\",\n            \"data_types\": [\n                {\n                    \"title\": \"description\",\n                    \"explanation\": \"Textual descriptions or annotations related to the dataset.\"\n                }\n            ]\n        },\n        {\n            \"title\": \"imag

In [None]:
import json
import pandas as pd

# Accumulates one record (name + joined modality titles) per entry
rows = []

# Load the JSON file and turn each entry into a record
with open("results.json", "r") as file:
    try:
        data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"JSON format error: {e}. Please check the file content!")
    else:
        for entry in data:
            # Keep only the part of the name that precedes " ("
            short_name = entry.get("name", "Unknown").split(" (")[0]

            # Gather the title of every modality listed under tags
            entry_titles = []
            for tag in entry.get("tags", {}).get("modalities", []):
                entry_titles.append(tag.get("title", "Unknown"))

            rows.append({
                "name": short_name,
                "modalities_titles": ", ".join(entry_titles)  # Comma-separated titles
            })

# Write the CSV only when at least one record was produced
if not rows:
    print("No data was generated. The file might be empty or all entries have issues.")
else:
    df = pd.DataFrame(rows)
    df.to_csv("output_yu.csv", index=False)
    print("DataFrame has been successfully saved to output_yu.csv")

DataFrame has been successfully saved to output_yu.csv


In [None]:
import pandas as pd

# Read the two per-source CSV files
output_jiewen = pd.read_csv("output_jiewen.csv")
output_yu = pd.read_csv("output_yu.csv")

# Merge the two tables on the dataset name. The join key is called
# "dataset_name" in one file and "name" in the other, so it is renamed first.
# An inner join keeps only names present in both files; a name that occurs
# several times on either side yields one output row per matching pair.
# NOTE(review): no normalisation is applied to the key here, so names must
# match exactly (including case and any parenthetical suffix) — confirm that
# both inputs were prepared the same way.
merged_df = pd.merge(
    output_jiewen.rename(columns={"dataset_name": "name"}),
    output_yu,
    on="name",
    how="inner",  # Keep only the common rows
    suffixes=("_jiewen", "_yu"),  # Disambiguate the shared "modalities_titles" column
)

# Select the columns of interest and give the title columns shorter names.
# An explicit mapping is used so each new name is tied to its source column
# rather than to its position.
final_df = merged_df[
    ["name", "dataset_description", "modalities_titles_jiewen", "modalities_titles_yu"]
].rename(
    columns={
        "modalities_titles_jiewen": "modalities_jiewen",
        "modalities_titles_yu": "modalities_yu",
    }
)

# Save the final result to a new CSV file
final_df.to_csv("merged_output.csv", index=False)
print("The merged file has been successfully saved as merged_output.csv")


The merged file has been successfully saved as merged_output.csv


In [None]:
import pandas as pd

# Load the merged table back from disk
file_path = "merged_output.csv"  # Replace with your file path
df = pd.read_csv(file_path)

# Report how many rows the table holds
row_total = df.shape[0]
print(f"The total number of rows in the file is: {row_total}")

The total number of rows in the file is: 10224


In [None]:
# Count the rows whose third and fourth columns hold exactly the same value
# (element-wise equality, so two missing values do not count as equal)
third_column = df.iloc[:, 2]
fourth_column = df.iloc[:, 3]
same_count = third_column.eq(fourth_column).sum()

# Report the count
print(f"The number of rows where the third and fourth columns are the same is: {same_count}")

The number of rows where the third and fourth columns are the same is: 6252


In [None]:
# Keep only the rows whose third and fourth columns differ
different_rows_df = df.loc[df.iloc[:, 2].ne(df.iloc[:, 3])]

In [None]:
# Draw 20 of the differing rows at random; the fixed random_state makes the
# same rows come out on every run
sample_df = different_rows_df.sample(
    random_state=42,
    n=20,
)
print(sample_df)

                                                   name  \
7371                                               ATUE   
3252                           twitter politicians data   
6279                                           AnoShift   
3219  NISP- A Multi-lingual Multi-accent Dataset for...   
7457                                       MO-Gymnasium   
6233  Replication Data for: The use of differential ...   
7446                                PACE 2022 Heuristic   
7065                                          Wild-Time   
8567  3D-Point Cloud dataset of various geometrical ...   
4136                        Dark Machines Anomaly Score   
8139                                       TriMouse-161   
8193                                         DeepPatent   
1991                                                H3D   
3470                                           TCR-pMHC   
9385                          Student-Teacher Prompting   
5470                          Simulated EM showers data 

In [None]:
# Draw 20 of the differing rows at random (fixed random_state for reproducibility)
sample_df = different_rows_df.sample(
    random_state=42,
    n=20,
)

# Merge the third and fourth columns, remove duplicates, and combine them into a string
def merge_and_deduplicate(row):
    """Combine the comma-separated titles of a row's third and fourth fields.

    Each field is split on commas and every fragment is stripped of
    surrounding whitespace before comparison, so "image" and " image" count
    as the same title. Duplicates are dropped while keeping first-seen order,
    which makes the result deterministic. Fields that are not strings (for
    example a missing value) and empty fragments are ignored.

    Args:
        row: A pandas Series whose positions 2 and 3 hold the two
            comma-separated title strings.

    Returns:
        The distinct titles joined with "," (empty string when there are none).
    """
    merged = []
    seen = set()
    for cell in (row.iloc[2], row.iloc[3]):
        if not isinstance(cell, str):
            # Missing or non-text value: nothing to contribute
            continue
        for fragment in cell.split(","):
            title = fragment.strip()
            if title and title not in seen:
                seen.add(title)
                merged.append(title)
    return ",".join(merged)  # Combine the list into a single string

# Compute the merged title string for every sampled row and store it on sample_df
sample_df["merged_columns"] = sample_df.apply(merge_and_deduplicate, axis=1)

# Build the export table: name, description, and the merged titles
result_df = sample_df.loc[:, ["name", "dataset_description", "merged_columns"]].copy()

# Show the processed result
print(result_df)

# Write the export table to disk
output_path = "sampled_different_rows_with_metadata.csv"
result_df.to_csv(output_path, index=False)
print(f"The result has been saved to {output_path}")


                                                   name  \
7371                                               ATUE   
3252                           twitter politicians data   
6279                                           AnoShift   
3219  NISP- A Multi-lingual Multi-accent Dataset for...   
7457                                       MO-Gymnasium   
6233  Replication Data for: The use of differential ...   
7446                                PACE 2022 Heuristic   
7065                                          Wild-Time   
8567  3D-Point Cloud dataset of various geometrical ...   
4136                        Dark Machines Anomaly Score   
8139                                       TriMouse-161   
8193                                         DeepPatent   
1991                                                H3D   
3470                                           TCR-pMHC   
9385                          Student-Teacher Prompting   
5470                          Simulated EM showers data 

In [None]:
import pandas as pd

# Load the verification table
verification_df = pd.read_csv("verification_data.csv")

# Inner-join the sampled rows with the verification table on the shared 'name' column
merged_df = sample_df.merge(verification_df, how="inner", on="name")

# Tokenise a cell into distinct words; commas and whitespace both act as delimiters
def split_words(text):
    """Return the set of distinct words in ``text``.

    Commas and any whitespace separate words. A missing value (as detected by
    ``pd.isna``) yields an empty set.
    """
    if pd.isna(text):  # Missing value: no words
        return set()
    words = set()
    for chunk in text.split(','):
        # Each comma-delimited chunk may itself hold several space-separated words
        words.update(chunk.split())
    return words

# Pool the distinct words of each source over every merged row.
# Note: the vocabularies are pooled across all rows, so the ratios below
# compare whole-sample word sets rather than per-row matches.
total_jiewen_words = set().union(
    *(split_words(cell) for cell in merged_df['modalities_jiewen'])
)
total_yu_words = set().union(
    *(split_words(cell) for cell in merged_df['modalities_yu'])
)
total_verification_words = set().union(
    *(split_words(cell) for cell in merged_df['Verification'])
)

# Share of jiewen's words that also appear in the verification words
common_jiewen_words = total_verification_words & total_jiewen_words
jiewen_percentage = (
    len(common_jiewen_words) / len(total_jiewen_words) if total_jiewen_words else 0
)

# Share of yu's words that also appear in the verification words
common_yu_words = total_verification_words & total_yu_words
yu_percentage = (
    len(common_yu_words) / len(total_yu_words) if total_yu_words else 0
)

# Report both ratios as percentages
print(f"jiewen_percentage: {jiewen_percentage:.2%}")
print(f"yu_percentage: {yu_percentage:.2%}")

jiewen_percentage: 81.25%
yu_percentage: 60.00%
