<a href="https://colab.research.google.com/github/SingularityNET-Archive/LLM-Development/blob/main/snet_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A script to take a JSON file and clean it up

In [1]:
# A script to take a JSON file and clean it up

import json
import re

def clean_json(input_file, output_file):
    """Read a JSON file, apply basic cleaning, and write the result as JSON.

    If the file does not parse, one repair attempt is made by stripping
    trailing commas that precede a closing ``}`` or ``]`` and parsing again.

    Cleaning applied to the top-level container:
      * dict: keys whose value is None are removed and string values are
        stripped of surrounding whitespace.
      * list: string items are stripped; other scalars (including None)
        are kept as they are.
      * nested dicts and lists are handed to ``clean_json_helper``.
    Any other top-level value is written back unchanged.

    Args:
        input_file: Path of the JSON file to read (decoded as UTF-8).
        output_file: Path the cleaned JSON is written to (4-space indent).

    Returns:
        None. When the input file is missing, or the JSON cannot be parsed
        even after the repair attempt, a message is printed and no output
        file is written.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            # Read the text once: json.load() would exhaust the stream, leaving
            # nothing for the repair attempt below to work on.
            raw_text = f.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return

    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        # Attempt to fix common JSON errors (e.g., trailing commas).
        # The regexes are not string-aware, so they are only applied once the
        # strict parse has already failed.
        repaired_text = re.sub(r",\s*\]", "]", re.sub(r",\s*}", "}", raw_text))
        try:
            data = json.loads(repaired_text)
        except json.JSONDecodeError as e2:
            print(f"Could not fix JSON errors, error is: {e2}")
            return

    # Example cleaning operations (customize as needed)
    if isinstance(data, dict):
        for key, value in list(data.items()):  # Iterate through a copy to allow deletion
            if value is None:
                del data[key]
            elif isinstance(value, str):
                data[key] = value.strip()
            elif isinstance(value, (dict, list)):
                # Nested containers are cleaned by the helper.
                data[key] = clean_json_helper(value)

    elif isinstance(data, list):
        for i, item in enumerate(data):
            if isinstance(item, str):
                data[i] = item.strip()
            elif isinstance(item, (dict, list)):
                data[i] = clean_json_helper(item)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)

def clean_json_helper(data):
    """Recursively clean a nested JSON value in place and return it.

    Rules applied at every depth:
      * dict: keys whose value is None are deleted; string values are
        stripped of surrounding whitespace; dict and list values are
        cleaned recursively.
      * list: string items are stripped; dict and list items are cleaned
        recursively; other items (including None) are kept as they are.

    Args:
        data: Any value decoded from JSON. Dicts and lists are mutated in
            place; anything else is returned untouched.

    Returns:
        The same object that was passed in.
    """
    if isinstance(data, dict):
        for key, value in list(data.items()):  # Iterate through a copy to allow deletion
            if value is None:
                del data[key]
            elif isinstance(value, str):
                data[key] = value.strip()
            elif isinstance(value, (dict, list)):
                # Containers are mutated in place, so no reassignment is needed.
                clean_json_helper(value)
    elif isinstance(data, list):
        for index, item in enumerate(data):
            if isinstance(item, str):
                data[index] = item.strip()
            elif isinstance(item, (dict, list)):
                clean_json_helper(item)
    return data

# Example usage: clean the source file and write the result to output.json.
# NOTE(review): the input path looks like a mounted Google Drive location in Colab —
# presumably Drive must be mounted before this cell runs; confirm.
input_file = "/content/drive/MyDrive/Colab Notebooks/meeting-summaries-by-id.json" # Replace with your input file
output_file = "output.json"  # Replace with your desired output file (relative paths resolve against the working directory)
clean_json(input_file, output_file)


# Take the output.json file and check that it is well-formed

In [2]:
# Take output.json file and check it is well formed

import json

def is_valid_json(file_path):
    """Report whether a file contains well-formed, UTF-8 encoded JSON.

    Args:
        file_path: Path of the file to check.

    Returns:
        True if the whole file parses as JSON; False if it is not valid
        JSON or cannot be decoded as UTF-8.

    Raises:
        FileNotFoundError: If file_path does not exist (a missing file is
            reported as an error rather than as "invalid JSON").
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # UnicodeDecodeError is not a JSONDecodeError, so it must be listed
        # separately for undecodable files to count as invalid.
        return False
    return True

# Example usage: validate the file and report the result with a one-line message.
file_path = "output.json"  # Replace with the actual path to your file
if is_valid_json(file_path):
    print(f"The file '{file_path}' is a valid JSON file.")
else:
    print(f"The file '{file_path}' is not a valid JSON file.")

The file 'output.json' is a valid JSON file.


# Identify the entities and relationships in output.json using NLP and write them to nlp.txt

In [2]:
# Identify the entities and relationships in output.json using NLP and output in nlp.txt

import json
import spacy

# Load a spaCy model once at module level; the extraction function below calls it through the global `nlp`.
# You might need to download the model first: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def extract_entities_and_relationships(json_file, output_file):
    """Run NLP over every string in a JSON file and save the findings.

    The JSON document is walked recursively. Each string value is passed to
    the module-level ``nlp`` pipeline, and two kinds of records are
    collected in document order:

      * ``{"entity": <text>, "label": <label>, "context": <key>}`` for each
        item in ``doc.ents``;
      * ``{"relationship": <text>, "context": <key>}`` for each token whose
        ``dep_`` is ``"ROOT"``.

    ``context`` is the dict key the string was stored under. Strings that
    are list elements inherit the key of the enclosing list; it is None
    when no enclosing key exists (e.g. a top-level list of strings).

    Args:
        json_file: Path of the JSON file to analyse (read as UTF-8).
        output_file: Path the collected records are written to, as a JSON
            array with 4-space indentation.

    Returns:
        None. A message is printed when the input is missing or is not
        valid JSON (nothing is written in that case), and when the output
        file cannot be written.
    """
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{json_file}' not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in file '{json_file}'.")
        return

    entities_and_relationships = []

    def process_data(node, context=None):
        """Collect records from `node`; `context` is the nearest enclosing dict key."""
        if isinstance(node, str):
            doc = nlp(node)
            for ent in doc.ents:
                entities_and_relationships.append({"entity": ent.text, "label": ent.label_, "context": context})

            for token in doc:
                if token.dep_ == "ROOT":
                    entities_and_relationships.append({"relationship": token.text, "context": context})
        elif isinstance(node, dict):
            for key, value in node.items():
                process_data(value, key)
        elif isinstance(node, list):
            # List items have no key of their own, so they keep the key of
            # the list; without this, strings stored in lists were skipped.
            for item in node:
                process_data(item, context)

    process_data(data)

    try:
        with open(output_file, "w", encoding="utf-8") as outfile:
            json.dump(entities_and_relationships, outfile, indent=4)
        print(f"Entities and relationships extracted to '{output_file}'")

    except OSError as e:
        # Only file-system failures are expected here; anything else is a
        # programming error and should surface.
        print(f"Error writing to file: {e}")

# Example usage: analyse output.json and save the extracted records.
input_json_file = "output.json"  # Replace with the path to your JSON file
output_txt_file = "nlp.txt"  # Replace with the desired output file name (the content is written as JSON despite the .txt extension)
extract_entities_and_relationships(input_json_file, output_txt_file)

Entities and relationships extracted to 'nlp.txt'
