In [1]:
import pandas as pd

In [2]:
import numpy as np

In [6]:
import pandas as pd
import json
import numpy as np # Import numpy for np.nan

# Step 1: Load the JSON file
with open("C:/Users/Ayush Jindal/Downloads/medical_diagnosis_dataset_cleaned.json", 'r') as file:
    data = json.load(file)

# Step 2: Convert to DataFrame and flatten the 'output' column
# This handles the 'output' dictionary and creates top-level columns for its contents
df = pd.json_normalize(data)

# Rename the new flattened columns for clarity
df.rename(columns={
    'output.diagnosis': 'diagnosis',
    'output.description': 'description',
    'output.suggestions': 'suggestions'
}, inplace=True)

# Step 3: Function to clean symptoms in 'input' column
def clean_symptoms(text):
    if pd.isna(text): # Handle NaN values which might occur after dropna
        return text
    if "Symptoms:" in text:
        # Split and clean individual symptoms, then rejoin
        symptoms_part = text.split("Symptoms:")[1]
        cleaned_symptoms_list = [s.strip().replace("_", " ") for s in symptoms_part.split(",") if s.strip()] # Ensure no empty strings from split
        return "Symptoms: " + ", ".join(cleaned_symptoms_list)
    return text.strip().replace("_", " ") # Fallback for cases without "Symptoms:"

# Step 4: Apply the cleaning function to 'input'
df['input'] = df['input'].apply(clean_symptoms)

# Convert lists in 'diagnosis' and 'suggestions' to tuples for deduplication
# Also ensures consistency for potential NaN values in original lists
df['diagnosis'] = df['diagnosis'].apply(lambda x: tuple(x) if isinstance(x, list) else (np.nan if pd.isna(x) else x))
df['suggestions'] = df['suggestions'].apply(lambda x: tuple(x) if isinstance(x, list) else (np.nan if pd.isna(x) else x))


# Step 5: Remove duplicates
# Now that 'diagnosis' and 'suggestions' are hashable (tuples), drop_duplicates will work
# We'll drop based on all relevant columns to ensure uniqueness
df_final = df.drop_duplicates(subset=['input', 'diagnosis', 'description', 'suggestions'])


# Step 6: Drop rows with any null values
# This will drop rows where 'input', 'diagnosis', 'description', or 'suggestions' are np.nan
df_final = df_final.dropna(subset=['input', 'diagnosis', 'description', 'suggestions'])

# Convert 'diagnosis' and 'suggestions' back to lists for the final JSON output
df_final['diagnosis'] = df_final['diagnosis'].apply(lambda x: list(x) if isinstance(x, tuple) else x)
df_final['suggestions'] = df_final['suggestions'].apply(lambda x: list(x) if isinstance(x, tuple) else x)


# Step 7: Reconstruct the original JSON structure
final_json_output_data = []
for index, row in df_final.iterrows():
    final_json_output_data.append({
        "input": row['input'],
        "output": {
            "diagnosis": row['diagnosis'],
            "description": row['description'],
            "suggestions": row['suggestions']
        }
    })

# Step 8: Save the cleaned DataFrame to a new JSON file
output_file_name = 'cleaned_medical_diagnosis_dataset.json'
with open(output_file_name, 'w') as f:
    json.dump(final_json_output_data, f, indent=2)

print(f"Cleaned and formatted dataset saved to {output_file_name}")

# Step 9: Show a preview
print("\nPreview of the cleaned and formatted data (first 5 entries):")
if len(final_json_output_data) > 5:
    print(json.dumps(final_json_output_data[:5], indent=2))
else:
    print(json.dumps(final_json_output_data, indent=2))

Cleaned and formatted dataset saved to cleaned_medical_diagnosis_dataset.json

Preview of the cleaned and formatted data (first 5 entries):
[
  {
    "input": "Symptoms: itching, skin rash, nodal skin eruptions, dischromic  patches.",
    "output": {
      "diagnosis": [
        "Fungal infection"
      ],
      "description": "In humans, fungal infections occur when an invading fungus takes over an area of the body and is too much for the immune system to handle. Fungi can live in the air, soil, water, and plants. There are also some fungi that live naturally in the human body. Like many microbes, there are helpful fungi and harmful fungi.",
      "suggestions": [
        "bath twice",
        "use detol or neem in bathing water",
        "keep infected area dry",
        "use clean cloths"
      ]
    }
  },
  {
    "input": "Symptoms: skin rash, nodal skin eruptions, dischromic  patches.",
    "output": {
      "diagnosis": [
        "Fungal infection"
      ],
      "description": 