In [1]:
import pandas as pd

In [2]:
import numpy as np

In [2]:
import json
import pandas as pd
import numpy as np
# Step 1: Load the initial JSON file (assuming it's named medical_diagnosis_dataset.json)
# If your file has a different name or path, please update 'medical_diagnosis_dataset.json' below.
try:
    with open('C:/Users/Ayush Jindal/Downloads/cleaned_medical_diagnosis_dataset.json', 'r') as f:
        json_raw_data = json.load(f)
except FileNotFoundError:
    print("Error: 'medical_diagnosis_dataset.json' not found. Please ensure it's in the same directory as your script.")
    exit()

# Step 2: Convert JSON data to a pandas DataFrame and normalize nested 'output'
df = pd.json_normalize(json_raw_data)

# Rename columns for easier access
df.rename(columns={
    'output.diagnosis': 'diagnosis',
    'output.description': 'description',
    'output.suggestions': 'suggestions'
}, inplace=True)

# Drop the original nested 'output' column if it still exists
if 'output' in df.columns:
    df.drop(columns=['output'], inplace=True)

# --- Step 3: Correctly Handle Duplicates (converting lists to tuples) ---
initial_rows = df.shape[0]

# Convert 'diagnosis' and 'suggestions' list columns to tuples to make them hashable for drop_duplicates
# This also handles potential NaN values or non-list types by making them hashable
df['diagnosis_temp'] = df['diagnosis'].apply(lambda x: tuple(x) if isinstance(x, list) else (x if pd.isna(x) else tuple()))
df['suggestions_temp'] = df['suggestions'].apply(lambda x: tuple(x) if isinstance(x, list) else (x if pd.isna(x) else tuple()))

# Perform deduplication on a relevant subset of columns
df_deduplicated = df.drop_duplicates(subset=['input', 'diagnosis_temp', 'description', 'suggestions_temp']).copy()

# Drop the temporary tuple columns
df_deduplicated.drop(columns=['diagnosis_temp', 'suggestions_temp'], inplace=True, errors='ignore')

duplicates_removed = initial_rows - df_deduplicated.shape[0]
print(f"Number of duplicate rows removed: {duplicates_removed}")

# --- Step 4: Correctly Handle Missing Values ---

# For 'input' and 'description', treat empty strings as missing (np.nan)
df_deduplicated.loc[:, 'input'] = df_deduplicated['input'].replace('', np.nan)
df_deduplicated.loc[:, 'description'] = df_deduplicated['description'].replace('', np.nan)

# Robust function to check if a list is empty or contains only empty/NA strings
def is_list_effectively_empty(lst):
    if not isinstance(lst, list):
        return pd.isna(lst) # Handles NaN values directly if not a list
    if not lst: # Handles truly empty list
        return True
    # Check if all elements in the list are empty strings or NaN
    return all(pd.isna(x) or (isinstance(x, str) and x.strip() == '') for x in lst)

rows_before_na_drop = df_deduplicated.shape[0]

# Filter out rows where 'diagnosis' or 'suggestions' are effectively empty
df_cleaned_final = df_deduplicated[~df_deduplicated['diagnosis'].apply(is_list_effectively_empty)].copy()
df_cleaned_final = df_cleaned_final[~df_cleaned_final['suggestions'].apply(is_list_effectively_empty)].copy()

# Drop rows where critical string columns ('input', 'description') are genuinely NaN
df_cleaned_final.dropna(subset=['input', 'description'], inplace=True)

missing_values_dropped = rows_before_na_drop - df_cleaned_final.shape[0]
print(f"Number of rows with critical missing values dropped: {missing_values_dropped}")
print(f"Total rows after all cleaning steps: {df_cleaned_final.shape[0]}")

# --- Step 5: Apply Symptom Formatting (Trim, replace underscores, and remove space before/after comma) ---
def format_symptoms_string_final(text):
    if pd.isna(text):
        return text
    
    # Extract prefix and symptoms part
    prefix = ""
    symptoms_only_text = text
    if "Symptoms:" in text:
        prefix_parts = text.split("Symptoms:", 1)
        prefix = prefix_parts[0].strip()
        symptoms_only_text = prefix_parts[1]

    # Split by comma, trim each symptom, replace underscores
    # Filter out any empty strings that might result from splitting
    cleaned_symptoms_list = [s.strip().replace("_", " ") for s in symptoms_only_text.split(',') if s.strip()]

    # Rejoin with just a comma (no space before or after)
    rejoined_symptoms = ",".join(cleaned_symptoms_list)

    # Ensure there are no double commas that might result from cleaning
    rejoined_symptoms = rejoined_symptoms.replace(",,", ",")


    # Add "Symptoms:" prefix back if it was originally present
    if "Symptoms:" in text: # Check original text to decide if prefix was there
        return prefix + ":" + rejoined_symptoms
    else:
        return rejoined_symptoms # If no prefix, just return the cleaned string

df_cleaned_final.loc[:, 'input'] = df_cleaned_final['input'].apply(format_symptoms_string_final)


# --- Step 6: Format 'output' into a single multiline string ---
def format_output_to_multiline_string(row):
    # Ensure diagnosis and suggestions are lists before joining
    diagnosis_list = list(row['diagnosis']) if isinstance(row['diagnosis'], (tuple, pd.Series)) else row['diagnosis']
    suggestions_list = list(row['suggestions']) if isinstance(row['suggestions'], (tuple, pd.Series)) else row['suggestions']

    # Join with no spaces around commas for suggestions and diagnosis
    diagnosis_str = "Diagnosis: " + ",".join(diagnosis_list) if diagnosis_list else "Diagnosis: N/A"
    description_str = "Description: " + row['description'] if row['description'] else "Description: N/A"
    suggestions_str = "Suggestions: " + ",".join(suggestions_list) if suggestions_list else "Suggestions: N/A"
    
    # Combine with newlines
    return f"{diagnosis_str}\n{description_str}\n{suggestions_str}"

# Apply this new formatting function to create the 'output_string' column
df_cleaned_final['output_string'] = df_cleaned_final.apply(format_output_to_multiline_string, axis=1)


# --- Step 7: Reconstruct the final JSON structure with multiline string output ---
final_json_output_data = []
for index, row in df_cleaned_final.iterrows():
    final_json_output_data.append({
        "input": row['input'],
        "output": row['output_string'] # 'output' is now a single multiline string
    })

# Step 8: Save the final cleaned and formatted JSON data to a new file
output_file_name = 'medical_diagnosis_dataset_fully_cleaned.json'
with open(output_file_name, 'w') as f:
    json.dump(final_json_output_data, f, indent=2)

print(f"\nFinal cleaned and formatted JSON dataset saved to {output_file_name}")

# --- Display the first 5 entries for preview ---
print("\nFirst 5 entries of the final cleaned and formatted JSON:")
if len(final_json_output_data) > 0:
    print(json.dumps(final_json_output_data[:5], indent=2))
else:
    print("The dataset is empty.")

Number of duplicate rows removed: 0
Number of rows with critical missing values dropped: 0
Total rows after all cleaning steps: 283

Final cleaned and formatted JSON dataset saved to medical_diagnosis_dataset_fully_cleaned.json

First 5 entries of the final cleaned and formatted JSON:
[
  {
    "input": ":itching,skin rash,nodal skin eruptions,dischromic  patches.",
    "output": "Diagnosis: Fungal infection\nDescription: In humans, fungal infections occur when an invading fungus takes over an area of the body and is too much for the immune system to handle. Fungi can live in the air, soil, water, and plants. There are also some fungi that live naturally in the human body. Like many microbes, there are helpful fungi and harmful fungi.\nSuggestions: bath twice,use detol or neem in bathing water,keep infected area dry,use clean cloths"
  },
  {
    "input": ":skin rash,nodal skin eruptions,dischromic  patches.",
    "output": "Diagnosis: Fungal infection\nDescription: In humans, fungal i