In [2]:
# ✅ Step 1: Import Libraries
import os
import json

# ✅ Step 2: Set Folder Paths
TRAINING_FOLDER = "training_data"  # Replace with your actual folder path
OUTPUT_FILE = "training_dataset.json"


# ✅ Step 3: Define a Helper That Collects the Samples From Every JSON File
def load_json_samples(folder):
    """Merge the contents of every ``*.json`` file found under ``folder``.

    The folder is walked recursively. Directories and file names are visited
    in sorted order so the merged list has the same order on every machine
    (the order ``os.walk`` reports entries in is otherwise filesystem-dependent).

    A file whose top-level JSON value is a list contributes each of its
    elements; any other top-level value is added as a single sample.

    Files that cannot be opened, are not valid UTF-8, or are not valid JSON
    are reported and skipped, so one bad file does not abort the merge.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A list with the merged samples. Empty when ``folder`` does not exist
        or contains no readable ``.json`` file.
    """
    samples = []

    # os.walk() silently yields nothing for a missing directory; say so
    # explicitly so a mistyped path is not mistaken for "no data".
    if not os.path.isdir(folder):
        print(f"❌ Training folder not found: {folder}")
        return samples

    for root, dirs, files in os.walk(folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if not file.endswith(".json"):
                continue
            file_path = os.path.join(root, file)

            # open() lives inside the try so unreadable files are skipped too.
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
                print(f"❌ Error reading {file}: {e}")
                continue

            # Flatten list payloads; keep any other payload as one sample.
            if isinstance(data, list):
                samples.extend(data)
            else:
                samples.append(data)

            print(f"✅ Processed: {file}")

    return samples


# ✅ Step 4: Merge All JSON Files in the Folder
combined_data = load_json_samples(TRAINING_FOLDER)

# ✅ Step 5: Save the Combined Data into a Single JSON File
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(combined_data, f, indent=4)

print(f"\n✅ Combined dataset saved as: {OUTPUT_FILE}")
print(f"Total samples combined: {len(combined_data)}")


✅ Processed: class_11_biology_Chapter1.pdf.json
✅ Processed: class_11_biology_Chapter10.pdf.json
✅ Processed: class_11_biology_Chapter11.pdf.json
✅ Processed: class_11_biology_Chapter12.pdf.json
✅ Processed: class_11_biology_Chapter13.pdf.json
✅ Processed: class_11_biology_Chapter14.pdf.json
✅ Processed: class_11_biology_Chapter15.pdf.json
✅ Processed: class_11_biology_Chapter16.pdf.json
✅ Processed: class_11_biology_Chapter17.pdf.json
✅ Processed: class_11_biology_Chapter18.pdf.json
✅ Processed: class_11_biology_Chapter19.pdf.json
✅ Processed: class_11_biology_Chapter2.pdf.json
✅ Processed: class_11_biology_Chapter3.pdf.json
✅ Processed: class_11_biology_Chapter4.pdf.json
✅ Processed: class_11_biology_Chapter5.pdf.json
✅ Processed: class_11_biology_Chapter6.pdf.json
✅ Processed: class_11_biology_Chapter7.pdf.json
✅ Processed: class_11_biology_Chapter8.pdf.json
✅ Processed: class_11_biology_Chapter9.pdf.json
✅ Processed: class_11_chemistry_Answer1.pdf.json
✅ Processed: class_11_chemist

In [6]:
# ✅ Step 1: Import Libraries
import os
import json
import random
from sklearn.model_selection import train_test_split

# ✅ Step 2: Define Paths and Split Settings
TRAINING_FOLDER = "training_data"  # Path to your training data folder
TRAINING_OUTPUT_FILE = "training_dataset.json"  # 80% Training dataset
TEST_OUTPUT_FILE = "test_dataset.json"          # 20% Test dataset
TEST_SIZE = 0.2    # Fraction of the samples held out for testing
RANDOM_SEED = 42   # Seed handed to train_test_split for a repeatable split


# ✅ Step 3: Merge Training Data into a Single Dataset
def load_json_samples(folder):
    """Merge the contents of every ``*.json`` file found under ``folder``.

    Defined in this cell so the cell runs on its own from a fresh kernel.

    The folder is walked recursively. Directories and file names are visited
    in sorted order so the merged list - and therefore the seeded split made
    from it - is the same on every run and every machine.

    A file whose top-level JSON value is a list contributes each of its
    elements; any other top-level value is added as a single sample.

    Files that cannot be opened, are not valid UTF-8, or are not valid JSON
    are reported and skipped, so one bad file does not abort the merge.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A list with the merged samples. Empty when ``folder`` does not exist
        or contains no readable ``.json`` file.
    """
    samples = []

    # os.walk() silently yields nothing for a missing directory; say so
    # explicitly so a mistyped path is not mistaken for "no data".
    if not os.path.isdir(folder):
        print(f"❌ Training folder not found: {folder}")
        return samples

    for root, dirs, files in os.walk(folder):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if not file.endswith(".json"):
                continue
            file_path = os.path.join(root, file)

            # open() lives inside the try so unreadable files are skipped too.
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
                print(f"❌ Error reading {file}: {e}")
                continue

            # Flatten list payloads; keep any other payload as one sample.
            if isinstance(data, list):
                samples.extend(data)
            else:
                samples.append(data)

            print(f"✅ Processed: {file}")

    return samples


combined_data = load_json_samples(TRAINING_FOLDER)

# ✅ Step 4: Split the Dataset into 80-20 Split
# A split needs at least one sample on each side. Fail with a message that
# names the folder instead of letting train_test_split raise a generic error.
if len(combined_data) < 2:
    raise ValueError(
        f"Need at least 2 samples to split, found {len(combined_data)} "
        f"in '{TRAINING_FOLDER}'."
    )

# train_test_split shuffles by default using random_state, so no separate
# shuffle is done first: an unseeded random.shuffle() here would change the
# input order on every run and make the fixed seed meaningless.
train_data, test_data = train_test_split(
    combined_data, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# ✅ Step 5: Save the Training and Test Datasets
with open(TRAINING_OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=4)

with open(TEST_OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=4)

# ✅ Print the results
print(f"\n✅ Training dataset saved as: {TRAINING_OUTPUT_FILE}")
print(f"Total training samples: {len(train_data)}")

print(f"\n✅ Test dataset saved as: {TEST_OUTPUT_FILE}")
print(f"Total test samples: {len(test_data)}")



✅ Processed: class_11_biology_Chapter1.pdf.json
✅ Processed: class_11_biology_Chapter10.pdf.json
✅ Processed: class_11_biology_Chapter11.pdf.json
✅ Processed: class_11_biology_Chapter12.pdf.json
✅ Processed: class_11_biology_Chapter13.pdf.json
✅ Processed: class_11_biology_Chapter14.pdf.json
✅ Processed: class_11_biology_Chapter15.pdf.json
✅ Processed: class_11_biology_Chapter16.pdf.json
✅ Processed: class_11_biology_Chapter17.pdf.json
✅ Processed: class_11_biology_Chapter18.pdf.json
✅ Processed: class_11_biology_Chapter19.pdf.json
✅ Processed: class_11_biology_Chapter2.pdf.json
✅ Processed: class_11_biology_Chapter3.pdf.json
✅ Processed: class_11_biology_Chapter4.pdf.json
✅ Processed: class_11_biology_Chapter5.pdf.json
✅ Processed: class_11_biology_Chapter6.pdf.json
✅ Processed: class_11_biology_Chapter7.pdf.json
✅ Processed: class_11_biology_Chapter8.pdf.json
✅ Processed: class_11_biology_Chapter9.pdf.json
✅ Processed: class_11_chemistry_Answer1.pdf.json
✅ Processed: class_11_chemist