In [None]:
import os
import pandas as pd
import json

# Working folders used by every cell in this notebook
csv_folder  = r"D:\TRAINING MODEL\data\csv_folder"   # input CSV files
logs_folder = r"D:\TRAINING MODEL\logs"              # bookkeeping files
out_folder  = r"D:\TRAINING MODEL\data\processed"    # merged output

# Create whichever folders are missing; existing ones are left untouched
for required_dir in (csv_folder, logs_folder, out_folder):
    os.makedirs(required_dir, exist_ok=True)

# Merged dataset, and the text log naming the source files already merged into it
master_csv = os.path.join(out_folder, "master_dataset.csv")
log_file   = os.path.join(logs_folder, "csv_master_log.txt")

# Names of the source files merged on earlier runs (one name per log line).
# Start from an empty set and fill it only when a log already exists.
processed_files = set()
if os.path.exists(log_file):
    with open(log_file, "r") as log_handle:
        processed_files = {entry.strip() for entry in log_handle}

# Find all CSV files in the input folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the row order of the master CSV reproducible. Entries that are not
# regular files (e.g. a directory whose name ends in ".csv") are skipped,
# because pd.read_csv could not open them.
csv_files = sorted(
    f for f in os.listdir(csv_folder)
    if f.lower().endswith(".csv") and os.path.isfile(os.path.join(csv_folder, f))
)

new_dataframes = []  # one DataFrame per file not yet listed in the log
new_files = []       # the matching file names, in the same order

for file in csv_files:
    if file in processed_files:
        continue  # already merged on an earlier run

    file_path = os.path.join(csv_folder, file)
    print(f"📂 Adding: {file}")
    df = pd.read_csv(file_path)
    new_dataframes.append(df)
    new_files.append(file)

if new_dataframes:
    combined_df = pd.concat(new_dataframes, ignore_index=True)

    # A zero-byte master has no header row to append under, so it is treated
    # exactly like a missing file and written fresh with a header.
    master_has_data = os.path.exists(master_csv) and os.path.getsize(master_csv) > 0

    if master_has_data:
        # Appending with header=False writes values positionally, so the new
        # rows must follow the master's column order or they end up under the
        # wrong headers.
        master_columns = list(pd.read_csv(master_csv, nrows=0).columns)
        unseen_columns = [col for col in combined_df.columns if col not in master_columns]

        if unseen_columns:
            # The header itself has to grow, which a plain append cannot do:
            # rewrite the whole master with the union of both column sets.
            master_df = pd.read_csv(master_csv)
            merged_df = pd.concat([master_df, combined_df], ignore_index=True)
            merged_df.to_csv(master_csv, mode="w", header=True, index=False)
        else:
            # Same (or fewer) columns: reorder to match the master, leaving
            # any column the new files lack empty, then append.
            aligned_df = combined_df.reindex(columns=master_columns)
            aligned_df.to_csv(master_csv, mode="a", header=False, index=False)
    else:
        combined_df.to_csv(master_csv, mode="w", header=True, index=False)

    # Record the merged files only after the master was written successfully,
    # so a failed write does not mark them as processed.
    with open(log_file, "a") as f:
        for file in new_files:
            f.write(file + "\n")

    print(f"✅ Updated {master_csv} with {len(new_files)} new files")
else:
    print("⚠️ No new files to add.")


📂 Adding: TOP 10 LEADING APRIL 2023.csv
📂 Adding: TOP 10 LEADING AUGUST 2023.csv
📂 Adding: TOP 10 LEADING DECEMBER 2023.csv
📂 Adding: TOP 10 LEADING JULY 2023.csv
📂 Adding: TOP 10 LEADING JUNE 2023.csv
📂 Adding: TOP 10 LEADING NOVEMBER 2023.csv
📂 Adding: TOP 10 LEADING OCTOBER 2023.csv
📂 Adding: TOP 10 LEADING SEPTEMBER 2023.csv
✅ Updated D:\TRAINING MODEL\data\processed\master_dataset.csv with 8 new files


In [36]:
# Reload the merged dataset from disk
df = pd.read_csv(master_csv)

# Keep only rows whose 'Case' holds real text: not NaN, and not blank once
# surrounding whitespace is removed
case_column = df["Case"]
has_value = case_column.notna()
has_text = case_column.astype(str).str.strip().ne("")
df = df.loc[has_value & has_text]

# Overwrite the master CSV with the surviving rows
df.to_csv(master_csv, index=False)

print(f"✅ Cleaned master CSV, remaining rows: {len(df)}")

✅ Cleaned master CSV, remaining rows: 241


In [37]:
# Path to dictionary file
case_dict_file = os.path.join(logs_folder, "case_dictionary.json")

# Load the existing {case name: ID} dictionary, or start empty on the first run
if os.path.exists(case_dict_file):
    with open(case_dict_file, "r") as f:
        case_dict = json.load(f)
else:
    case_dict = {}

# Load the master CSV
df = pd.read_csv(master_csv)

# Case values as trimmed text, missing values excluded
case_text = df["Case"].dropna().astype(str).str.strip()

# Once the encoding cell further down has overwritten 'Case' with IDs, the
# master holds numbers instead of names. Those numbers must not be registered
# as new case names, so any value that parses as an ID already issued by this
# dictionary is skipped.
# NOTE(review): assumes no genuine case name is a bare number equal to an
# issued ID, and that every dictionary value is an integer ID — confirm.
known_ids = set(case_dict.values())
already_encoded = pd.to_numeric(case_text, errors="coerce").isin(known_ids)
unique_cases = case_text[~already_encoded].unique()

# Allocate IDs after the highest one in use. len(case_dict) + 1 would reuse an
# ID whenever the stored IDs are not exactly 1..N (e.g. after a manual edit).
next_id = max(known_ids, default=0) + 1
for case in unique_cases:
    if case not in case_dict:
        case_dict[case] = next_id
        next_id += 1

# Save dictionary back to file
with open(case_dict_file, "w") as f:
    json.dump(case_dict, f, indent=4)

print(f"✅ Case dictionary updated. Total cases: {len(case_dict)}")

✅ Case dictionary updated. Total cases: 108


In [38]:
# Load case dictionary ({case name: ID})
with open(case_dict_file, "r") as f:
    case_dict = json.load(f)

# Load master CSV
df = pd.read_csv(master_csv)

raw_case = df["Case"]

# Translate case names to IDs; Series.map leaves NaN for anything that is not
# a dictionary key.
mapped = raw_case.astype(str).str.strip().map(case_dict)

# This cell overwrites the master in place, so on a re-run (or after new raw
# rows were appended to an already-encoded master) some values are IDs, not
# names. Looking those up as names would turn them all into NaN, so values
# that already are valid IDs are carried over unchanged.
# NOTE(review): assumes no genuine case name is a bare number equal to an
# issued ID — confirm.
valid_ids = set(case_dict.values())
numeric_case = pd.to_numeric(raw_case, errors="coerce")
already_encoded = numeric_case.where(numeric_case.isin(valid_ids))

encoded_case = mapped.fillna(already_encoded)

unmapped_count = int(encoded_case.isna().sum())
if unmapped_count:
    # Surface the loss instead of silently writing empty cells
    print(f"⚠️ {unmapped_count} 'Case' value(s) are not in the case dictionary and were left empty")
else:
    # Every row has an ID: store whole numbers so the CSV holds 1, not 1.0
    encoded_case = encoded_case.astype("int64")

df["Case"] = encoded_case

# Save updated dataset
df.to_csv(master_csv, index=False)

print("✅ Converted 'Case' column into numeric codes using case dictionary")

✅ Converted 'Case' column into numeric codes using case dictionary


In [39]:
# Load master CSV
df = pd.read_csv(master_csv)

# Define mapping for Consultation_Type
consultation_map = {
    "Consultation": 1,
    "Diagnosis": 2,
    "Mortality": 3
}

raw_type = df["Consultation_Type"]

# Translate labels to codes. Series.map does not "ignore" unknown values: it
# replaces anything that is not a key of the mapping (including NaN) with NaN.
# Surrounding whitespace is trimmed first so "Diagnosis " still matches.
mapped = raw_type.astype(str).str.strip().map(consultation_map)

# This cell overwrites the master in place, so on a re-run the column already
# holds 1/2/3. Looking those up as labels would wipe the column to NaN, so
# values that already are valid codes are carried over unchanged.
valid_codes = set(consultation_map.values())
numeric_type = pd.to_numeric(raw_type, errors="coerce")
already_encoded = numeric_type.where(numeric_type.isin(valid_codes))

encoded_type = mapped.fillna(already_encoded)

# Report values that were present in the file but could not be encoded
unexpected_count = int((raw_type.notna() & encoded_type.isna()).sum())
if unexpected_count:
    print(f"⚠️ {unexpected_count} unexpected 'Consultation_Type' value(s) were left empty")

if encoded_type.notna().all():
    # No gaps: store whole numbers so the CSV holds 1, not 1.0
    encoded_type = encoded_type.astype("int64")

df["Consultation_Type"] = encoded_type

# Save back to master CSV
df.to_csv(master_csv, index=False)

print("✅ Converted 'Consultation_Type' column into numeric codes")

✅ Converted 'Consultation_Type' column into numeric codes
