In [2]:
# Importing essential libraries for data handling and synthetic generation
# pandas: used in the cells below for CSV loading and DataFrame concatenation.
import pandas as pd
# SDV's CTGANSynthesizer is imported under the shorter alias `CTGAN`.
from sdv.single_table import CTGANSynthesizer as CTGAN
# Metadata: used below to detect and then adjust the table's column sdtypes.
from sdv.metadata import Metadata

# Simple confirmation that this import cell ran without raising.
print("All required libraries are imported successfully.")

All required libraries are imported successfully.


In [3]:
# -- 1. Loading the original dataset --
# Canonical column names for the table. They are applied positionally, replacing
# whatever names read_csv produced, so the CSV must supply its columns in this order.
expected_columns = [
    'Age', 'Gender', 'District', 'Education_Level', 'Qualification',
    'Years_Experience', 'Languages_Spoken', 'Care_Category',
    'Care_Service_Type', 'Preferred_Time', 'Expected_Salary', 'Match_Decision',
]

data = pd.read_csv('labeled_ml_input.csv')

# Actually verify the structure before renaming: a positional rename on a file
# with a different number of columns would otherwise fail with pandas' generic
# length-mismatch error. ValueError is the same exception type pandas raises.
if data.shape[1] != len(expected_columns):
    raise ValueError(
        f"'labeled_ml_input.csv' has {data.shape[1]} columns, "
        f"expected {len(expected_columns)}: {expected_columns}"
    )
data.columns = expected_columns

print(f"Original dataset loaded with {len(data)} records...")

print("Column structure verified...")


# -- 2. Detecting metadata from the dataset --
print("\nDetecting metadata from the dataset...")

# Let SDV infer an sdtype for every column of the single table 'elderly_data'.
metadata = Metadata.detect_from_dataframe(
    data=data,
    table_name='elderly_data'
)

print("Metadata detection completed...")

# -- 3. Updating metadata for multi-value categorical columns --
# These columns are forced to 'categorical' regardless of what auto-detection
# chose. NOTE(review): presumably each cell packs several values into one
# string, which detection may not classify as categorical - confirm against the CSV.
multi_value_cols = ['Languages_Spoken', 'Qualification', 'Care_Category']
for col in multi_value_cols:
    metadata.update_column(
        table_name='elderly_data',
        column_name=col,
        sdtype='categorical'
    )

print("Metadata configuration is finalized successfully!")

Original dataset loaded with 20 records...
Column structure verified...

Detecting metadata from the dataset...
Metadata detection completed...
Metadata configuration is finalized successfully!


In [5]:
# -- 4. Build the CTGAN synthesizer and fit it on the original records --
print("Intializing CTGAN model...")
print("Training configuration")

# Synthesizer options are grouped in one place so they are easy to spot and tweak.
ctgan_options = {'epochs': 300, 'verbose': True}
model = CTGAN(metadata=metadata, **ctgan_options)

print("CTGAN training started...")
model.fit(data)
print("Model training completed successfully.")

# -- 5. Draw synthetic caregiver records from the fitted model --
print("Generating synthetic caregiver records...")
synthetic_data = model.sample(num_rows=1980)

# -- 6. Stack original + synthetic rows (original first) and save to CSV --
print("Merging original human-labeled data with AI-generated data...")
frames = [data, synthetic_data]
final_data = pd.concat(frames, ignore_index=True)
final_data.to_csv('final_ml_dataset2.csv', index=False)

print(f"\nCOMPLETED! Final dataset size: {len(final_data)} records.")
# Last expression of the cell: preview of the first rows of the combined table.
final_data.head()

Intializing CTGAN model...
Training configuration
CTGAN training started...


Gen. (0.60) | Discrim. (-0.12): 100%|█████████| 300/300 [00:13<00:00, 22.71it/s]


Model training completed successfully.
Generating synthetic caregiver records...
Merging original human-labeled data with AI-generated data...

COMPLETED! Final dataset size: 2000 records.


NameError: name 'train_test_split' is not defined