In [1]:
!pip install sdv
!pip install pandas
!pip install scikit-learn
!pip install io
!pip install numpy

Collecting sdv
  Downloading sdv-1.17.2-py3-none-any.whl.metadata (13 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.35.68-py3-none-any.whl.metadata (6.7 kB)
Collecting copulas>=0.12.0 (from sdv)
  Downloading copulas-0.12.0-py3-none-any.whl.metadata (9.1 kB)
Collecting ctgan>=0.10.2 (from sdv)
  Downloading ctgan-0.10.2-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.6.1 (from sdv)
  Downloading deepecho-0.6.1-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.13.1 (from sdv)
  Downloading rdt-1.13.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.17.0 (from sdv)
  Downloading sdmetrics-0.17.0-py3-none-any.whl.metadata (8.7 kB)
Collecting platformdirs>=4.0 (from sdv)
  Downloading platformdirs-4.3.6-py3-none-any.whl.metadata (11 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.35.68-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3tr

In [2]:
import kagglehub
from sdv.single_table import CTGANSynthesizer
import pandas as pd
import numpy as np
from sklearn import preprocessing 
from io import StringIO
from collections import defaultdict

from sdv.metadata import Metadata
from sdv.evaluation.single_table import evaluate_quality


In [3]:
# playground_series_s4e11_path = kagglehub.competition_download('playground-series-s4e11')

In [4]:
path='/kaggle/input/playground-series-s4e11/train.csv'
train = pd.read_csv(path)


In [5]:
student_train = train[train["Working Professional or Student"]=="Student"]
student_train.drop(['id','Working Professional or Student','Profession','Work Pressure','Job Satisfaction'],axis=1, inplace=True)
student_train.dropna(axis=1, how='all', inplace=True)

In [6]:
string_columns = student_train.select_dtypes(include=['object','string']).columns
numeric_student = student_train[student_train.columns.difference(string_columns)]


In [7]:
string_student=student_train[string_columns]

In [8]:

d = defaultdict(preprocessing.LabelEncoder)
string_student = string_student.apply(lambda x: d[x.name].fit_transform(x))

In [9]:
numeric_student = numeric_student.join(string_student)

In [10]:
numeric_student.select_dtypes(include=['object']).columns

Index([], dtype='object')

In [11]:

original_means = numeric_student.mean()
original_stds = numeric_student.std()
original_corr = numeric_student.corr()

In [12]:
# Define metadata for your dataset
metadata = Metadata()
# md = metadata.detect_from_dataframe(student_train).save_to_json('./metadata.json')

In [13]:
metadata_path='/kaggle/input/student-metadata/metadata.json'

In [14]:
md = metadata.load_from_json(metadata_path)


In [15]:
synthesizer = CTGANSynthesizer(
    md,
    epochs=500,
    generator_dim=(256, 256, 256),
    log_frequency=True)



In [16]:
synthesizer.auto_assign_transformers(numeric_student)


In [17]:
synthesizer.fit(numeric_student)

In [18]:
synthesizer.save(
    filepath='/kaggle/working/my_synthesizer.pkl'
)

In [19]:
synthetic_data = synthesizer.sample(len(numeric_student))


In [20]:
synthetic_data.to_csv('/kaggle/working/numeric_synethetic_data.csv')

In [21]:
quality_score = evaluate_quality(synthetic_data, numeric_student,md)
print(f"Quality Score: {quality_score}")

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 102.17it/s]|
Column Shapes Score: 83.7%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 127.14it/s]|
Column Pair Trends Score: 72.52%

Overall Score (Average): 78.11%

Quality Score: <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x7eeb6825ebf0>


In [22]:
synthesizer.get_loss_values()


Unnamed: 0,Epoch,Generator Loss,Discriminator Loss
0,0,1.579942,-0.043535
1,1,1.129351,-0.006719
2,2,0.841202,-0.070814
3,3,0.397571,-0.065605
4,4,0.142792,-0.128091
...,...,...,...
495,495,-0.253239,0.428573
496,496,-0.741080,0.219617
497,497,-0.000300,-0.483392
498,498,0.160666,-0.233624


In [23]:
fig = synthesizer.get_loss_values_plot()
fig.show()

In [24]:
# synthetic_data = synthetic_data.apply(lambda x: d[x.name].inverse_transform(x))


In [25]:
# synthetic_data.to_csv('/kaggle/working/results.csv')