In [1]:
!pip install sdv
!pip install pandas
!pip install scikit-learn
!pip install io
!pip install numpy

Collecting sdv
  Downloading sdv-1.17.2-py3-none-any.whl.metadata (13 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.35.68-py3-none-any.whl.metadata (6.7 kB)
Collecting copulas>=0.12.0 (from sdv)
  Downloading copulas-0.12.0-py3-none-any.whl.metadata (9.1 kB)
Collecting ctgan>=0.10.2 (from sdv)
  Downloading ctgan-0.10.2-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.6.1 (from sdv)
  Downloading deepecho-0.6.1-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.13.1 (from sdv)
  Downloading rdt-1.13.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.17.0 (from sdv)
  Downloading sdmetrics-0.17.0-py3-none-any.whl.metadata (8.7 kB)
Collecting platformdirs>=4.0 (from sdv)
  Downloading platformdirs-4.3.6-py3-none-any.whl.metadata (11 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.35.68-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3tr

In [2]:
import kagglehub
from sdv.single_table import CTGANSynthesizer
import pandas as pd
import numpy as np
from sklearn import preprocessing 
from io import StringIO
from collections import defaultdict

from sdv.metadata import Metadata
from sdv.evaluation.single_table import evaluate_quality


In [3]:
# playground_series_s4e11_path = kagglehub.competition_download('playground-series-s4e11')

In [4]:
path='/kaggle/input/playground-series-s4e11/train.csv'
train = pd.read_csv(path)


In [5]:
student_train = train[train["Working Professional or Student"]=="Student"]
student_train.drop(['id','Working Professional or Student','Profession','Work Pressure','Job Satisfaction'],axis=1, inplace=True)
student_train.dropna(axis=1, how='all', inplace=True)

In [6]:
string_columns = student_train.select_dtypes(include=['object','string']).columns
numeric_student = student_train[student_train.columns.difference(string_columns)]


In [7]:
string_student=student_train[string_columns]

In [8]:

d = defaultdict(preprocessing.LabelEncoder)
string_student = string_student.apply(lambda x: d[x.name].fit_transform(x))

In [9]:
numeric_student = numeric_student.join(string_student)

In [10]:
numeric_student.select_dtypes(include=['object']).columns

Index([], dtype='object')

In [11]:
# Define metadata for your dataset
metadata = Metadata()
# md = metadata.detect_from_dataframe(student_train).save_to_json('./metadata.json')

In [12]:
metadata_path='/kaggle/input/metadata-student/metadata.json'

In [13]:
md = metadata.load_from_json(metadata_path)


In [14]:
synthesizer = CTGANSynthesizer(
    md,
    epochs=350,
    generator_dim=(256, 256, 256),
    log_frequency=True)


In [15]:
synthesizer.auto_assign_transformers(numeric_student)


In [16]:
synthesizer.fit(numeric_student)

In [17]:
synthesizer.save(
    filepath='/kaggle/working/my_synthesizer.pkl'
)

In [18]:
synthetic_data = synthesizer.sample(len(numeric_student))


In [19]:
synthetic_data.to_csv('/kaggle/working/numeric_synethetic_data.csv')

In [20]:
quality_score = evaluate_quality(synthetic_data, numeric_student,md)
print(f"Quality Score: {quality_score}")

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 89.34it/s]|
Column Shapes Score: 83.56%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 128.53it/s]|
Column Pair Trends Score: 69.29%

Overall Score (Average): 76.42%

Quality Score: <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x7ad3245e8a90>


In [21]:
synthesizer.get_loss_values()

Unnamed: 0,Epoch,Generator Loss,Discriminator Loss
0,0,1.617047,-0.051137
1,1,1.240352,-0.034994
2,2,1.052859,0.031982
3,3,0.692600,0.144205
4,4,0.413872,0.167208
...,...,...,...
345,345,-2.394638,0.023164
346,346,-2.432284,0.104075
347,347,-2.337579,0.177952
348,348,-2.392318,-0.227242


In [22]:
fig = synthesizer.get_loss_values_plot()
fig.show()

In [23]:

# Verify adjustments
adjusted_means = synthetic_data.mean()
adjusted_stds = synthetic_data.std()
adjusted_corr = synthetic_data.corr()

original_means = numeric_student.mean()
original_stds = numeric_student.std()
original_corr = numeric_student.corr()


print("Original Means:")
print(original_means)
print("\nAdjusted Means:")
print(adjusted_means)

print("\nOriginal Standard Deviations:")
print(original_stds)
print("\nAdjusted Standard Deviations:")
print(adjusted_stds)

print("\nOriginal Correlation Matrix:")
print(original_corr)
print("\nAdjusted Correlation Matrix:")
print(adjusted_corr)

diff_mean=(adjusted_means/original_means-1)*100
diff_sd=(adjusted_stds/original_stds-1)*100
diff_corr=(adjusted_corr/original_corr-1)*100
print("\n Diff Mean %:")
print(diff_mean)
print("\n Diff SD %:")
print(diff_sd)
print("\n Diff Corr %:")
print(diff_corr)

Original Means:
Academic Pressure                          3.142227
Age                                       25.822300
CGPA                                       7.658575
Depression                                 0.585499
Financial Stress                           3.139867
Study Satisfaction                         2.944893
Work/Study Hours                           7.156984
Name                                     124.655819
Gender                                     0.557220
City                                      26.294291
Sleep Duration                            10.794846
Dietary Habits                             8.093258
Degree                                    27.692341
Have you ever had suicidal thoughts ?      0.632809
Family History of Mental Illness           0.483961
dtype: float64

Adjusted Means:
Name                                     123.366188
Gender                                     0.482563
Age                                       26.382101
City            

In [24]:
# synthetic_data = pd.read_csv('/kaggle/input/synthetic-and-model/numeric_synethetic_data.csv')

In [25]:
# synthetic_data

In [26]:
for key in d.keys():
    print(key)
    synthetic_data[key]=d[key].inverse_transform(synthetic_data[key])

Name
Gender
City
Sleep Duration
Dietary Habits
Degree
Have you ever had suicidal thoughts ?
Family History of Mental Illness


In [27]:
# student_train.drop(['id','Working Professional or Student','Profession','Work Pressure','Job Satisfaction'],axis=1, inplace=True)
synthetic_data['Working Professional or Student']='Student'
synthetic_data['Profession']='Student'
synthetic_data['Work Pressure']=None
synthetic_data['Job Satisfaction']=None


In [28]:
synthetic_data.to_csv('synthetic_data.csv')