In [35]:
import pandas as pd

# Load your dataset
df = pd.read_csv('data/stud.csv')
# Preview data
print(df.head())


   gender race_ethnicity parental_level_of_education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test_preparation_course  math_score  reading_score  writing_score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


In [36]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
target_cols = ['math_score', 'reading_score', 'writing_score']
X = df.drop(columns=target_cols)
y = df[target_cols]

# Encode categorical features
categorical_cols = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
encoders = {col: LabelEncoder() for col in categorical_cols}

for col in categorical_cols:
    X[col] = encoders[col].fit_transform(X[col])

# Scale numeric features (targets)
scaler = MinMaxScaler()
y = pd.DataFrame(scaler.fit_transform(y), columns=target_cols)

In [37]:
from tabgan.sampler import GANGenerator
import pandas as pd

synthetic_data_list = []  # To store synthetic data for each target column

# Iterate over each target column
for target_col in y.columns:
    new_X, new_y = GANGenerator(
        gen_x_times=10,  # Generate 10 times more data
        cat_cols=categorical_cols,  # Categorical columns
        bot_filter_quantile=0.001,
        top_filter_quantile=0.999,
        is_post_process=True,
        gen_params={"batch_size": 50, "epochs": 1000, "patience": 50},  # GAN training parameters
    ).generate_data_pipe(train_df=X, target=y[[target_col]], test_df=X)  # Pass target as a DataFrame

    # Combine the generated features and target for this specific target column
    synthetic_data = pd.concat([new_X, new_y], axis=1)
    synthetic_data.columns = list(X.columns) + [target_col]  # Rename columns

    # Append to the list
    synthetic_data_list.append(synthetic_data)

# Merge synthetic data for all target columns
final_synthetic_data = synthetic_data_list[0]  # Start with the first DataFrame
for i in range(1, len(synthetic_data_list)):
    final_synthetic_data = final_synthetic_data.merge(
        synthetic_data_list[i][[target_cols[i]]],
        left_index=True,
        right_index=True
    )

# Display or save final synthetic data
print(final_synthetic_data.head())


Fitting CTGAN transformers for each column: 100%|██████████| 6/6 [00:00<00:00, 106.52it/s]
Training CTGAN, epochs::  24%|██▍       | 239/1000 [00:16<00:52, 14.62it/s]
Fitting CTGAN transformers for each column: 100%|██████████| 6/6 [00:00<00:00, 117.76it/s]
Training CTGAN, epochs::  35%|███▌      | 350/1000 [00:23<00:43, 15.08it/s]
Fitting CTGAN transformers for each column: 100%|██████████| 6/6 [00:00<00:00, 117.59it/s]
Training CTGAN, epochs::  35%|███▍      | 348/1000 [00:23<00:43, 15.02it/s]


   gender  race_ethnicity  parental_level_of_education  lunch  \
0       1               4                            2      0   
1       1               4                            2      0   
2       1               4                            2      0   
3       1               4                            2      0   
4       1               4                            2      0   

   test_preparation_course  math_score  reading_score  writing_score  
0                        0    0.369426       0.762250       0.514766  
1                        0    0.277593       0.642882       0.612392  
2                        0    0.489613       0.556259       0.509706  
3                        0    0.568839       0.918147       0.473063  
4                        0    0.554555       0.516830       0.438481  


In [52]:
# Inverse transform target columns (scaled back to original range)
final_synthetic_data[target_cols] = scaler.inverse_transform(final_synthetic_data[target_cols])


In [43]:
# Check the unique values in each categorical column
unique_values = final_synthetic_data[categorical_cols].apply(lambda col: col.unique())

# Print the result
print(unique_values)


gender                                                            [male, female]
race_ethnicity                     [group E, group D, group B, group A, group C]
parental_level_of_education    [high school, some high school, associate's de...
lunch                                                   [free/reduced, standard]
test_preparation_course                                        [completed, none]
dtype: object


In [53]:
final_synthetic_data

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,male,group E,high school,free/reduced,completed,36.942575,80.266713,56.328945
1,male,group E,high school,free/reduced,completed,27.759261,70.359187,65.115238
2,male,group E,high school,free/reduced,completed,48.961313,63.169485,55.873525
3,male,group E,high school,free/reduced,completed,56.883893,93.206217,52.575701
4,male,group E,high school,free/reduced,completed,55.455512,59.896871,49.463304
...,...,...,...,...,...,...,...,...
20995,female,group D,master's degree,standard,none,68.527928,72.000000,81.068403
20996,female,group D,master's degree,standard,none,86.061210,77.112421,38.000000
20997,female,group D,master's degree,standard,none,87.000000,76.020294,96.809275
20998,female,group D,master's degree,standard,none,45.088968,66.457196,46.866869


In [54]:
# Concatenate the original dataset with the final synthetic dataset
final_synthetic_data_with_original = pd.concat([df, final_synthetic_data], axis=0)

# Reset index (optional, but helpful for keeping things organized)
final_synthetic_data_with_original.reset_index(drop=True, inplace=True)

# Display the combined dataset
print(final_synthetic_data_with_original.head())


   gender race_ethnicity parental_level_of_education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test_preparation_course  math_score  reading_score  writing_score  
0                    none        72.0           72.0           74.0  
1               completed        69.0           90.0           88.0  
2                    none        90.0           95.0           93.0  
3                    none        47.0           57.0           44.0  
4                    none        76.0           78.0           75.0  


In [55]:
final_synthetic_data_with_original.shape

(22000, 8)

In [56]:
from sklearn.utils import shuffle

# Shuffle the combined dataset
final_shuffled_data = shuffle(final_synthetic_data_with_original, random_state=42)

# Display the shuffled dataset
print(final_shuffled_data.head())


       gender race_ethnicity parental_level_of_education         lunch  \
13035  female        group E                 high school      standard   
3115     male        group A                 high school  free/reduced   
8732   female        group C          associate's degree  free/reduced   
7591   female        group B                some college  free/reduced   
221      male        group B          associate's degree      standard   

      test_preparation_course  math_score  reading_score  writing_score  
13035                    none  107.447644      92.693613      76.389010  
3115                completed   64.481594      77.269901      81.188755  
8732                completed   57.702255     103.857337      69.146261  
7591                completed   81.673020      73.885831      53.468732  
221                      none   87.000000      85.000000      73.000000  


In [57]:
final_shuffled_data.to_csv('data/stud.csv')

In [58]:
df1=pd.read_csv('data/stud1.csv')

In [63]:
df1['gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [64]:
df1['parental_level_of_education'].value_counts()

parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64