In [45]:
import sdv
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
from sklearn.ensemble import RandomForestRegressor

In [44]:
! pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-win_amd64.whl (99.8 MB)
     --------------------------------------- 99.8/99.8 MB 12.1 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2


In [31]:
# Prefix of the directory the best synthesizers are saved to and loaded from
# (later cells format it as "<MAIN_PATH>_best_models/").
MAIN_PATH = "mmo"

In [32]:
# Read the data
all_data = pd.read_excel("Biopak data compiled final 1-70 v2.xlsx")

# Extract just JVA data: the trailing `jva_column_count` columns of the sheet
n_columns = len(all_data.columns)
jva_column_count = 14
if n_columns < jva_column_count:
    # A negative start offset would silently select the wrong columns, so fail loudly.
    raise ValueError(
        "Expected at least {} columns, found {}".format(jva_column_count, n_columns)
    )
data = all_data.iloc[:, n_columns - jva_column_count:].copy()

#### Synthetic data generation

In [21]:
def get_const_value_features_to_drop(df):
    """Return the names of the columns that hold exactly one distinct value.

    Nothing is removed here; the caller decides what to do with the returned
    names. Distinct values are counted with ``Series.nunique()``.
    """
    constant_columns = []
    for column_name in df.columns:
        if df[column_name].nunique() == 1:
            constant_columns.append(column_name)
    return constant_columns

def impute_and_remove_zero_var(features_ta_r):
    """Median-impute missing values, then drop constant-valued columns.

    Parameters
    ----------
    features_ta_r : pandas.DataFrame
        Feature table that may contain NaN values. The median strategy is used,
        so the columns are expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the input's row index, has NaNs replaced by the
        column median, and omits columns that are entirely missing or hold a
        single distinct value. The input frame is not modified.
    """
    # SimpleImputer discards columns that have no observed value at all, which would
    # leave the imputed array narrower than the column list and make the DataFrame
    # constructor fail. Such columns carry no information, so drop them up front.
    observed = features_ta_r.dropna(axis=1, how="all")

    # Replace any remaining NaN with the median of its feature
    imputer = SimpleImputer(strategy="median")
    features_ta_r_imputed_df = pd.DataFrame(
        imputer.fit_transform(observed),
        columns=observed.columns,
        index=observed.index,  # keep the caller's row labels
    )

    # Remove zero variance features (returns a new frame rather than mutating in place)
    columns_to_remove = get_const_value_features_to_drop(features_ta_r_imputed_df)
    return features_ta_r_imputed_df.drop(columns=columns_to_remove)

# Clean the extracted table: median-impute NaNs and drop constant-valued columns
imputed_data = impute_and_remove_zero_var(data)

In [23]:
def tune_tvae(features_ta_r_imputed_df, metadata,
              embedding_dims=(128, 256),
              compress_dims=(128, 256),
              decompress_dims=(128, 256),
              epochs=500,
              num_rows=200):
    """Grid-search TVAE settings and score every fitted synthesizer.

    One synthesizer is trained per combination of ``embedding_dims`` x
    ``compress_dims`` x ``decompress_dims``. Each is then used to sample
    ``num_rows`` synthetic rows, which are compared against the training
    data with ``evaluate_quality``.

    Parameters
    ----------
    features_ta_r_imputed_df : pandas.DataFrame
        Training data the synthesizers are fitted on and evaluated against.
    metadata
        Table metadata passed to both the synthesizer and the quality evaluation.
    embedding_dims, compress_dims, decompress_dims : iterable of int
        Candidate sizes. Each compress/decompress size is used for two layers,
        i.e. ``(size, size)``.
    epochs : int
        Training epochs per synthesizer.
    num_rows : int
        Number of synthetic rows sampled for the quality evaluation.

    Returns
    -------
    list of (score, synthesizer)
        One entry per combination, in iteration order (last grid varies fastest).
    """
    tvae_scores = []

    for embedding_dim in embedding_dims:
        for compress_dim in compress_dims:
            for decompress_dim in decompress_dims:
                # Creating a Variational Autoencoder synthesizer
                tvae_synthesizer = TVAESynthesizer(
                    metadata,
                    embedding_dim=embedding_dim,
                    compress_dims=(compress_dim, compress_dim),
                    decompress_dims=(decompress_dim, decompress_dim),
                    epochs=epochs,
                )

                # Fitting the model
                tvae_synthesizer.fit(features_ta_r_imputed_df)

                # Generating synthetic data
                synthetic_data = tvae_synthesizer.sample(num_rows=num_rows)

                # Evaluating synthetic data
                quality_report = evaluate_quality(
                    features_ta_r_imputed_df,
                    synthetic_data,
                    metadata,
                    verbose=False
                )

                # Keep the fitted model next to its score so the best one can be saved later
                tvae_scores.append((quality_report.get_score(), tvae_synthesizer))

    return tvae_scores

def tune_ctgan(features_ta_r_imputed_df, metadata,
               embedding_dims=(256, 512),
               generator_dims=(256, 512),
               discriminator_dims=(128, 256),
               epochs=500,
               num_rows=200):
    """Grid-search CTGAN settings and score every fitted synthesizer.

    One synthesizer is trained per combination of ``embedding_dims`` x
    ``generator_dims`` x ``discriminator_dims``. Each is then used to sample
    ``num_rows`` synthetic rows, which are compared against the training
    data with ``evaluate_quality``.

    Parameters
    ----------
    features_ta_r_imputed_df : pandas.DataFrame
        Training data the synthesizers are fitted on and evaluated against.
    metadata
        Table metadata passed to both the synthesizer and the quality evaluation.
    embedding_dims, generator_dims, discriminator_dims : iterable of int
        Candidate sizes. Each generator/discriminator size is used for two
        layers, i.e. ``(size, size)``.
    epochs : int
        Training epochs per synthesizer.
    num_rows : int
        Number of synthetic rows sampled for the quality evaluation.

    Returns
    -------
    list of (score, synthesizer)
        One entry per combination, in iteration order (last grid varies fastest).
    """
    ctgan_scores = []

    for embedding_dim in embedding_dims:
        for generator_dim in generator_dims:
            for discriminator_dim in discriminator_dims:
                # Creating a ctgan synthesizer
                ctgan_synthesizer = CTGANSynthesizer(
                    metadata,
                    embedding_dim=embedding_dim,
                    generator_dim=(generator_dim, generator_dim),
                    discriminator_dim=(discriminator_dim, discriminator_dim),
                    epochs=epochs,
                )

                # Fitting the model
                ctgan_synthesizer.fit(features_ta_r_imputed_df)

                # Generating synthetic data
                synthetic_data = ctgan_synthesizer.sample(num_rows=num_rows)

                # Evaluating synthetic data
                quality_report = evaluate_quality(
                    features_ta_r_imputed_df,
                    synthetic_data,
                    metadata,
                    verbose=False
                )

                # Keep the fitted model next to its score so the best one can be saved later
                ctgan_scores.append((quality_report.get_score(), ctgan_synthesizer))

    return ctgan_scores

def run_augmentation_pipeline(features_ta_r_imputed_df):
    """Tune both synthesizer families on one table.

    Metadata is detected once from the given frame and shared by both tuning
    runs. Returns ``(tvae_scores, ctgan_scores)`` exactly as produced by
    ``tune_tvae`` and ``tune_ctgan`` (TVAE is tuned first).
    """
    # Describe the table once; both tuners receive the same metadata object
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(data=features_ta_r_imputed_df)

    return (
        tune_tvae(features_ta_r_imputed_df, table_metadata),
        tune_ctgan(features_ta_r_imputed_df, table_metadata),
    )

In [24]:
# Train and score every TVAE and CTGAN configuration on the cleaned data
# (2 x 2 x 2 settings per model family at 500 epochs each: 16 model fits in total).
tvae_scores, ctgan_scores = run_augmentation_pipeline(imputed_data)

In [25]:
# Quality scores of each model family, ranked from best to worst. Each column is
# sorted on its own, so a row is a rank position rather than a shared configuration.
results = pd.DataFrame(
    {
        "TVAE-TA-R": sorted((scored[0] for scored in tvae_scores), reverse=True),
        "CTGAN-TA-R": sorted((scored[0] for scored in ctgan_scores), reverse=True),
    }
)
results

Unnamed: 0,TVAE-TA-R,CTGAN-TA-R
0,0.867345,0.76425
1,0.859138,0.762946
2,0.855135,0.759312
3,0.851106,0.747622
4,0.840747,0.711818
5,0.839228,0.701221
6,0.835476,0.694926
7,0.816627,0.692241


In [34]:
# Pick the highest-scoring (score, synthesizer) pair for each model family.
# Compare on the score only: ordering the raw tuples falls back to comparing the
# synthesizer objects whenever two scores tie, which is not a supported ordering.
best_tvae = max(tvae_scores, key=lambda scored: scored[0])
best_ctgan = max(ctgan_scores, key=lambda scored: scored[0])

# Save best models
path = "{}_best_models/".format(MAIN_PATH)
os.makedirs(path, exist_ok=True)  # creates the directory only when it is missing
best_tvae[1].save(
    filepath=os.path.join(path, "tvae.pkl")
)
best_ctgan[1].save(
    filepath=os.path.join(path, "ctgan.pkl")
)

In [37]:
# Loading the synthesizer
# NOTE(review): the ".pkl" file is presumably unpickled on load, so only load files
# that this notebook (or another trusted source) produced.
path = "{}_best_models/".format(MAIN_PATH)
best_synthesizer = TVAESynthesizer.load(
    # os.path.join avoids the doubled separator that `path + "/" + name` produced
    filepath=os.path.join(path, "tvae.pkl")
)

# Generating 2000 synthetic observations using the trained model
synthetic_data = best_synthesizer.sample(num_rows=2000, batch_size=100)

Sampling rows:   0%|          | 0/2000 [00:00<?, ?it/s]

Sampling rows: 100%|██████████| 2000/2000 [00:00<00:00, 2140.76it/s]


In [47]:
# Combine with original data: synthetic rows first, then the imputed real rows.
# ignore_index=True gives the result a single unique 0..n-1 row index instead of
# keeping each input's own labels, which can repeat across the two frames.
full_data = pd.concat([synthetic_data, imputed_data], axis=0, ignore_index=True)

#### Data preprocessing

In [49]:
# Scale the data
# The scaler is fitted on every row of full_data (synthetic and real rows together).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(full_data)

# Handle categorical variables
# Re-attach full_data's column names to the scaled values before encoding.
# NOTE(review): every column has just been through StandardScaler, so the frame is
# presumably all-numeric and get_dummies may leave it unchanged — confirm.
df = pd.DataFrame(scaled_data, columns=full_data.columns)
df_encoded = pd.get_dummies(df)

#### Splitting the data

In [50]:
# The synthetic rows come first in full_data, so everything after them is the
# original (imputed) data. Use the actual synthetic row count, not a literal 2000,
# so this stays correct if the number of sampled rows changes.
original_data = full_data.iloc[len(synthetic_data):, :]

#### Modelling

(2000, 14)