In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sdv
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate
from sklearn.base import clone
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import os
from catboost import CatBoostRegressor
from pytorch_tabnet.tab_model import TabNetRegressor

In [76]:
# Global variable
CURR_ACTIVITY = "mmo"
CURR_MUSCLE = "ta_r"
CURR_TYPE = "intensity"

In [15]:
# Read the data
all_data = pd.read_excel("Biopak data compiled final 1-70 v2.xlsx")

# Extract just JVA data
n_columns = len(all_data.columns)
step = 14
if CURR_ACTIVITY == "map":
    start = n_columns - step
elif CURR_ACTIVITY == "mle":
    start = n_columns - (step*2)
else:
    start = n_columns - (step*3)
data = all_data.iloc[:,start:start+step].copy()

In [16]:
# Visualize the data
data.head()

Unnamed: 0,JVA MMO left integral,JVA MMO left Integral <300Hz,JVA MMO left Integral >300Hz,JVA MMO left Integral ratio,JVA MMO left peak amplitude,JVA MMO left peak frequency,JVA MMO left median frequency,JVA MMO right integral,JVA MMO right Integral <300Hz,JVA MMO right Integral >300Hz,JVA MMO right Integral ratio,JVA MMO right peak amplitude,JVA MMO right peak frequency,JVA MMO right median frequency
0,18.8,18.1,0.7,0.04,2.7,20,54,22.5,21.6,0.9,0.04,4.0,24,49
1,19.8,17.4,2.4,0.14,1.2,39,112,22.7,20.2,2.5,0.12,1.1,98,127
2,17.5,16.2,1.3,0.08,1.8,78,88,20.3,18.1,2.1,0.12,2.2,24,73
3,17.0,15.3,1.7,0.11,1.8,78,98,11.6,10.5,1.1,0.1,1.1,20,83
4,49.6,46.9,2.7,0.06,6.2,68,73,65.5,62.0,3.6,0.06,8.2,20,88


In [17]:
def add_quotients(muscle, data):
    intensity_quotients = pd.read_csv(f"quotients/{muscle}/quotient_intensity.csv")
    duration_quotients = pd.read_csv(f"quotients/{muscle}/quotient_duration.csv")
    temp_data_intensity = intensity_quotients.copy()
    temp_data_duration = duration_quotients.copy()
    temp_data_intensity.columns = [f"{muscle}_intensity_" + name
                                for name in temp_data_intensity.columns]
    temp_data_duration.columns = [f"{muscle}_duration_" + name
                                for name in temp_data_duration.columns]
    
    indices_to_insert_nan = [1, 40, 42, 60]

    # Insert NaN values without replacing existing values
    for index in indices_to_insert_nan:
        temp_data_intensity = pd.concat([temp_data_intensity.iloc[:index],
                                        pd.DataFrame([[np.nan] * len(temp_data_intensity.columns)],
                                                    columns=temp_data_intensity.columns),
                                                    temp_data_intensity.iloc[index:]],
                                                    ignore_index=True).copy()
        temp_data_duration = pd.concat([temp_data_duration.iloc[:index],
                                        pd.DataFrame([[np.nan] * len(temp_data_duration.columns)],
                                                    columns=temp_data_duration.columns),
                                                    temp_data_duration.iloc[index:]],
                                                    ignore_index=True).copy()
        
    # Add quotients to JVA data
    new_data = pd.concat([data, temp_data_intensity], axis=1)
    new_data = pd.concat([new_data, temp_data_duration], axis=1)

    return new_data

In [18]:
# Data with quotient
new_data = add_quotients(CURR_ACTIVITY, data)
new_data = new_data.dropna().copy()

In [19]:
new_data.head()

Unnamed: 0,JVA MMO left integral,JVA MMO left Integral <300Hz,JVA MMO left Integral >300Hz,JVA MMO left Integral ratio,JVA MMO left peak amplitude,JVA MMO left peak frequency,JVA MMO left median frequency,JVA MMO right integral,JVA MMO right Integral <300Hz,JVA MMO right Integral >300Hz,...,mmo_intensity_mm_r,mmo_intensity_mm_l,mmo_intensity_da_r,mmo_intensity_da_l,mmo_duration_ta_r,mmo_duration_ta_l,mmo_duration_mm_r,mmo_duration_mm_l,mmo_duration_da_r,mmo_duration_da_l
0,18.8,18.1,0.7,0.04,2.7,20,54,22.5,21.6,0.9,...,0.18,0.19,0.18,0.21,9.47,25.17,73.17,94.0,87.92,137.67
2,17.5,16.2,1.3,0.08,1.8,78,88,20.3,18.1,2.1,...,0.1,0.1,0.11,0.09,74.83,78.83,87.67,10.0,92.58,9.0
3,17.0,15.3,1.7,0.11,1.8,78,98,11.6,10.5,1.1,...,0.17,0.18,0.18,0.17,16.7,3.5,47.0,67.83,14.75,68.83
4,49.6,46.9,2.7,0.06,6.2,68,73,65.5,62.0,3.6,...,0.12,0.13,0.13,0.12,30.87,17.67,32.83,53.67,28.92,54.67
5,10.4,9.5,0.9,0.1,1.2,20,73,17.7,16.5,1.3,...,0.11,0.11,0.1,0.13,24.87,11.67,157.33,59.67,53.58,103.33


In [20]:
# Function to remove features with 0 variance (constant value features)
def get_const_value_features_to_drop(df):
    return [e for e in df.columns if df[e].nunique() == 1]

def impute_and_remove_zero_var(features_ta_r):
    # Performing imputation to replace any NaN value in the dataset with the median of the feature
    imputer = SimpleImputer(strategy="median")
    features_ta_r_imputed = imputer.fit_transform(features_ta_r)
    features_ta_r_imputed_df = pd.DataFrame(features_ta_r_imputed,
                                            columns=features_ta_r.columns)

    # Remove zero variance features
    columns_to_remove = get_const_value_features_to_drop(features_ta_r_imputed_df)
    features_ta_r_imputed_df.drop(columns=columns_to_remove, inplace=True)

    return features_ta_r_imputed_df

imputed_data = impute_and_remove_zero_var(new_data)

In [21]:
def tune_tvae(features_ta_r_imputed_df, metadata):
    # Tuning a variational autoencoder
    tvae_scores = []
    embedding_dims = [128, 256]
    compress_dims = [128, 256]
    decompress_dims = [128, 256]

    for embedding_dim in embedding_dims:
        for compress_dim in compress_dims:
            for decompress_dim in decompress_dims:
                # Creating a Variational Autoencoder synthesizer
                tvae_synthesizer = TVAESynthesizer(metadata,
                                                embedding_dim=embedding_dim,
                                                compress_dims=(compress_dim,compress_dim),
                                                decompress_dims=(decompress_dim,decompress_dim),
                                                epochs=500)
                
                # Fitting the model
                tvae_synthesizer.fit(features_ta_r_imputed_df)
                
                # Generating synthetic data
                synthetic_data = tvae_synthesizer.sample(num_rows=200)

                # Evaluating synthetic data
                quality_report = evaluate_quality(
                    features_ta_r_imputed_df,
                    synthetic_data,
                    metadata,
                    verbose=False
                )

                tvae_scores.append((quality_report.get_score(), tvae_synthesizer))

    return tvae_scores

def tune_ctgan(features_ta_r_imputed_df, metadata):
    # Tuning a ctgan
    ctgan_scores = []
    embedding_dims = [256, 512]
    generator_dims = [256, 512]
    discriminator_dims = [128, 256]

    for embedding_dim in embedding_dims:
        for generator_dim in generator_dims:
            for discriminator_dim in discriminator_dims:
                # Creating a ctgan synthesizer
                ctgan_synthesizer = CTGANSynthesizer(metadata,
                                                    embedding_dim=embedding_dim,
                                                    generator_dim=(generator_dim,generator_dim),
                                                    discriminator_dim=(discriminator_dim,discriminator_dim),
                                                    epochs=500)
                
                # Fitting the model
                ctgan_synthesizer.fit(features_ta_r_imputed_df)
                
                # Generating synthetic data
                synthetic_data = ctgan_synthesizer.sample(num_rows=200)

                # Evaluating synthetic data
                quality_report = evaluate_quality(
                    features_ta_r_imputed_df,
                    synthetic_data,
                    metadata,
                    verbose=False
                )

                ctgan_scores.append((quality_report.get_score(), ctgan_synthesizer))

    return ctgan_scores

def run_augmentation_pipeline(features_ta_r_imputed_df):

    # Creating metadata object to get metadata about the original dataset of extracted features
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=features_ta_r_imputed_df)

    # Get tvae and ctgan tuning results
    tvae_scores = tune_tvae(features_ta_r_imputed_df, metadata)
    ctgan_scores = tune_ctgan(features_ta_r_imputed_df, metadata)

    # Return scores
    return tvae_scores, ctgan_scores

tvae_scores, ctgan_scores = run_augmentation_pipeline(imputed_data)

In [22]:
# Creating a dataframe of tuning results for all
results = pd.DataFrame({
    "TVAE": sorted([each[0] for each in tvae_scores], reverse=True),
    "CTGAN": sorted([each[0] for each in ctgan_scores], reverse=True),
})
results

Unnamed: 0,TVAE,CTGAN
0,0.863684,0.782423
1,0.862057,0.778562
2,0.859711,0.750616
3,0.858614,0.748247
4,0.856019,0.725311
5,0.849368,0.718764
6,0.846309,0.711128
7,0.845648,0.683741


In [29]:
# obtaining the best models for each of the six muscles
best_tvae = sorted(tvae_scores, reverse=True)[0]

# Generating 2000 synthetic observations using the trained model
synthetic_data = best_tvae[1].sample(num_rows=2000, batch_size=100)

Sampling rows: 100%|██████████| 2000/2000 [00:00<00:00, 2195.28it/s]


In [72]:
len_columns = len(synthetic_data.columns)
columns = synthetic_data.columns[:len_columns-12]

In [77]:
# get response variable
y_train = synthetic_data[f"{CURR_ACTIVITY}_{CURR_TYPE}_{CURR_MUSCLE}"].copy()
y_test = imputed_data[f"{CURR_ACTIVITY}_{CURR_TYPE}_{CURR_MUSCLE}"].copy()
X_train = synthetic_data[columns]
X_test = imputed_data[columns]

#### Preprocessing

In [81]:
# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train).copy()
X_test_scaled = scaler.transform(X_test).copy()

#### Modelling

In [104]:
def get_lr_results(X_train, y_train):
    scores_lr = []
    # Create a CV instance
    cv_lr = KFold(n_splits=5, shuffle=True, random_state=42)
    for train, val in cv_lr.split(X_train, y_train):
        X_train_temp = X_train[train]
        X_val_temp = X_train[val]
        y_train_temp = y_train.values[train]
        y_val_temp = y_train.values[val]

        # Define model
        lr_model = LinearRegression()
        lr_model.fit(X_train_temp, y_train_temp)

        # Predictions
        predictions_train = lr_model.predict(X_train_temp)
        predictions_val = lr_model.predict(X_val_temp)

        train_rmse = np.sqrt(mean_squared_error(y_train_temp,
                                                predictions_train))
        val_rmse = np.sqrt(mean_squared_error(y_val_temp,
                                            predictions_val))
        scores_lr.append((val_rmse, train_rmse))

    # Compute average scores
    avg_val_rmse_lr = np.sum([each[0]
                             for each in scores_lr]) / len(scores_lr)
    avg_train_rmse_lr = np.sum([each[1]
                               for each in scores_lr]) / len(scores_lr)
    
    # Final fitted model
    final_lr = LinearRegression()
    final_lr.fit(X_train, y_train)

    return ((avg_train_rmse_lr,
             avg_val_rmse_lr),
             final_lr)

In [100]:
"""
The following function was taken from
https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html
"""
def fit_and_score(estimator, X_train, X_test, y_train, y_test):
    """Fit the estimator on the train set and score it on both sets"""
    estimator.fit(X_train, y_train,
                  eval_set=[(X_test, y_test)], verbose=False)
    predictions_train = estimator.predict(X_train)
    predictions_test = estimator.predict(X_test)
    train_score = np.sqrt(mean_squared_error(y_train,
                                             predictions_train)) #estimator.score(X_train, y_train)
    test_score = np.sqrt(mean_squared_error(y_test,
                                            predictions_test)) #estimator.score(X_test, y_test)
    return estimator, train_score, test_score

def get_catboost_results(X_train, y_train):
    model = CatBoostRegressor(early_stopping_rounds=10)
    params = {
        'depth':range(3,6,2),
        'iterations':[100,500,1000],
        'learning_rate':[0.001,0.01,0.03,0.1], 
        # 'l2_leaf_reg': [1, 3, 5, 7, 9],
        # 'border_count':[32,5,10,20,50,100,200],
        # 'ctr_border_count':[50,5,10,20,100,200],
        # 'thread_count':4
    }
    results = model.grid_search(params,
                                X=X_train,
                                y=y_train,
                                cv=5,
                                refit=True,
                                partition_random_seed=42)
    
    # Refit using best params to create best model
    best_params = results["params"]
    best_model = CatBoostRegressor(**best_params)
    best_model.fit(X_train, y_train)

    # Get training and validation scores
    best_train_score = min(results['cv_results']['train-RMSE-mean'])
    best_val_score = min(results['cv_results']['test-RMSE-mean'])

    scores = {
        "Best Train RMSE": best_train_score,
        "Best Validation RMSE": best_val_score,
    }
    return scores, best_model

In [105]:
def get_xgboost_results(X_train, y_train):
    # Parameters to tune in the grid
    params_to_test = {
        'max_depth': range(3,10,2),
        'min_child_weight': range(1,6,2),
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.1, 0.01, 0.001]
    }

    # Performing gridSearch manually
    best_xgb_scores = None
    best_xgb_model = None
    curr_val_score = float('inf')
    for depth in params_to_test['max_depth']:
        for weight in params_to_test['min_child_weight']:
            for lr in params_to_test['learning_rate']:
                for estimator_count in params_to_test['n_estimators']:
                    # Create a CV instance
                    cv = KFold(n_splits=5, shuffle=True, random_state=42)

                    # XGBoost model
                    xgb_model = XGBRegressor(n_estimators=estimator_count, max_depth=depth,
                                            min_child_weight=weight, early_stopping_rounds=10,
                                            learning_rate=lr)

                    # Add cross validation results
                    total_train_score = 0
                    total_val_score = 0
                    for train, val in cv.split(X_train, y_train):
                        X_train_temp = X_train[train]
                        X_val_temp = X_train[val]
                        y_train_temp = y_train.values[train]
                        y_val_temp = y_train.values[val]

                        est, train_score, test_score = fit_and_score(
                            clone(xgb_model), X_train_temp,
                            X_val_temp, y_train_temp,
                            y_val_temp
                        )
                        total_train_score += train_score
                        total_val_score += test_score

                        avg_val_score = total_val_score / 5
                        if avg_val_score < curr_val_score:
                            curr_val_score = avg_val_score
                            best_xgb_scores = (total_train_score / 5, avg_val_score)
                            best_xgb_model = est

    # Determine best model (refitted)
    best_params = best_xgb_model.get_params()
    best_params["early_stopping_rounds"] = None
    best_model = XGBRegressor(**best_params)
    best_model.fit(X_train, y_train)
    return best_xgb_scores, best_model


In [106]:
def get_rf_results(X_train, y_train):
    params_to_test = {
        'max_depth': range(3,10,2),
        'n_estimators': [100, 500, 1000],
        'max_features': [1.0, 'sqrt', 'log2']
    }
    rf_grid = GridSearchCV(RandomForestRegressor(random_state=42),
                           params_to_test, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

    # Best scores
    rf_grid.fit(X_train, y_train)
    rf_train_score = np.sqrt(
        -rf_grid.cv_results_['mean_train_score'][rf_grid.best_index_])
    rf_val_score = np.sqrt(-rf_grid.best_score_)

    # Best params
    best_params = rf_grid.best_params_
    best_model = RandomForestRegressor(**best_params)
    best_model.fit(X_train, y_train)

    return (rf_train_score, rf_val_score), best_model

In [115]:
def get_tabnet_results(X_train, y_train):
    reduced_X_train, X_val, reduced_y_train, y_val = \
        train_test_split(X_train, y_train)
    model = TabNetRegressor()
    model.fit(reduced_X_train,
              reduced_y_train,
              eval_metric="rmse",
              eval_set=[(X_val, y_val)])
    return model

In [117]:
tabnet_results = get_tabnet_results(X_train_scaled,
                                    y_train.values.reshape(-1,1))



AssertionError: r is not available, choose in ['auc', 'accuracy', 'balanced_accuracy', 'logloss', 'mae', 'mse', 'rmsle', 'unsup_loss', 'unsup_loss_numpy', 'rmse']

In [92]:
catboost_results = get_catboost_results(X_train_scaled,
                                        y_train)
xgboost_results = get_xgboost_results(X_train_scaled,
                                      y_train)

0:	learn: 0.1443760	test: 0.1417424	best: 0.1417424 (0)	total: 2.54ms	remaining: 2.54s
1:	learn: 0.1402942	test: 0.1376521	best: 0.1376521 (1)	total: 7.55ms	remaining: 3.77s
2:	learn: 0.1363700	test: 0.1337351	best: 0.1337351 (2)	total: 9.44ms	remaining: 3.14s
3:	learn: 0.1325437	test: 0.1299155	best: 0.1299155 (3)	total: 10.7ms	remaining: 2.67s
4:	learn: 0.1288395	test: 0.1261986	best: 0.1261986 (4)	total: 13.1ms	remaining: 2.6s
5:	learn: 0.1252603	test: 0.1226070	best: 0.1226070 (5)	total: 36.4ms	remaining: 6.02s
6:	learn: 0.1217800	test: 0.1191099	best: 0.1191099 (6)	total: 37ms	remaining: 5.25s
7:	learn: 0.1183961	test: 0.1157392	best: 0.1157392 (7)	total: 40ms	remaining: 4.96s
8:	learn: 0.1151409	test: 0.1124734	best: 0.1124734 (8)	total: 41.9ms	remaining: 4.61s
9:	learn: 0.1119800	test: 0.1093094	best: 0.1093094 (9)	total: 43.4ms	remaining: 4.29s
10:	learn: 0.1089319	test: 0.1062490	best: 0.1062490 (10)	total: 45ms	remaining: 4.05s
11:	learn: 0.1059874	test: 0.1032941	best: 0.103

In [94]:
catboost_preds = catboost_results[1].predict(X_test_scaled)

In [97]:
np.sqrt(mean_squared_error(y_test, catboost_preds)), mean_absolute_error(y_test, catboost_preds)

(0.04608289668388523, 0.03651715887540573)