In [235]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sdv
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import os

In [236]:
# Global variable
CURR_ACTIVITY = "mmo"

In [237]:
# Read the data
all_data = pd.read_excel("Biopak data compiled final 1-70 v2.xlsx")

# Extract just JVA data
n_columns = len(all_data.columns)
step = 14
if CURR_ACTIVITY == "map":
    start = n_columns - step
elif CURR_ACTIVITY == "mle":
    start = n_columns - (step*2)
else:
    start = n_columns - (step*3)
data = all_data.iloc[:,start:start+step].copy()

data["ROM Slant"] = all_data["ROM Slant"]
data["Max opening"] = all_data["Deviation max point during opening slant"]
data["Max closing"] = all_data["Deviation max point during closing slant"]
data["Opening lateral right"] = all_data["Deviation max point during opening Lateral right"]
data["Opening lateral left"] = all_data["Deviation max point during opening Lateral left"]
data["Closing lateral right"] = all_data["Deviation max point during closing Lateral right"]
data["Closing lateral left"] = all_data["Deviation max point during closing Lateral left"]

In [238]:
'''
This function adds duration and intensity quotients to the existing data.
'''
def add_quotients(muscle, data):
    intensity_quotients = pd.read_csv(f"quotients/{muscle}/quotient_intensity.csv")
    duration_quotients = pd.read_csv(f"quotients/{muscle}/quotient_duration.csv")
    temp_data_intensity = intensity_quotients.copy()
    temp_data_duration = duration_quotients.copy()
    temp_data_intensity.columns = [f"{muscle}_intensity_" + name
                                for name in temp_data_intensity.columns]
    temp_data_duration.columns = [f"{muscle}_duration_" + name
                                for name in temp_data_duration.columns]
    
    indices_to_insert_nan = [1, 40, 42, 60]

    # Insert NaN values without replacing existing values
    for index in indices_to_insert_nan:
        temp_data_intensity = pd.concat([temp_data_intensity.iloc[:index],
                                        pd.DataFrame([[np.nan] * len(temp_data_intensity.columns)],
                                                    columns=temp_data_intensity.columns),
                                                    temp_data_intensity.iloc[index:]],
                                                    ignore_index=True).copy()
        temp_data_duration = pd.concat([temp_data_duration.iloc[:index],
                                        pd.DataFrame([[np.nan] * len(temp_data_duration.columns)],
                                                    columns=temp_data_duration.columns),
                                                    temp_data_duration.iloc[index:]],
                                                    ignore_index=True).copy()
        
    # Add quotients to JVA data
    new_data = pd.concat([data, temp_data_intensity], axis=1)
    new_data = pd.concat([new_data, temp_data_duration], axis=1)

    return new_data

In [239]:
# Data with quotient
new_data = add_quotients(CURR_ACTIVITY, data)
new_data = new_data.dropna().copy()

In [240]:
new_data.columns

Index(['JVA MMO left integral', 'JVA MMO left Integral <300Hz',
       'JVA MMO left Integral >300Hz', 'JVA MMO left Integral ratio',
       'JVA MMO left peak amplitude', 'JVA MMO left peak frequency',
       'JVA MMO left median frequency', 'JVA MMO right integral',
       'JVA MMO right Integral <300Hz', 'JVA MMO right Integral >300Hz',
       'JVA MMO right Integral ratio', 'JVA MMO right peak amplitude',
       'JVA MMO right peak frequency', 'JVA MMO right median frequency',
       'ROM Slant', 'Max opening', 'Max closing', 'Opening lateral right',
       'Opening lateral left', 'Closing lateral right', 'Closing lateral left',
       'mmo_intensity_ta_r', 'mmo_intensity_ta_l', 'mmo_intensity_mm_r',
       'mmo_intensity_mm_l', 'mmo_intensity_da_r', 'mmo_intensity_da_l',
       'mmo_duration_ta_r', 'mmo_duration_ta_l', 'mmo_duration_mm_r',
       'mmo_duration_mm_l', 'mmo_duration_da_r', 'mmo_duration_da_l'],
      dtype='object')

#### Synthetic data generation

In [241]:
# Function to remove features with 0 variance (constant value features)
def get_const_value_features_to_drop(df):
    return [e for e in df.columns if df[e].nunique() == 1]

def impute_and_remove_zero_var(features_ta_r):
    # Remove zero variance features
    columns_to_remove = get_const_value_features_to_drop(features_ta_r)
    features_ta_r.drop(columns=columns_to_remove, inplace=True)

    return features_ta_r

processed_data = impute_and_remove_zero_var(new_data)

In [242]:
def tune_tvae(features_ta_r_imputed_df, metadata):
    # Tuning a variational autoencoder
    tvae_scores = []
    embedding_dims = [128, 256]
    compress_dims = [128, 256]
    decompress_dims = [128, 256]

    for embedding_dim in embedding_dims:
        for compress_dim in compress_dims:
            for decompress_dim in decompress_dims:
                # Creating a Variational Autoencoder synthesizer
                tvae_synthesizer = TVAESynthesizer(metadata,
                                                embedding_dim=embedding_dim,
                                                compress_dims=(compress_dim,compress_dim),
                                                decompress_dims=(decompress_dim,decompress_dim),
                                                epochs=500)
                
                # Fitting the model
                tvae_synthesizer.fit(features_ta_r_imputed_df)
                
                # Generating synthetic data
                synthetic_data = tvae_synthesizer.sample(num_rows=200)

                # Evaluating synthetic data
                quality_report = evaluate_quality(
                    features_ta_r_imputed_df,
                    synthetic_data,
                    metadata,
                    verbose=False
                )

                tvae_scores.append((quality_report.get_score(), tvae_synthesizer))

    return tvae_scores

def tune_ctgan(features_ta_r_imputed_df, metadata):
    # Tuning a ctgan
    ctgan_scores = []
    embedding_dims = [256, 512]
    generator_dims = [256, 512]
    discriminator_dims = [128, 256]

    for embedding_dim in embedding_dims:
        for generator_dim in generator_dims:
            for discriminator_dim in discriminator_dims:
                # Creating a ctgan synthesizer
                ctgan_synthesizer = CTGANSynthesizer(metadata,
                                                    embedding_dim=embedding_dim,
                                                    generator_dim=(generator_dim,generator_dim),
                                                    discriminator_dim=(discriminator_dim,discriminator_dim),
                                                    epochs=500)
                
                # Fitting the model
                ctgan_synthesizer.fit(features_ta_r_imputed_df)
                
                # Generating synthetic data
                synthetic_data = ctgan_synthesizer.sample(num_rows=200)

                # Evaluating synthetic data
                quality_report = evaluate_quality(
                    features_ta_r_imputed_df,
                    synthetic_data,
                    metadata,
                    verbose=False
                )

                ctgan_scores.append((quality_report.get_score(), ctgan_synthesizer))

    return ctgan_scores

def run_augmentation_pipeline(features_ta_r_imputed_df):

    # Creating metadata object to get metadata about the original dataset of extracted features
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=features_ta_r_imputed_df)

    # Get tvae and ctgan tuning results
    tvae_scores = tune_tvae(features_ta_r_imputed_df, metadata)
    ctgan_scores = tune_ctgan(features_ta_r_imputed_df, metadata)

    # Return scores
    return tvae_scores, ctgan_scores

tvae_scores, ctgan_scores = run_augmentation_pipeline(processed_data)

In [247]:
# Creating a dataframe of tuning results for all
results = pd.DataFrame({
    "TVAE": sorted([each[0] for each in tvae_scores], reverse=True),
    "CTGAN": sorted([each[0] for each in ctgan_scores], reverse=True),
})
results

Unnamed: 0,TVAE,CTGAN
0,0.865984,0.783699
1,0.854784,0.783497
2,0.85428,0.773307
3,0.851297,0.758952
4,0.848791,0.745825
5,0.845079,0.745177
6,0.833094,0.744206
7,0.831847,0.726132


In [248]:
# obtaining the best models for each of the six muscles
best_tvae = sorted(tvae_scores, reverse=True)[0]
best_ctgan = sorted(ctgan_scores, reverse=True)[0]

# Save best models
path = "obj_2_best_models/"
if not os.path.exists(path):  
    os.makedirs(path)
best_tvae[1].save(
    filepath = path + "/" + "tvae.pkl"
)
best_ctgan[1].save(
    filepath = path + "/" + "ctgan.pkl"
)

In [257]:
# Loading the synthesizer
path = "obj_2_best_models/"
best_synthesizer = TVAESynthesizer.load(
    filepath = path + "/" + "tvae.pkl"
)

# Generating 2000 synthetic observations using the trained model
synthetic_data = best_synthesizer.sample(num_rows=2000, batch_size=100)

Sampling rows: 100%|██████████| 2000/2000 [00:02<00:00, 915.00it/s]


In [258]:
indices_to_drop_opening = []
for i,(a,b) in enumerate(zip(synthetic_data["Opening lateral left"],
                             synthetic_data["Opening lateral right"])):
    if a > 0 and b > 0:
        indices_to_drop_opening.append(i)
synthetic_data.drop(indices_to_drop_opening, inplace=True)
# Reset indices
synthetic_data = synthetic_data.reset_index(drop=True)

indices_to_drop_closing = []
for i,(a,b) in enumerate(zip(synthetic_data["Closing lateral left"],
                             synthetic_data["Closing lateral right"])):
    if a > 0 and b > 0:
        indices_to_drop_closing.append(i)
synthetic_data.drop(indices_to_drop_closing, inplace=True)
# Reset indices
synthetic_data = synthetic_data.reset_index(drop=True)

In [260]:
# combine with original data
full_data = pd.concat([synthetic_data, processed_data], axis=0)

#### Add final target variables of deviation

In [262]:
full_data["Lateral deviation opening"] = np.abs(
    full_data["Opening lateral left"] - full_data["Opening lateral right"])
full_data["Lateral deviation closing"] = np.abs(
    full_data["Closing lateral left"] - full_data["Closing lateral right"])

In [263]:
# Fixing names of columns
column_names = full_data.columns
new_column_names = [
    each.replace("<", "less than ").replace(">", "greater than ").replace(",", " ")
    for each in column_names]
full_data.columns = new_column_names

In [264]:
full_data.head()

Unnamed: 0,JVA MMO left integral,JVA MMO left Integral less than 300Hz,JVA MMO left Integral greater than 300Hz,JVA MMO left Integral ratio,JVA MMO left peak amplitude,JVA MMO left peak frequency,JVA MMO left median frequency,JVA MMO right integral,JVA MMO right Integral less than 300Hz,JVA MMO right Integral greater than 300Hz,...,mmo_intensity_da_r,mmo_intensity_da_l,mmo_duration_ta_r,mmo_duration_ta_l,mmo_duration_mm_r,mmo_duration_mm_l,mmo_duration_da_r,mmo_duration_da_l,Lateral deviation opening,Lateral deviation closing
0,11.9,7.0,0.6,0.08,2.0,27,64,14.8,13.9,1.5,...,0.1,0.1,52.69,12.8,96.89,56.62,36.86,40.65,0.4,1.6
1,15.8,14.2,1.3,0.09,0.9,81,79,13.0,7.2,1.0,...,0.09,0.13,23.54,39.19,123.74,50.61,26.15,54.93,0.0,0.0
2,10.2,8.9,0.4,0.08,0.7,77,80,10.5,14.5,0.9,...,0.09,0.08,8.68,27.54,95.01,57.42,49.5,85.0,2.5,3.1
3,17.3,10.9,1.4,0.11,0.8,61,72,16.1,20.5,1.5,...,0.08,0.07,39.08,15.65,114.11,70.23,25.75,107.22,0.0,0.1
4,21.9,23.2,1.5,0.03,2.1,54,63,20.3,59.0,2.3,...,0.18,0.12,38.08,124.6,53.96,36.2,50.15,21.97,3.0,3.8


#### Add first categorical column

In [265]:
def create_first_categorical(col_opening, col_closing):
    cat = []
    for val_1, val_2 in zip(col_opening,
                            col_closing):
        if val_1 > 0 and val_2 > 0:
            cat.append("both")
        elif val_1 == 0 and val_2 == 0:
            cat.append("no")
        else:
            if val_1 > 0:
                cat.append("opening")
            else:
                cat.append("closing")
    return cat

full_data["Deviation scenario"] = create_first_categorical(full_data["Max opening"],
                                                           full_data["Max closing"])


#### Add second categorical column

In [266]:
def create_second_categorical(column, comparison_column):
    cat = []
    for val_1, val_2 in zip(column,
                            comparison_column):
        if val_1 == 0:
            cat.append("no")
        else:
            step = val_2 / 3
            if val_1 < step:
                cat.append("first")
            elif val_1 >= step and val_1 < step * 2:
                cat.append("second")
            elif val_1 >= step * 2:
                cat.append("third")
    return cat

full_data["Deviation opening"] = create_second_categorical(full_data["Max opening"],
                                                           full_data["ROM Slant"])
full_data["Deviation closing"] = create_second_categorical(full_data["Max closing"],
                                                           full_data["ROM Slant"])

#### Add third categorical column

In [267]:
def create_third_categorical(col_left, col_right):
    cat = []
    for val_1, val_2 in zip(col_left,
                            col_right):
        if val_1 == 0 and val_2 == 0:
            cat.append("no")
        elif val_1 > 0 and val_2 > 0:
            cat.append("both")
        else:
            if val_1 > 0:
                cat.append("left")
            else:
                cat.append("right")
    return cat

full_data["Deviation direction opening"] = create_third_categorical(full_data["Opening lateral left"],
                                                                    full_data["Opening lateral right"])
full_data["Deviation direction closing"] = create_third_categorical(full_data["Closing lateral left"],
                                                                    full_data["Closing lateral right"])

In [296]:
full_data.columns

Index(['JVA MMO left integral', 'JVA MMO left Integral less than 300Hz',
       'JVA MMO left Integral greater than 300Hz',
       'JVA MMO left Integral ratio', 'JVA MMO left peak amplitude',
       'JVA MMO left peak frequency', 'JVA MMO left median frequency',
       'JVA MMO right integral', 'JVA MMO right Integral less than 300Hz',
       'JVA MMO right Integral greater than 300Hz',
       'JVA MMO right Integral ratio', 'JVA MMO right peak amplitude',
       'JVA MMO right peak frequency', 'JVA MMO right median frequency',
       'ROM Slant', 'Max opening', 'Max closing', 'Opening lateral right',
       'Opening lateral left', 'Closing lateral right', 'Closing lateral left',
       'mmo_intensity_ta_r', 'mmo_intensity_ta_l', 'mmo_intensity_mm_r',
       'mmo_intensity_mm_l', 'mmo_intensity_da_r', 'mmo_intensity_da_l',
       'mmo_duration_ta_r', 'mmo_duration_ta_l', 'mmo_duration_mm_r',
       'mmo_duration_mm_l', 'mmo_duration_da_r', 'mmo_duration_da_l',
       'Lateral deviatio

#### Common functions

In [297]:
def get_splits(X, y, type = "classification"):
    # Separating original observations
    # original_x = X.iloc[2000:,:]
    # original_y = y.iloc[2000:]
    # X_temp = X.iloc[:2000,:]
    # y_temp = y.iloc[:2000]

    # print(y_temp.value_counts())
    # # Creating train test splits
    # if type == "classification":
    #     X_train, X_test, y_train, y_test = train_test_split(
    #         X_temp, y_temp, train_size=0.8,
    #         random_state=42, stratify=y_temp
    #     )
    # else:
    #     X_train, X_test, y_train, y_test = train_test_split(
    #         X_temp, y_temp, train_size=0.8,
    #         random_state=42
    #     )
    # X_test = pd.concat([X_test, original_x]).copy()
    # y_test = pd.concat([y_test, original_y]).copy()
    # print(y_test.value_counts())
    total_synthetic = len(X) - 66
    X_train = X.iloc[:total_synthetic,:].copy()
    y_train = y.iloc[:total_synthetic].copy()
    X_test = X.iloc[total_synthetic:,].copy()
    y_test = y.iloc[total_synthetic:].copy()
    return (X_train, X_test, 
            y_train, y_test)

"""
The following function was taken from
https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html
"""
def fit_and_score(estimator, X_train,
                  X_test, y_train,
                  y_test, type="classification"):
    """Fit the estimator on the train set and score it on both sets"""
    estimator.fit(X_train, y_train,
                  eval_set=[(X_test, y_test)], verbose=False)
    predictions_train = estimator.predict(X_train)
    predictions_test = estimator.predict(X_test)
    if type == "classification":
        train_score = estimator.score(X_train, y_train)
        test_score = estimator.score(X_test, y_test)
    else:
        train_score = np.sqrt(mean_squared_error(y_train,
                                                predictions_train))
        test_score = np.sqrt(mean_squared_error(y_test,
                                                predictions_test))
    return estimator, train_score, test_score

def do_logistic_r(X_train, y_train):
    scores_lr = []
    # Create a CV instance
    cv_lr = StratifiedKFold(n_splits=5, shuffle=True,
                            random_state=42)
    for train, val in cv_lr.split(X_train, y_train):
        X_train_temp = X_train[train]
        X_val_temp = X_train[val]
        y_train_temp = y_train[train]
        y_val_temp = y_train[val]

        # Define model
        lr_model = LogisticRegression(multi_class='multinomial',
                                      max_iter=1000)
        lr_model.fit(X_train_temp, y_train_temp)

        # Predictions
        predictions_train = lr_model.predict(X_train_temp)
        predictions_val = lr_model.predict(X_val_temp)

        train_acc = accuracy_score(y_train_temp,
                                   predictions_train)
        val_acc = accuracy_score(y_val_temp,
                                  predictions_val)
        scores_lr.append((val_acc, train_acc))

    # Compute average scores
    avg_val_acc_lr = np.sum([each[0]
                            for each in scores_lr]) / len(scores_lr)
    avg_train_acc_lr = np.sum([each[1]
                              for each in scores_lr]) / len(scores_lr)
    return avg_train_acc_lr, avg_val_acc_lr

def do_linear_r(X_train, y_train):
    scores_lr = []
    # Create a CV instance
    cv_lr = KFold(n_splits=5, shuffle=True, random_state=42)
    for train, val in cv_lr.split(X_train, y_train):
        X_train_temp = X_train[train]
        X_val_temp = X_train[val]
        y_train_temp = y_train[train]
        y_val_temp = y_train[val]

        # Define model
        lr_model = LinearRegression()
        lr_model.fit(X_train_temp, y_train_temp)

        # Predictions
        predictions_train = lr_model.predict(X_train_temp)
        predictions_val = lr_model.predict(X_val_temp)

        train_rmse = np.sqrt(mean_squared_error(y_train_temp,
                                                predictions_train))
        val_rmse = np.sqrt(mean_squared_error(y_val_temp,
                                            predictions_val))
        scores_lr.append((val_rmse, train_rmse))

    # Compute average scores
    avg_val_rmse_lr = np.sum([each[0]
                            for each in scores_lr]) / len(scores_lr)
    avg_train_rmse_lr = np.sum([each[1]
                                for each in scores_lr]) / len(scores_lr)
    
    return avg_train_rmse_lr, avg_val_rmse_lr

def do_xgb_classifier(X_train, y_train):
    # Hyperparameter tuning XGBoost
    # Parameters to tune in the grid
    params_to_test = {
        'max_depth': range(3,10,2),
        'min_child_weight': range(1,6,2),
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.1, 0.01, 0.001]
    }

    # Performing gridSearch manually
    best_xgb_scores = None
    best_xgb_model = None
    curr_val_score = -1
    for depth in params_to_test['max_depth']:
        for weight in params_to_test['min_child_weight']:
            for lr in params_to_test['learning_rate']:
                for estimator_count in params_to_test['n_estimators']:
                    # Create a CV instance
                    cv = StratifiedKFold(n_splits=5, shuffle=True,
                                         random_state=42)

                    # XGBoost model
                    xgb_model = XGBClassifier(n_estimators=estimator_count, max_depth=depth,
                                              min_child_weight=weight, early_stopping_rounds=10,
                                              learning_rate=lr)

                    # Add cross validation results
                    total_train_score = 0
                    total_val_score = 0
                    for train, val in cv.split(X_train, y_train):
                        X_train_temp = X_train[train]
                        X_val_temp = X_train[val]
                        y_train_temp = y_train[train]
                        y_val_temp = y_train[val]

                        est, train_score, test_score = fit_and_score(
                            clone(xgb_model), X_train_temp,
                            X_val_temp, y_train_temp,
                            y_val_temp
                        )
                        total_train_score += train_score
                        total_val_score += test_score

                        avg_val_score = total_val_score / 5
                        if avg_val_score > curr_val_score:
                            curr_val_score = avg_val_score
                            best_xgb_scores = (total_train_score / 5, avg_val_score)
                            best_xgb_model = est
    return best_xgb_scores, best_xgb_model

def do_xgb_regressor(X_train, y_train):
    # Hyperparameter tuning XGBoost
    # Parameters to tune in the grid
    params_to_test = {
        'max_depth': range(3,10,2),
        'min_child_weight': range(1,6,2),
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.1, 0.01, 0.001]
    }

    # Performing gridSearch manually
    best_xgb_scores = None
    best_xgb_model = None
    curr_val_score = float('inf')
    for depth in params_to_test['max_depth']:
        for weight in params_to_test['min_child_weight']:
            for lr in params_to_test['learning_rate']:
                for estimator_count in params_to_test['n_estimators']:
                    # Create a CV instance
                    cv = KFold(n_splits=5, shuffle=True, random_state=42)

                    # XGBoost model
                    xgb_model = XGBRegressor(n_estimators=estimator_count, max_depth=depth,
                                             min_child_weight=weight, early_stopping_rounds=10,
                                             learning_rate=lr)

                    # Add cross validation results
                    total_train_score = 0
                    total_val_score = 0
                    for train, val in cv.split(X_train, y_train):
                        X_train_temp = X_train[train]
                        X_val_temp = X_train[val]
                        y_train_temp = y_train[train]
                        y_val_temp = y_train[val]

                        est, train_score, test_score = fit_and_score(
                            clone(xgb_model), X_train_temp,
                            X_val_temp, y_train_temp,
                            y_val_temp, type="regression"
                        )
                        total_train_score += train_score
                        total_val_score += test_score

                        avg_val_score = total_val_score / 5
                        if avg_val_score < curr_val_score:
                            curr_val_score = avg_val_score
                            best_xgb_scores = (total_train_score / 5, avg_val_score)
                            best_xgb_model = est
    return best_xgb_scores, best_xgb_model

def do_rf_regressor(X_train, y_train):
    params_to_test = {
        'max_depth': range(3,10,2),
        'n_estimators': [100, 500, 1000],
        'max_features': [1.0, 'sqrt', 'log2']
    }
    rf_grid = GridSearchCV(RandomForestRegressor(random_state=42),
                           params_to_test, cv=5, scoring='neg_mean_squared_error',
                           return_train_score=True)

    rf_grid.fit(X_train, y_train)
    rf_train_score = np.sqrt(
        -rf_grid.cv_results_['mean_train_score'][rf_grid.best_index_])
    rf_val_score = np.sqrt(-rf_grid.best_score_)
    
    return rf_train_score, rf_val_score, rf_grid.best_estimator_

def do_rf_classifier(X_train, y_train):
    params_to_test = {
        'max_depth': range(3,10,2),
        'n_estimators': [100, 500, 1000],
        'max_features': [1.0, 'sqrt', 'log2']
    }
    rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                           params_to_test, cv=5,
                           return_train_score=True)

    rf_grid.fit(X_train, y_train)
    rf_train_score = rf_grid.cv_results_['mean_train_score'][rf_grid.best_index_]
    rf_val_score = rf_grid.best_score_
    
    return rf_train_score, rf_val_score, rf_grid.best_estimator_

#### Analysis 1

In [298]:
# Form data
first_data = full_data.copy()
first_data = full_data.drop(columns=['Max opening', 'Max closing',
                                     'Opening lateral right', 'Opening lateral left',
                                     'Closing lateral right', 'Closing lateral left',
                                     'Deviation opening', 'Deviation closing',
                                     'Deviation direction opening', 
                                     'Deviation direction closing',
                                     'Lateral deviation opening',
                                     'Lateral deviation closing']).copy()

X_a = first_data.drop(columns=["Deviation scenario"]).copy()
y_a_org = first_data["Deviation scenario"].copy()

# Get splits
X_train_a, X_test_a, y_train_a, y_test_a = get_splits(
    X_a, y_a_org
)

# Scale the data
scaler = StandardScaler()
X_train_a_scaled = scaler.fit_transform(X_train_a)
X_test_a_scaled = scaler.transform(X_test_a)

# Encode y
encoder = LabelEncoder()
y_train_a = encoder.fit_transform(y_train_a).copy()
y_test_a = encoder.transform(y_test_a).copy()

# Fit models
log_reg_results_a = do_logistic_r(X_train_a_scaled, y_train_a)
xgb_classifier_results_a = do_xgb_classifier(X_train_a_scaled,
                                             y_train_a)
rf_classifier_results_a = do_rf_classifier(X_train_a_scaled,
                                           y_train_a)

In [286]:
val_results_a = pd.DataFrame({
    "Multinomial Logistic Regression": [log_reg_results_a[0],
                                        log_reg_results_a[1]],
    "XGBoost Classifier": [xgb_classifier_results_a[0][0],
                           xgb_classifier_results_a[0][1]],
    "Random Forest Classifier": [rf_classifier_results_a[0],
                                 rf_classifier_results_a[1]]
})
val_results_a.index = ["Train Accuracy", "Validation Accuracy"]
val_results_a.to_csv("obj2_results/val_results_1.csv")

# Test results based on best model (XGBoost)
best_model_a = xgb_classifier_results_a[1]
test_predictions_a = best_model_a.predict(X_test_a_scaled)
test_accuracy_a = accuracy_score(y_test_a, test_predictions_a)
test_proba_a = best_model_a.predict_proba(X_test_a_scaled)
test_auc_a = roc_auc_score(y_test_a, test_proba_a,
                           multi_class='ovr')

test_results_a = pd.DataFrame({
    "Test Accuracy XGBoost": [test_accuracy_a],
})
test_results_a.to_csv("obj2_results/test_results_1.csv",
                      index=False)

ValueError: Found input variables with inconsistent numbers of samples: [0, 303]

#### Analysis 2

In [299]:
def second_analysis(target = "Deviation opening"):
    # Form data
    second_data = full_data.copy()
    second_data = full_data.drop(columns=['Max opening', 'Max closing',
                                         'Opening lateral right', 'Opening lateral left',
                                         'Closing lateral right', 'Closing lateral left',
                                         'Deviation direction opening', 
                                         'Deviation direction closing',
                                         'Lateral deviation opening',
                                         'Lateral deviation closing']).copy()

    X_b = second_data.drop(columns=["Deviation opening",
                                    "Deviation closing"]).copy()
    y_b_org = second_data[target].copy()

    # Get splits
    X_train_b, X_test_b, y_train_b, y_test_b = get_splits(
        X_b, y_b_org
    )

    # Scale and create dummies
    scaler = StandardScaler()
    X_train_b_num = X_train_b.drop(columns=["Deviation scenario"]).copy()
    X_train_b_cat = X_train_b["Deviation scenario"].copy()
    X_test_b_num = X_test_b.drop(columns=["Deviation scenario"]).copy()
    X_test_b_cat = X_test_b["Deviation scenario"].copy()

    X_train_b_num_scaled = scaler.fit_transform(X_train_b_num)
    X_test_b_num_scaled = scaler.transform(X_test_b_num)
    X_train_b_cat_encoded = pd.get_dummies(X_train_b_cat,
                                        dtype=int)
    X_test_b_cat_encoded = pd.get_dummies(X_test_b_cat,
                                        dtype=int)
    X_train_b_final = np.concatenate([X_train_b_num_scaled,
                                     X_train_b_cat_encoded.values],
                                     axis=1)
    X_test_b_final = np.concatenate([X_test_b_num_scaled,
                                     X_test_b_cat_encoded],
                                     axis=1)

    # Encode y
    encoder = LabelEncoder()
    y_train_b = encoder.fit_transform(y_train_b).copy()
    y_test_b = encoder.transform(y_test_b).copy()

    # Fit models
    log_reg_results_b = do_logistic_r(X_train_b_final, y_train_b)
    xgb_classifier_results_b = do_xgb_classifier(X_train_b_final,
                                                 y_train_b)
    rf_classifier_results_b = do_rf_classifier(X_train_b_final,
                                               y_train_b)
    
    val_results_b = pd.DataFrame({
        "Multinomial Logistic Regression": [log_reg_results_b[0],
                                            log_reg_results_b[1]],
        "XGBoost Classifier": [xgb_classifier_results_b[0][0],
                                xgb_classifier_results_b[0][1]],
        "Random Forest Classifier": [rf_classifier_results_b[0],
                                     rf_classifier_results_b[1]]
    })
    val_results_b.index = ["Train Accuracy", "Validation Accuracy"]
    val_results_b.to_csv(f"obj2_results/val_results_2_{target}.csv")
    
    return (log_reg_results_b,
            xgb_classifier_results_b,
            rf_classifier_results_b,
            X_test_b_final, y_test_b)

val_results_b_opening = second_analysis()
val_results_b_closing = second_analysis(
    target="Deviation closing")

KeyboardInterrupt: 

In [274]:
# Test results based on best model (XGBoost)
best_model_b_opening = val_results_b_opening[1][1]
test_predictions_b_opening = best_model_b_opening.predict(
    val_results_b_opening[3])
test_accuracy_b_opening = accuracy_score(val_results_b_opening[4],
                                         test_predictions_b_opening)

test_results_b_opening = pd.DataFrame({
    "Test Accuracy XGBoost": [test_accuracy_b_opening],
})
test_results_b_opening.to_csv("obj2_results/test_results_2_deviation_opening.csv",
                              index=False)

# Test results based on best model (XGBoost)
best_model_b_closing = val_results_b_closing[1][1]
test_predictions_b_closing = best_model_b_closing.predict(
    val_results_b_closing[3])
test_accuracy_b_closing = accuracy_score(val_results_b_closing[4],
                                         test_predictions_b_closing)

test_results_b_closing = pd.DataFrame({
    "Test Accuracy XGBoost": [test_accuracy_b_closing],
})
test_results_b_closing.to_csv("obj2_results/test_results_2_deviation_closing.csv",
                              index=False)

#### Analysis 3

In [None]:
def third_analysis(target = "Deviation direction opening"):
    # Form data
    second_data = full_data.copy()
    second_data = full_data.drop(columns=['Max opening', 'Max closing',
                                         'Opening lateral right', 'Opening lateral left',
                                         'Closing lateral right', 'Closing lateral left',
                                         'Lateral deviation opening',
                                         'Lateral deviation closing']).copy()

    X_b = second_data.drop(columns=["Deviation direction opening",
                                    "Deviation direction closing"]).copy()
    y_b_org = second_data[target].copy()

    # Get splits
    X_train_b, X_test_b, y_train_b, y_test_b = get_splits(
        X_b, y_b_org
    )

    # Scale and create dummies
    scaler = StandardScaler()
    X_train_b_num = X_train_b.drop(columns=["Deviation scenario",
                                            "Deviation opening",
                                            "Deviation closing"]).copy()
    X_train_b_cat = X_train_b[["Deviation scenario",
                               "Deviation opening",
                               "Deviation closing"]].copy()
    X_test_b_num = X_test_b.drop(columns=["Deviation scenario",
                                          "Deviation opening",
                                          "Deviation closing"]).copy()
    X_test_b_cat = X_test_b[["Deviation scenario",
                             "Deviation opening",
                             "Deviation closing"]].copy()

    X_train_b_num_scaled = scaler.fit_transform(X_train_b_num)
    X_test_b_num_scaled = scaler.transform(X_test_b_num)
    X_train_b_cat_encoded = pd.get_dummies(X_train_b_cat,
                                        dtype=int)
    X_test_b_cat_encoded = pd.get_dummies(X_test_b_cat,
                                        dtype=int)
    X_train_b_final = np.concatenate([X_train_b_num_scaled,
                                     X_train_b_cat_encoded.values],
                                     axis=1)
    X_test_b_final = np.concatenate([X_test_b_num_scaled,
                                     X_test_b_cat_encoded],
                                     axis=1)

    # Encode y
    encoder = LabelEncoder()
    y_train_b = encoder.fit_transform(y_train_b).copy()
    y_test_b = encoder.transform(y_test_b).copy()

    # Fit models
    log_reg_results_b = do_logistic_r(X_train_b_final, y_train_b)
    xgb_classifier_results_b = do_xgb_classifier(X_train_b_final,
                                                 y_train_b)
    rf_classifier_results_b = do_rf_classifier(X_train_b_final,
                                               y_train_b)
    
    val_results_b = pd.DataFrame({
        "Multinomial Logistic Regression": [log_reg_results_b[0],
                                            log_reg_results_b[1]],
        "XGBoost Classifier": [xgb_classifier_results_b[0][0],
                                xgb_classifier_results_b[0][1]],
        "Random Forest Classifier": [rf_classifier_results_b[0],
                                     rf_classifier_results_b[1]]
    })
    val_results_b.index = ["Train Accuracy", "Validation Accuracy"]
    val_results_b.to_csv(f"obj2_results/val_results_3_{target}.csv")
    
    return (log_reg_results_b,
            xgb_classifier_results_b,
            rf_classifier_results_b,
            X_test_b_final, y_test_b)

val_results_c_opening = third_analysis()
val_results_c_closing = third_analysis(
    target="Deviation direction closing")

right    838
no       624
left      53
Name: Deviation direction opening, dtype: int64
right    168
no       125
left      10
Name: Deviation direction opening, dtype: int64
right    1013
no        260
left      242
Name: Deviation direction closing, dtype: int64
right    203
no        52
left      48
Name: Deviation direction closing, dtype: int64


In [270]:
# Test results based on best model (XGBoost)
best_model_c_opening = val_results_c_opening[1][1]
test_predictions_c_opening = best_model_c_opening.predict(
    val_results_c_opening[3])
test_accuracy_c_opening = accuracy_score(val_results_c_opening[4],
                                         test_predictions_c_opening)

test_results_c_opening = pd.DataFrame({
    "Test Accuracy XGBoost": [test_accuracy_c_opening],
})
test_results_c_opening.to_csv("obj2_results/test_results_3_deviation_direction_opening.csv",
                              index=False)

# Test results based on best model (XGBoost)
best_model_c_closing = val_results_c_closing[1][1]
test_predictions_c_closing = best_model_c_closing.predict(
    val_results_c_closing[3])
test_accuracy_c_closing = accuracy_score(val_results_c_closing[4],
                                         test_predictions_c_closing)

test_results_c_closing = pd.DataFrame({
    "Test Accuracy XGBoost": [test_accuracy_c_closing],
})
test_results_c_closing.to_csv("obj2_results/test_results_3_deviation_direction_closing.csv",
                              index=False)


#### Final analysis

In [None]:
def final_analysis(target = "Lateral deviation opening"):
    # Form data
    second_data = full_data.copy()
    second_data = full_data.drop(columns=['Max opening', 'Max closing',
                                         'Opening lateral right', 'Opening lateral left',
                                         'Closing lateral right', 'Closing lateral left']).copy()

    X_b = second_data.drop(columns=["Lateral deviation opening",
                                    "Lateral deviation closing"]).copy()
    y_b_org = second_data[target].copy()

    # Get splits
    X_train_b, X_test_b, y_train_b, y_test_b = get_splits(
        X_b, y_b_org, type="regression"
    )

    # Scale and create dummies
    scaler = StandardScaler()
    X_train_b_num = X_train_b.drop(columns=["Deviation scenario",
                                            "Deviation opening",
                                            "Deviation closing",
                                            "Deviation direction opening",
                                            "Deviation direction closing"
                                            ]).copy()
    X_train_b_cat = X_train_b[["Deviation scenario",
                               "Deviation opening",
                               "Deviation closing",
                               "Deviation direction opening",
                               "Deviation direction closing"
                               ]].copy()
    X_test_b_num = X_test_b.drop(columns=["Deviation scenario",
                                          "Deviation opening",
                                          "Deviation closing",
                                          "Deviation direction opening",
                                          "Deviation direction closing"
                                         ]).copy()
    X_test_b_cat = X_test_b[["Deviation scenario",
                             "Deviation opening",
                             "Deviation closing",
                             "Deviation direction opening",
                             "Deviation direction closing"
                            ]].copy()

    X_train_b_num_scaled = scaler.fit_transform(X_train_b_num)
    X_test_b_num_scaled = scaler.transform(X_test_b_num)
    X_train_b_cat_encoded = pd.get_dummies(X_train_b_cat,
                                        dtype=int)
    X_test_b_cat_encoded = pd.get_dummies(X_test_b_cat,
                                        dtype=int)
    X_train_b_final = np.concatenate([X_train_b_num_scaled,
                                     X_train_b_cat_encoded.values],
                                     axis=1)
    X_test_b_final = np.concatenate([X_test_b_num_scaled,
                                     X_test_b_cat_encoded],
                                     axis=1)

    # Fit models
    lin_reg_results_b = do_linear_r(X_train_b_final, y_train_b)
    xgb_regressor_results_b = do_xgb_regressor(X_train_b_final,
                                               y_train_b)
    rf_regressor_results_b = do_rf_regressor(X_train_b_final,
                                             y_train_b)
    
    val_results_b = pd.DataFrame({
        "Multinomial Logistic Regression": [lin_reg_results_b[0],
                                            lin_reg_results_b[1]],
        "XGBoost Classifier": [xgb_regressor_results_b[0][0],
                                xgb_regressor_results_b[0][1]],
        "Random Forest Classifier": [rf_regressor_results_b[0],
                                     rf_regressor_results_b[1]]
    })
    val_results_b.index = ["Train RMSE", "Validation RMSE"]
    val_results_b.to_csv(f"obj2_results/val_results_final_{target}.csv")
    
    return (lin_reg_results_b,
            xgb_regressor_results_b,
            rf_regressor_results_b,
            X_test_b_final, y_test_b)

val_results_final_opening = final_analysis()
val_results_final_closing = final_analysis(
    target="Lateral deviation closing")

0.0    624
0.1    163
0.2    105
0.3     44
2.2     39
2.4     37
2.3     30
2.7     30
2.1     27
1.9     23
2.5     22
2.6     22
1.7     21
2.8     21
2.9     19
2.0     18
3.1     18
3.7     17
3.3     16
1.8     15
3.5     15
3.2     13
3.6     13
0.4     12
3.0     12
1.3     12
3.4     11
1.0     10
0.8     10
0.5      9
3.9      9
1.5      9
3.8      7
1.1      7
1.2      6
1.6      6
4.3      6
0.9      6
0.7      5
1.4      5
0.6      5
4.0      4
4.1      3
4.6      2
4.8      2
5.8      1
4.2      1
4.4      1
4.9      1
6.4      1
Name: Lateral deviation opening, dtype: int64
0.0    118
0.1     31
0.2     19
2.2     14
2.4      9
1.9      9
0.3      9
2.7      8
2.3      7
2.6      7
2.0      6
2.1      6
2.8      5
1.8      5
2.5      4
3.1      4
2.9      4
3.7      4
0.4      3
1.1      3
3.5      3
1.3      3
3.9      2
3.3      2
3.2      2
1.7      2
3.0      2
0.9      1
4.3      1
1.5      1
1.6      1
4.8      1
1.4      1
0.5      1
3.4      1
0.6      1
3.6     

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
# Test results based on best model (XGBoost)
best_model_final_opening = val_results_final_opening[1][1]
test_predictions_final_opening = best_model_final_opening.predict(
    val_results_final_opening[3])

# Performance metrics
test_mae_final_opening = mean_absolute_error(val_results_final_opening[4], 
                                             test_predictions_final_opening)
test_rmse_final_opening = np.sqrt(mean_squared_error(
    val_results_final_opening[4],
    test_predictions_final_opening))

test_results_final_opening = pd.DataFrame({
    "Test RMSE XGBoost": [test_rmse_final_opening],
    "Test MAE XGBoost": [test_mae_final_opening]
})
test_results_final_opening.to_csv("obj2_results/test_results_final_lateral_deviation_opening.csv",
                                  index=False)

# Test results based on best model (XGBoost)
best_model_final_closing = val_results_final_closing[1][1]
test_predictions_final_closing = best_model_final_closing.predict(
    val_results_final_closing[3])

# Performance metrics
test_mae_final_closing = mean_absolute_error(val_results_final_closing[4], 
                                             test_predictions_final_closing)
test_rmse_final_closing = np.sqrt(mean_squared_error(
    val_results_final_closing[4],
    test_predictions_final_closing))

test_results_final_closing = pd.DataFrame({
    "Test RMSE XGBoost": [test_rmse_final_closing],
    "Test MAE XGBoost": [test_mae_final_closing]
})
test_results_final_closing.to_csv("obj2_results/test_results_final_lateral_deviation_closing.csv",
                                  index=False)

