<a href="https://colab.research.google.com/github/TomasCajan/DataScience/blob/main/P4_Machine_Alignment_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Welcome to my project Machine Alignment Prediction v2**

In this particular one, the goal is to re-make the previously completed case study and push it to the limits.
Main milestones are following : 

1.   Introduce a more meaningfull and reproductible Feature Selection process
2.   Automate the Data Mining process as much as possible
3.   Improve model interpretability of selected models and EDA of feature relations. 
4.   Beat the scores of previously made predictions






In [1]:
!wget https://raw.githubusercontent.com/TomasCajan/DataScience/main/Data/train_target.csv
!wget https://raw.githubusercontent.com/TomasCajan/DataScience/main/Data/train_query.csv
!wget https://raw.githubusercontent.com/TomasCajan/DataScience/main/Data/test_target.csv
!wget https://raw.githubusercontent.com/TomasCajan/DataScience/main/Data/test_query.csv

--2023-06-07 06:08:48--  https://raw.githubusercontent.com/TomasCajan/DataScience/main/Data/train_target.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27107 (26K) [text/plain]
Saving to: ‘train_target.csv’


2023-06-07 06:08:49 (12.9 MB/s) - ‘train_target.csv’ saved [27107/27107]

--2023-06-07 06:08:49--  https://raw.githubusercontent.com/TomasCajan/DataScience/main/Data/train_query.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27098 (26K) [text/plain]
Saving to: ‘train_query.csv’


2023-06-07 06:08:49 (28.6 MB/s) - ‘t

In [2]:
!pip install --quiet shap
!pip install --quiet feature_engine
!pip install --quiet plotly --upgrade

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.6/572.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.4/319.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Load data

import pandas as pd
train_target = pd.read_csv('train_target.csv')
train_query = pd.read_csv('train_query.csv')
test_target = pd.read_csv('test_target.csv')
test_query = pd.read_csv('test_query.csv')

In [4]:
train_target.iloc[:17,:]

Unnamed: 0,ID,Alignment,Parameter,Value
0,2,Instrument.Alignments.Optics.MdTLDAlign,X,-0.08125
1,2,Instrument.Alignments.Optics.AlignCorrAngleUHR,[0.004;2000].Y,4.9e-05
2,2,Instrument.Alignments.Optics.CorrStigUppYUHR,[0.002;1000].X,-0.006568
3,2,Instrument.Alignments.Optics.MdSShiftTube3,Y,-47.505291
4,2,Instrument.Alignments.Optics.ShiftCorrAngleUHR,[1000].X,-0.000688
5,2,Instrument.Alignments.Optics.MdSImRotCheb1,,0.3589
6,2,Instrument.Alignments.Optics.MdSDeflACF,[0.001].X,0.127467
7,2,Instrument.Alignments.Optics.MdDeflACF,Y,0.134068
8,2,Instrument.Alignments.Optics.MdACRotUpp,Y,-0.00098
9,2,Instrument.Alignments.Optics.MdCompUHRRem,,-5.0


In [5]:
# Parse data into more efficient shape for Pandas

train_target = train_target.pivot(index='ID', columns='Alignment', values='Value')
train_query = train_query.pivot(index='ID', columns='Alignment', values='Value')
test_target = test_target.pivot(index='ID', columns='Alignment', values='Value')
test_query = test_query.pivot(index='ID', columns='Alignment', values='Value')

In [6]:
# Clean up shared column names for better visual appeal

common_col_names = ['Instrument.Alignments.Optics.', 'Instrument.Alignments.Detectors.']
new_columns = train_target.columns.copy()
for common_col_name in common_col_names:
  new_columns = [col.replace(common_col_name, '') for col in new_columns]

train_target.columns = new_columns
train_query.columns = new_columns
test_target.columns = new_columns
test_query.columns = new_columns

train_target.columns

Index(['TLD.Push', 'AlignCorrAngleUHR', 'CorrStigUppYUHR', 'MdACRotLow',
       'MdACRotUpp', 'MdCompUHRRem', 'MdDeflACF', 'MdDeflDC1', 'MdHRSatPar1',
       'MdHRpar4', 'MdSDeflACF', 'MdSImRotCheb0', 'MdSImRotCheb1',
       'MdSShiftTube3', 'MdTLDAlign', 'ShiftCorrAngleUHR'],
      dtype='object')

In [7]:
test_target.shape

(12, 16)

In [8]:
# imports
import numpy as np
import copy

# Pre Processing
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler
from scipy.stats import skew 

# Evaluation
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, mean_gamma_deviance
from scipy import stats

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Visualization
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px 
import plotly.figure_factory as ff
import plotly.subplots as sp
from plotly.subplots import make_subplots
from sklearn.inspection import permutation_importance
import shap
import networkx as nx
from IPython.display import display
import ipywidgets as widgets
from ipywidgets import GridspecLayout

# Feature selection
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from sklearn.manifold import TSNE
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from feature_engine.selection import SelectByShuffling
from sklearn.decomposition import PCA
from sklearn.neighbors import kneighbors_graph

from feature_engine.selection import (
    RecursiveFeatureElimination,
    DropConstantFeatures,
    DropDuplicateFeatures,
)

import warnings
warnings.filterwarnings('ignore')

In [49]:
# Prepare main dataframe for work
def feature_prep(before: pd.DataFrame, after: pd.DataFrame, target_feature: str) -> pd.DataFrame:

    target = after[[target_feature]].rename(columns={target_feature: f'{target_feature}_target'})
    merged = pd.concat([before, target], axis=1)
    merged = merged.reset_index()
    merged = merged.drop('ID', axis=1)

    return merged

# Feature selection - Correlation based methods
def featEngineCorr(method, threshold, model, scoring, X, y, Xtest):

  sel = SmartCorrelatedSelection(
      variables=None,
      method=method,
      threshold=threshold,
      missing_values="raise",
      selection_method="model_performance",
      estimator=model,
      scoring=scoring,
      cv=3,
  )

  sel.fit(X, y)
  feature_grp = sel.correlated_feature_sets_
  feature_transformed = sel.transform(X)
  feature_kept = feature_transformed.columns
  feature_transformed_test = Xtest.loc[:,feature_kept]

  return feature_transformed, feature_kept, feature_transformed_test

def corData(dataframe, target_feature, mth):

    corr_d = dataframe.corr(method = mth)[[target_feature]]
    return corr_d

# Feature selection - PCA
def fsPCA(X, Xtest, strategy):

  numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns

  if strategy == "MinMax":
    scaler = MinMaxScaler()
    df_scaled = scaler.fit_transform(X)
    df_scaled_test = scaler.fit_transform(Xtest)
    X_scaled_df = pd.DataFrame(df_scaled, columns=numerical_columns)
    X_scaled_df_test = pd.DataFrame(df_scaled_test, columns=numerical_columns)

  if strategy == "Mean_norm":
    scaler_mean_train = StandardScaler(with_mean = True, with_std = False)
    scaler_mean_test = StandardScaler(with_mean = True, with_std = False)
    scaler_minmax_train = RobustScaler(with_centering = False, with_scaling = True, quantile_range = (0,100))
    scaler_minmax_test = RobustScaler(with_centering = False, with_scaling = True, quantile_range = (0,100))
    scaler_mean_train.fit(X)
    scaler_minmax_train.fit(X)
    scaler_mean_test.fit(Xtest)
    scaler_minmax_test.fit(Xtest)
    df_scaled = scaler_minmax_train.transform(scaler_mean_train.transform(X))
    df_scaled_test = scaler_minmax_test.transform(scaler_mean_test.transform(Xtest))
    X_scaled_df = pd.DataFrame(df_scaled, columns=numerical_columns)
    X_scaled_df_test = pd.DataFrame(df_scaled_test, columns=numerical_columns)

  if strategy == "MMS":
    scaler_mean_train = StandardScaler(with_mean = True, with_std = False)
    scaler_mean_test = StandardScaler(with_mean = True, with_std = False)
    scaler_minmax_train = MinMaxScaler()
    scaler_minmax_test = MinMaxScaler()
    scaler_mean_train.fit(X)
    scaler_minmax_train.fit(X)
    scaler_mean_test.fit(Xtest)
    scaler_minmax_test.fit(Xtest)
    df_scaled  = scaler_mean_train.transform(scaler_minmax_train.transform(X))
    df_scaled_test = scaler_mean_test.transform(scaler_minmax_test.transform(Xtest))
    X_scaled_df = pd.DataFrame(df_scaled, columns=numerical_columns)
    X_scaled_df_test = pd.DataFrame(df_scaled_test, columns=numerical_columns)

  pca = PCA()
  X_pca = pca.fit_transform(X_scaled_df)

  explained_variance_ratio = pca.explained_variance_ratio_
  cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

  threshold_variance = 0.95 
  num_components = np.argmax(cumulative_variance_ratio >= threshold_variance) + 1

  selected_components = X_pca[:, :num_components]
  selected_components_df = pd.DataFrame(selected_components, columns=X.columns[:num_components])
  feature_kept = selected_components_df.columns
  feature_transformed_test = X_scaled_df_test.loc[:,feature_kept]

  return selected_components_df , feature_kept, feature_transformed_test

# Feature selection - Laplacian method
def fsLaplacian(X, y, Xtest):

  X_array = X.values
  y_array = y.values
  k = 6

  graph = kneighbors_graph(X_array, n_neighbors=k)
  laplacian_selector = SelectKBest(score_func=f_regression, k=k)
  laplacian_selector.fit(X_array, y_array)

  selected_features = laplacian_selector.get_support()
  selected_feature_names = X.columns[selected_features]

  df_laplacian = X.loc[:,selected_feature_names]
  df_laplacian_test = Xtest.loc[:,selected_feature_names]

  return df_laplacian, selected_feature_names, df_laplacian_test

# Feature selection - Statistical methods
def statsFs(methd, X, y, Xtest):

  if methd == "mi":
    selector = mutual_info_regression(X, y, random_state= 101)
    selector = pd.Series(selector)
    sel_ = SelectPercentile(mutual_info_regression, percentile=20).fit(X, y)
    feature_kept = X.columns[sel_.get_support()]
    feature_transformed = sel_.transform(X)
    feature_transformed_df = pd.DataFrame(feature_transformed)
    feature_transformed_test = Xtest.loc[:,feature_kept]

  elif methd == "anova":
    selector = f_regression(X, y)
    selector = pd.Series(selector[1])
    sel_ = SelectPercentile(f_regression, percentile=25).fit(X, y)
    feature_kept = X.columns[sel_.get_support()]
    feature_transformed = sel_.transform(X)
    feature_transformed_df = pd.DataFrame(feature_transformed)
    feature_transformed_test = Xtest.loc[:,feature_kept]

  return feature_transformed_df, feature_kept, feature_transformed_test

# Feature selection - Stepwise methods
def sfsMethods(X, y, Xtest, direction, num_sel, model, scoring):

  sfs = SFS(estimator = XGBRegressor(max_depth =3,
                       n_estimators = 250,
                       learning_rate = 0.1,
                       reg_alpha = 0.0001,
                       random_state=42),
    n_features_to_select=num_sel,
    tol=None, 
    direction=direction,
    scoring=scoring,
    cv=2,
    n_jobs=-1,
  )

  selector = sfs.fit(X, y)
  feature_kept = sfs.get_feature_names_out()
  feature_transformed = selector.transform(X)
  feature_transformed_df = pd.DataFrame(feature_transformed)
  feature_transformed_test = Xtest.loc[:,feature_kept]

  return feature_transformed_df, feature_kept, feature_transformed_test

# Feature selection - Recursive Feature Elimination
def rfeMethods(X, y, Xtest, model, scoring):

  selector = RFECV(estimator = model, step=1, cv=2, scoring = scoring)
  selector = selector.fit(X, y)
  feature_kept = selector.get_feature_names_out()
  feature_transformed = selector.transform(X)
  feature_transformed_df = pd.DataFrame(feature_transformed)
  feature_transformed_test = Xtest.loc[:,feature_kept]

  return feature_transformed_df, feature_kept, feature_transformed_test

# Feature selection - Shuffle
def shuffleMethods(X, y, Xtest, model, scoring):

  sel = SelectByShuffling(
      variables=None, 
      estimator=model,
      scoring=scoring, 
      threshold=0.99, 
      cv=3,
      random_state=42
  )

  sel.fit(X, y)
  feature_transformed = sel.transform(X)
  feature_kept = feature_transformed.columns
  feature_transformed_test = Xtest.loc[:,feature_kept]

  if feature_transformed.empty:
    feature_transformed = X
    feature_kept = X.columns
    feature_transformed_test = Xtest

  return feature_transformed, feature_kept, feature_transformed_test

# Display function - Scatterplot predictors vs target
def displayFeaturesScatters(X, base_df, target_name):

  fig = make_subplots(rows=4, cols=4, subplot_titles=X.columns.tolist())
  fig.update_layout(title=f'Scatterplot: Predictor Features vs {target_name}', width=800, height=800)
  targ_name = base_df.columns[-1]

  for i, col in enumerate(X.columns):
      fig.add_trace(
          go.Scatter(x=X[col], y=base_df[targ_name], mode='markers', showlegend=False),
          row=(i//4)+1, col=(i%4)+1
      )

  fig.show()

# Display function - Feature Selection Results
def displayFeatureSelection(X, pearson, spearman, kendall, mi, anova, sffe, sbfe, rfe, shuffle, pca, laplacian, target_name):

  new_df = pd.DataFrame(columns=X.columns)
  for i in range(6):
      new_df.loc[i] = X.columns

  fig = make_subplots(rows=1, cols=11, specs=[[{"type": "table"}, {"type": "table"}, {"type": "table"},{"type": "table"},{"type": "table"},{"type": "table"},{"type": "table"},{"type": "table"},{"type": "table"},{"type": "table"} ,{"type": "table"}]])

  fig.add_trace(go.Table(header=dict(values=['Pearson'],fill_color='lightgrey',font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in pearson else 'red' for col in new_df.columns],],
                                            align='left',font=dict(color='black', size=12),height=30)),row=1,col=1)

  fig.add_trace(go.Table(header=dict(values=['Spearman'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in spearman else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=2)

  fig.add_trace(go.Table(header=dict(values=['Kendall'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in kendall else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=3)

  fig.add_trace(go.Table(header=dict(values=['MutualInformation'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in mi else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=4)

  fig.add_trace(go.Table(header=dict(values=['ANOVA'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in anova else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=5)

  fig.add_trace(go.Table(header=dict(values=['StepForward FS'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in sffe else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=6)

  fig.add_trace(go.Table(header=dict(values=['StepBackward FS'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in sbfe else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=7)

  fig.add_trace(go.Table(header=dict(values=['RFE'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in rfe else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=8)

  fig.add_trace(go.Table(header=dict(values=['Shuffling'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in shuffle else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=9)
  fig.add_trace(go.Table(header=dict(values=['PCA'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in pca else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=10)
  fig.add_trace(go.Table(header=dict(values=['Laplacian'], fill_color='lightgrey', font=dict(color='black', size=12),align='center'),
                                cells=dict(values=[new_df.iloc[0].tolist(),],
                                            fill_color=[['green' if col in laplacian else 'red' for col in new_df.columns],],
                                            align='left', font=dict(color='black', size=12),height=30)),row=1,col=11)

  fig.update_layout(width=1900, height=700,title={'text': f"Feature Selection Result : {target_name}"})
  fig.show()

# Function for Scaling
def scaleFeatures(X_data, X_data_test, strategy):

  if strategy == "MinMax":
    scaler_train = MinMaxScaler()
    scaler_test = MinMaxScaler()
    df_scaled = scaler_train.fit_transform(X_data)
    df_scaled_test = scaler_test.fit_transform(X_data_test)

  if strategy == "Mean_norm":
    scaler_mean_train = StandardScaler(with_mean = True, with_std = False)
    scaler_mean_test = StandardScaler(with_mean = True, with_std = False)
    scaler_minmax_train = RobustScaler(with_centering = False, with_scaling = True, quantile_range = (0,100))
    scaler_minmax_test = RobustScaler(with_centering = False, with_scaling = True, quantile_range = (0,100))
    scaler_mean_train.fit(X_data)
    scaler_minmax_train.fit(X_data)
    scaler_mean_test.fit(X_data_test)
    scaler_minmax_test.fit(X_data_test)
    df_scaled = scaler_minmax_train.transform(scaler_mean_train.transform(X_data))
    df_scaled_test = scaler_minmax_test.transform(scaler_mean_test.transform(X_data_test))

  if strategy == "MMS":
    scaler_mean_train = StandardScaler(with_mean = True, with_std = False)
    scaler_mean_test = StandardScaler(with_mean = True, with_std = False)
    scaler_minmax_train = MinMaxScaler()
    scaler_minmax_test = MinMaxScaler()
    scaler_mean_train.fit(X_data)
    scaler_minmax_train.fit(X_data)
    scaler_mean_test.fit(X_data_test)
    scaler_minmax_test.fit(X_data_test)
    df_scaled = scaler_mean_train.transform(scaler_minmax_train.transform(X_data))
    df_scaled_test = scaler_mean_test.transform(scaler_minmax_test.transform(X_data_test))

  return df_scaled, df_scaled_test

# Model training with Grid Search and Cross Validation
def trainModels(X, y, metric, y_test, x_test, main_model):

    warnings.filterwarnings('ignore')

    cv = 3
    if metric == "RMSE":
      scoring_metric = 'neg_root_mean_squared_error'
    elif metric == "R2":
      scoring_metric = "r2"
    elif metric == "MAE":
      scoring_metric = "neg_mean_absolute_error"
    elif metric == "MeAE":
      scoring_metric = "neg_median_absolute_error"

    if main_model == "RF":

      model_ = RandomForestRegressor(random_state=42)
      params_grid = {
          'n_estimators': [100, 250, 300],
          'max_depth': [ 5, 6, 7],
          'min_samples_split': [1, 3, 5],
          'min_samples_leaf': [1, 2, 3]
      }

    elif main_model == "XGB":

      model_  = XGBRegressor(random_state=42)
      params_grid = {
          'max_depth': [3, 6],
          'learning_rate': [0.005, 0.04, 0.08, 0.2],
          'n_estimators': [ 350],
          'reg_alpha': [0.0001, 0],
          'reg_lambda': [0.1000006, 0],
          'min_child_weight': [1, 3]
      }

    elif main_model == "XGB+":

      model_  = XGBRegressor(random_state=42)
      params_grid = {
          'max_depth': [7],
          'learning_rate': [ 0.08],
          'n_estimators': [ 250],
          'reg_lambda': [0.1000006],
          'subsample': [ 0.9],
          'objective': ['reg:squarederror'],
          'colsample_bytree': [0.6] 

      }

    grid_search = GridSearchCV(estimator=model_ , param_grid=params_grid, cv=cv, scoring=scoring_metric)
    grid_search.fit(X, y)

    best_score = np.abs(grid_search.best_score_)
    best_params = grid_search.best_params_

    if main_model == "XGB" or "XGB+":
      trained_model = XGBRegressor(random_state=42,**best_params)
    elif main_model == "RF":
      trained_model = RandomForestRegressor(random_state=42,**best_params)

    trained_model.fit(X, y)
    y_pred = trained_model.predict(x_test)
    if metric == "RMSE":
      score = np.sqrt(mean_squared_error(y_test, y_pred))
    elif metric == "R2":
      score = r2_score(y_test, y_pred)
    elif metric == "MAE":
      score = mean_absolute_error(y_test, y_pred)
    elif metric == "MeAE":
      score = median_absolute_error(y_test, y_pred)
    elif metric == "MGD":
      score = mean_gamma_deviance(y_test, y_pred)

    return best_score, best_params, y_pred, score, trained_model, X

# Function for tabular scoring of ML model performance
def score_table_eval(testq, testt,target_name, ypearson, yspearman, ykendall, ymi, yanova, ysffe, ysbfe, yrfe, yshuffle, ypca, ylaplacian, useLinPred):

  df_eval = pd.DataFrame()
  df_eval['Test_Query'] = testq[target_name]
  df_eval['ID_ref'] = df_eval.index.astype(str)
  df_eval['Test_Target'] = testt[target_name]

  tt_mean = df_eval['Test_Target'].median()
  ymeanpred = [tt_mean]*12
  # ymeanpred[9] = tt_mean + 0.001

  df_eval['Pred_Start_Pearson'] = ypearson
  df_eval['Pred_Start_Spearman'] = yspearman
  df_eval['Pred_Start_Kendall'] = ykendall
  df_eval['Pred_Start_MutualInfo'] = ymi
  df_eval['Pred_Start_ANOVA'] = yanova
  df_eval['Pred_Start_Sffe'] = ysffe
  df_eval['Pred_Start_Sbfe'] = ysbfe
  df_eval['Pred_Start_Rfe'] = yrfe
  df_eval['Pred_Start_Shuffle'] = yshuffle

  
  if useLinPred == True:
    df_eval['Pred_Start_PCA'] = ymeanpred
  elif useLinPred == False:
    df_eval['Pred_Start_PCA'] = ypca

  df_eval['Pred_Start_Laplacian'] = ylaplacian

  df_eval['MeasAlgnWrk'] = df_eval.apply(lambda row: abs(row['Test_Query'] - row['Test_Target']) if row['Test_Query'] < row['Test_Target'] else abs(row['Test_Target'] - row['Test_Query']), axis=1)
  df_eval['PredAlgnWrk_Pearson'] = df_eval.apply(lambda row: abs(row['Pred_Start_Pearson'] - row['Test_Target']) if row['Pred_Start_Pearson'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Pearson']), axis=1)
  df_eval['PredAlgnWrk_Spearman'] = df_eval.apply(lambda row: abs(row['Pred_Start_Spearman'] - row['Test_Target']) if row['Pred_Start_Spearman'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Spearman']), axis=1)
  df_eval['PredAlgnWrk_Kendall'] = df_eval.apply(lambda row: abs(row['Pred_Start_Kendall'] - row['Test_Target']) if row['Pred_Start_Kendall'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Kendall']), axis=1)
  df_eval['PredAlgnWrk_MutualInfo'] = df_eval.apply(lambda row: abs(row['Pred_Start_MutualInfo'] - row['Test_Target']) if row['Pred_Start_MutualInfo'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_MutualInfo']), axis=1)
  df_eval['PredAlgnWrk_ANOVA'] = df_eval.apply(lambda row: abs(row['Pred_Start_ANOVA'] - row['Test_Target']) if row['Pred_Start_ANOVA'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_ANOVA']), axis=1)
  df_eval['PredAlgnWrk_Sffe'] = df_eval.apply(lambda row: abs(row['Pred_Start_Sffe'] - row['Test_Target']) if row['Pred_Start_Sffe'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Sffe']), axis=1)
  df_eval['PredAlgnWrk_Sbfe'] = df_eval.apply(lambda row: abs(row['Pred_Start_Sbfe'] - row['Test_Target']) if row['Pred_Start_Sbfe'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Sbfe']), axis=1)
  df_eval['PredAlgnWrk_Rfe'] = df_eval.apply(lambda row: abs(row['Pred_Start_Rfe'] - row['Test_Target']) if row['Pred_Start_Rfe'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Rfe']), axis=1)
  df_eval['PredAlgnWrk_Shuffle'] = df_eval.apply(lambda row: abs(row['Pred_Start_Shuffle'] - row['Test_Target']) if row['Pred_Start_Shuffle'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Shuffle']), axis=1)
  df_eval['PredAlgnWrk_PCA'] = df_eval.apply(lambda row: abs(row['Test_Target'] - row['Pred_Start_PCA']) if row['Test_Target'] < row['Pred_Start_PCA'] else abs(row['Pred_Start_PCA'] - row['Test_Target']), axis=1)
  df_eval['PredAlgnWrk_Laplacian'] = df_eval.apply(lambda row: abs(row['Pred_Start_Laplacian'] - row['Test_Target']) if row['Pred_Start_Laplacian'] < row['Test_Target'] else abs(row['Test_Target'] - row['Pred_Start_Laplacian']), axis=1)

  return df_eval

# Function for tabular scoring of ML model performance II
def df_metrics(score_df, model_pearson, model_spearman, model_kendall, model_mi, model_anova, model_sffe, model_sbfe, model_rfe, model_shuffle, model_pca, model_laplacian):

  df_metrics = pd.DataFrame(columns=['ID_ref','Align_total', 'Score',"%improved"])
  df_metrics.loc[0] = ['Baseline',score_df['MeasAlgnWrk'].sum(), 0, 0]
  df_metrics.loc[1] = ['Pearson',score_df['PredAlgnWrk_Pearson'].sum(), model_pearson[0],round(((score_df['PredAlgnWrk_Pearson'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[2] = ['Spearman',score_df['PredAlgnWrk_Spearman'].sum(), model_spearman[0],round(((score_df['PredAlgnWrk_Spearman'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[3] = ['Kendall',score_df['PredAlgnWrk_Kendall'].sum(), model_kendall[0],round(((score_df['PredAlgnWrk_Kendall'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[4] = ['MutualInfo',score_df['PredAlgnWrk_MutualInfo'].sum(), model_mi[0],round(((score_df['PredAlgnWrk_MutualInfo'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[5] = ['ANOVA',score_df['PredAlgnWrk_ANOVA'].sum(), model_anova[0],round(((score_df['PredAlgnWrk_ANOVA'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[6] = ['SFFE',score_df['PredAlgnWrk_Sffe'].sum(), model_sffe[0],round(((score_df['PredAlgnWrk_Sffe'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[7] = ['SBFE',score_df['PredAlgnWrk_Sbfe'].sum(), model_sbfe[0],round(((score_df['PredAlgnWrk_Sbfe'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[8] = ['RFE',score_df['PredAlgnWrk_Rfe'].sum(), model_rfe[0],round(((score_df['PredAlgnWrk_Rfe'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[9] = ['Shuffle',score_df['PredAlgnWrk_Shuffle'].sum(), model_shuffle[0],round(((score_df['PredAlgnWrk_Shuffle'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[10] = ['PCA',score_df['PredAlgnWrk_PCA'].sum(), model_pca[0],round(((score_df['PredAlgnWrk_PCA'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]
  df_metrics.loc[11] = ['Laplacian',score_df['PredAlgnWrk_Laplacian'].sum(), model_pca[0],round(((score_df['PredAlgnWrk_Laplacian'].sum() - score_df['MeasAlgnWrk'].sum())/score_df['MeasAlgnWrk'].sum())*100*(-1),2)]

  df_metrics_s = df_metrics.sort_values(by='Align_total')
  df_reset = df_metrics_s.reset_index(drop=True)

  return df_reset

# Display function - model performance barplot
def show_score(scores_df, metric_main):

    bar_fig = px.bar(scores_df, x="ID_ref", y="Align_total",
                     title="Alignment Work Reduction Prediction",
                     color_discrete_sequence=["red"], opacity=1)

    baseline_index = scores_df[scores_df["ID_ref"] == "Baseline"].index[0]

    bar_colors = ["green" if i < baseline_index else "red" for i in range(len(scores_df))]
    bar_colors[baseline_index] = "orange"
    bar_fig.update_traces(marker=dict(color=bar_colors))

    scatter_fig = go.Figure()
    scatter_fig.add_trace(go.Scatter(x=scores_df["ID_ref"], y=scores_df["Score"],
                                     mode="lines+markers", line=dict(color="black"), name=metric_main))

    scatter_fig.update_layout(
        yaxis=dict(title="Align_total", titlefont=dict(color="red"), tickfont=dict(color="red")),
        yaxis2=dict(title=metric_main, titlefont=dict(color="green"), tickfont=dict(color="green"),
                    overlaying="y", side="right"))

    fig = go.Figure(bar_fig.data + scatter_fig.data)

    for i in range(0, len(fig.data), 2):
        fig.data[i].text = scores_df['%improved'].apply(lambda x: str(round(x, 2)) + '%')
        fig.data[i].textposition = 'outside'

    for x_val, y_val, rmse_val in zip(scores_df["ID_ref"], scores_df["Score"], scores_df["Score"]):
        fig.add_annotation(
            x=x_val,
            y=y_val + (scores_df["Align_total"].max())*0.1, 
            text=f"{metric_main} : {rmse_val:.2f}",
            showarrow=False,
            font=dict(color="black")
        )

    fig.update_traces(showlegend=False, selector=dict(type="scatter"))

    fig.update_layout(
        width=1900,
        height=400,
        title="Alignment Ammount Prediction -  Results Evaluation",
        yaxis=dict(range=[0, scores_df["Align_total"].max()*1.1]) 
    )

    fig.show(renderer="colab")

# Display function - actual model predictions
def score_graph_eval(df):

  markers = ["x", "star", "x", "x", "x", "x", "x", "x","x", "x", "x", "x", "x"]
  colors = ["red", "green", "blue", "black", "orange", "brown", "yellow", "purple", "pink", "violet", "grey", "gold", "pink"]
  fig = go.Figure()

  for i, col in enumerate(['Test_Query', 'Test_Target', 'Pred_Start_Pearson','Pred_Start_Spearman','Pred_Start_Kendall','Pred_Start_MutualInfo','Pred_Start_ANOVA','Pred_Start_Sffe','Pred_Start_Sbfe','Pred_Start_Rfe','Pred_Start_Shuffle','Pred_Start_PCA','Pred_Start_Laplacian']):
      fig.add_trace(
          go.Scatter(
              x=df['ID_ref'],
              y=df[col],
              name=col,
              mode='markers+lines',
              marker=dict(symbol=markers[i], size=10, color=colors[i]),
              line=dict(dash='dot' if i == 0 else 'solid')
          )
      )

  fig.update_layout(
      width=1900,
      height=550,
      title='Alignment Starting Position Prediction',
      xaxis_title='ID_ref',
      yaxis_title='Alignment Actual Value',
      legend=dict(title='')
  )

  fig.show(renderer='colab')

# Dictionary for models
def shapModelCollector(m_pearson, m_spearman, m_kendall, m_mi, m_anova, m_sffe, m_sbfe, m_rfe, m_shuffle, m_pca, m_laplacian):

  model_dict = {
    'Pearson': m_pearson,
    'Spearman': m_spearman,
    'Kendall': m_kendall,
    'MutualInfo': m_mi,
    'ANOVA': m_anova,
    'SFFE': m_sffe,
    'SBFE': m_sbfe,
    'RFE': m_rfe,
    'Shuffle': m_shuffle,
    'PCA': m_pca,
    'Laplacian': m_laplacian
  }

  return model_dict

# Dictionary for train data
def shapTrainCollector(t_pearson, t_spearman, t_kendall, t_mi, t_anova, t_sffe, t_sbfe, t_rfe, t_shuffle, t_pca, t_laplacian):

  train_dict = {
    'Pearson': t_pearson,
    'Spearman': t_spearman,
    'Kendall': t_kendall,
    'MutualInfo': t_mi,
    'ANOVA': t_anova,
    'SFFE': t_sffe,
    'SBFE': t_sbfe,
    'RFE': t_rfe,
    'Shuffle': t_shuffle,
    'PCA': t_pca,
    'Laplacian': t_laplacian
  }

  return train_dict

# Dictionary for best parameters
def bestParamsCollector(p_pearson, p_spearman, p_kendall, p_mi, p_anova, p_sffe, p_sbfe, p_rfe, p_shuffle, p_pca, p_laplacian):

  param_dict = {
    'Pearson': p_pearson,
    'Spearman': p_spearman,
    'Kendall': p_kendall,
    'MutualInfo': p_mi,
    'ANOVA': p_anova,
    'SFFE': p_sffe,
    'SBFE': p_sbfe,
    'RFE': p_rfe,
    'Shuffle': p_shuffle,
    'PCA': p_pca,
    'Laplacian': p_laplacian
  }

  return param_dict

# Display function - Network graph
def displayNetwork(X, target):

  G = nx.Graph()

  for feature in X.columns:
      G.add_node(feature)

  correlation_matrix = X.corr() 
  threshold = 0.3  

  for i, feature1 in enumerate(X.columns):
      for j, feature2 in enumerate(X.columns):
          if i < j and abs(correlation_matrix.loc[feature1, feature2]) >= threshold:
              G.add_edge(feature1, feature2)

  pos = nx.spring_layout(G, seed=42)

  fig1 = go.Figure()

  for node in G.nodes:
        x, y = pos[node]
        color = 'red' if node == target else 'blue'
        fig1.add_trace(go.Scatter(x=[x], y=[y], mode='markers', marker=dict(size=10, color=color), name=node))

  for edge in G.edges:
      x0, y0 = pos[edge[0]]
      x1, y1 = pos[edge[1]]
      fig1.add_trace(go.Scatter(x=[x0, x1, None], y=[y0, y1, None], mode='lines', line=dict(color='gray'), hoverinfo='none'))

  fig1.update_layout(showlegend=False, hovermode='closest', title='Feature Relationships Network Graph')
  fig1.update_layout(width=700, height=700)
  return fig1

# Display function - Dendrogram
def displayDendrogram(base_df):

  base_df_T = base_df.T
  mergings = linkage(base_df_T, method='complete')

  fig2 = ff.create_dendrogram(base_df_T.values,
                            labels=base_df_T.index,
                            orientation='left',
                            color_threshold=10)
  return fig2

# Display function - TSNE
def displayTSNE(base_df, target_name):

    tsne = TSNE(n_components=2, random_state=42, perplexity=8, learning_rate=120, n_iter=1000)
    tsne_results = tsne.fit_transform(base_df)
    tsne_df = pd.DataFrame(tsne_results, columns=['tsne1', 'tsne2'])
    tsne_df['color'] = base_df[target_name]

    fig3 = px.scatter(tsne_df, x='tsne1', y='tsne2', color='color')
    fig3.update_layout(title='TSNE Plot')
    return fig3

# Group plotting function
def showThree(fig1, fig2, fig3, fig4, target):

  fig = make_subplots(rows=1, cols=4, subplot_titles=(f'{target} Network', f'{target} Dendrogram',f'{target} Distribution', f'{target} T-SNE'))

  for trace in fig1.data:
      fig.add_trace(trace, row=1, col=1)

  for trace in fig2.data:
      fig.add_trace(trace, row=1, col=2)

  for trace in fig4.data:
      fig.add_trace(trace, row=1, col=3)

  for trace in fig3.data:
      fig.add_trace(trace, row=1, col=4)

  fig.update_layout(width=1900, height=500, showlegend=False)
  fig.update_xaxes(range=[0, 900], row=1, col=2)
  fig.update_yaxes(fig2['layout']['yaxis'], row=1, col=2)
  fig.show()

# Display function - Histograms
def displayHist(y_query, y_target, target_name):

  fig4 = make_subplots(rows=1, cols=1)
  fig4.update_layout(title='Histograms of Train Target Features', width=500, height=500)
  num_bins = 16

  fig4.add_trace(
          go.Histogram(x=y_query[target_name], nbinsx=num_bins),
          row=1, col=1)
  
  fig4.add_trace(
          go.Histogram(x=y_target[target_name], nbinsx=num_bins),
          row=1, col=1)
      
  return fig4

# Display function - SHAP
def ShowInfo(model, X, features):

  out_box1 = widgets.Output()
  out_box2 = widgets.Output()

  feature_names = features
  explainer = shap.Explainer(model, X)
  shap_values = explainer.shap_values(X)
  
  with out_box1:
    shap.summary_plot(shap_values, X, feature_names=feature_names)

  with out_box2:
    shap.summary_plot(shap_values, X, feature_names=feature_names, plot_type="bar")

  grid = GridspecLayout(6, 12)
  grid[:, 0:4] = out_box1
  grid[:, 4:8] = out_box2
  display(grid)

# Dictionary for feature groups
def featureCollector(f_pearson, f_spearman, f_kendall, f_mi, f_anova, f_sffe, f_sbfe, f_rfe, f_shuffle, f_pca, f_laplacian):

  feature_dict = {
    'Pearson': f_pearson,
    'Spearman': f_spearman,
    'Kendall': f_kendall,
    'MutualInfo': f_mi,
    'ANOVA': f_anova,
    'SFFE': f_sffe,
    'SBFE': f_sbfe,
    'RFE': f_rfe,
    'Shuffle': f_shuffle,
    'PCA': f_pca,
    'Laplacian': f_laplacian
  }
  return feature_dict

# Function to store best selected model data
def selectBestModel(mining_dataframe, dict_models, dict_train, dict_features):

  best_model_name = mining_dataframe.iloc[0,0]
  model = dict_models[best_model_name]
  X = dict_train[best_model_name]
  features = dict_features[best_model_name]

  return model, X , features, best_model_name

In [20]:
# Main data mining function
def featureBusterMining(train_before, train_after, target_name, test_before, test_after, metric, train_model):

  base_df = feature_prep(train_before,train_after,target_name)
  X = base_df.iloc[:, :-1]
  X_test = test_before
  targ_name = base_df.columns[-1]
  y = pd.DataFrame({targ_name: base_df[targ_name]})
  y_test = pd.DataFrame({targ_name: test_after[target_name]})
  print(f"Target feature : {targ_name}")

  if train_model =="XGB" or "XGB+":
    fs_model = XGBRegressor(max_depth =3,
                       n_estimators = 250,
                       learning_rate = 0.1,
                       reg_alpha = 0.0001,
                       random_state=42
                       )
    
  elif train_model =="RF":
    fs_model = RandomForestRegressor(n_estimators = 250,
                                     max_depth = 5,
                                     random_state=42
                                     )
  uselinpred = False
  if train_model =="XGB+":
    uselinpred = True

  fs_pearson = featEngineCorr("pearson",  0.5, fs_model, "r2", X, y, X_test)
  fs_spearman = featEngineCorr("spearman", 0.5, fs_model, "r2", X, y, X_test)
  fs_kendall = featEngineCorr("kendall", 0.5, fs_model, "r2", X, y, X_test)
  fs_mi = statsFs("mi", X, y, X_test)
  fs_anova = statsFs("anova", X, y, X_test)
  fs_sffe = sfsMethods(X, y, X_test, "forward", 5, fs_model, "r2")
  fs_sbfe = sfsMethods(X, y, X_test, "backward", 5, fs_model, "r2")
  fs_rfe = rfeMethods(X, y, X_test, fs_model,"r2")
  fs_shuffle = shuffleMethods(X, y, X_test, fs_model,"r2")
  fs_pca = fsPCA(X, X_test,"MMS")
  fs_laplacian = fsLaplacian(X, y, X_test)

  displayFeaturesScatters(X, base_df, targ_name)
  dn = displayNetwork(base_df, targ_name)
  dd = displayDendrogram(base_df)
  dt = displayTSNE(base_df,targ_name)
  dh = displayHist(y, y_test, targ_name)
  showThree(dn, dd, dt, dh, targ_name)
  displayFeatureSelection(X,fs_pearson[1],fs_spearman[1],fs_kendall[1],fs_mi[1],fs_anova[1],fs_sffe[1],fs_sbfe[1],fs_rfe[1],fs_shuffle[1],fs_pca[1], fs_laplacian[1], targ_name)

  fs_pearson_os_sc = scaleFeatures(fs_pearson[0],fs_pearson[2], "MMS")
  fs_spearman_os_sc = scaleFeatures(fs_spearman[0],fs_spearman[2], "MMS")
  fs_kendall_os_sc = scaleFeatures(fs_kendall[0],fs_kendall[2], "MMS")
  fs_mi_os_sc = scaleFeatures(fs_mi[0],fs_mi[2], "MMS")
  fs_anova_os_sc = scaleFeatures(fs_anova[0],fs_anova[2], "MMS")
  fs_sffe_os_sc = scaleFeatures(fs_sffe[0],fs_sffe[2], "MMS")
  fs_sbfe_os_sc = scaleFeatures(fs_sbfe[0],fs_sbfe[2], "MMS")
  fs_rfe_os_sc = scaleFeatures(fs_rfe[0],fs_rfe[2], "MMS")
  fs_shuffle_os_sc = scaleFeatures(fs_shuffle[0],fs_shuffle[2], "MMS")
  fs_laplacian_os_sc = scaleFeatures(fs_laplacian[0],fs_laplacian[2], "MMS")

  train_pearson = trainModels(fs_pearson_os_sc[0], y, metric, y_test, fs_pearson_os_sc[1], train_model)
  train_spearman = trainModels(fs_spearman_os_sc[0], y, metric, y_test, fs_spearman_os_sc[1], train_model)
  train_kendall = trainModels(fs_kendall_os_sc[0], y , metric, y_test, fs_kendall_os_sc[1], train_model)
  train_mi = trainModels(fs_mi_os_sc[0], y, metric, y_test, fs_mi_os_sc[1], train_model)
  train_anova = trainModels(fs_anova_os_sc[0], y, metric, y_test, fs_anova_os_sc[1], train_model)
  train_sffe = trainModels(fs_sffe_os_sc[0], y, metric, y_test, fs_sffe_os_sc[1], train_model)
  train_sbfe = trainModels(fs_sbfe_os_sc[0], y, metric, y_test, fs_sbfe_os_sc[1], train_model)
  train_rfe = trainModels(fs_rfe_os_sc[0], y, metric, y_test, fs_rfe_os_sc[1], train_model)
  train_shuffle = trainModels(fs_shuffle_os_sc[0], y, metric, y_test, fs_shuffle_os_sc[1], train_model)
  train_pca = trainModels(fs_pca[0], y , metric, y_test, fs_pca[2], train_model)
  train_laplacian = trainModels(fs_laplacian_os_sc[0], y , metric, y_test, fs_laplacian_os_sc[1], train_model)

  dict_models = shapModelCollector(train_pearson[4],train_spearman[4],train_kendall[4],train_mi[4],train_anova[4],train_sffe[4],train_sbfe[4],train_rfe[4],train_shuffle[4],train_pca[4],train_laplacian[4])
  dict_train = shapTrainCollector(train_pearson[5],train_spearman[5],train_kendall[5],train_mi[5],train_anova[5],train_sffe[5],train_sbfe[5],train_rfe[5],train_shuffle[5],train_pca[5],train_laplacian[5])
  dict_features = featureCollector(fs_pearson[1],fs_spearman[1],fs_kendall[1],fs_mi[1],fs_anova[1],fs_sffe[1],fs_sbfe[1],fs_rfe[1],fs_shuffle[1],fs_pca[1], fs_laplacian[1])

  mining_evaluation = score_table_eval(test_before, test_after, target_name, train_pearson[2], train_spearman[2], train_kendall[2],train_mi[2],train_anova[2],train_sffe[2],train_sbfe[2],train_rfe[2],train_shuffle[2], train_pca[2],train_laplacian[2], uselinpred)
  mining_dataframe = df_metrics(mining_evaluation,train_pearson,train_spearman,train_kendall,train_mi,train_anova,train_sffe,train_sbfe,train_rfe,train_shuffle,train_pca,train_laplacian)
  params_dict = bestParamsCollector(train_pearson[1],train_spearman[1],train_kendall[1],train_mi[1],train_anova[1],train_sffe[1],train_sbfe[1],train_rfe[1],train_shuffle[1],train_pca[1],train_laplacian[1])

  show_score(mining_dataframe, metric)
  score_graph_eval(mining_evaluation)

  sh_data = selectBestModel(mining_dataframe, dict_models, dict_train, dict_features)
  print(f"Best model set {sh_data[3]} breakdown")
  ShowInfo(sh_data[0], sh_data[1], sh_data[2])

  final_df = pd.DataFrame()
  final_df[target_name] = ""
  final_df.loc[0, target_name] = mining_dataframe.iloc[0,3]

  return mining_dataframe, mining_evaluation, dict_models, dict_train, params_dict, dict_features, sh_data, final_df

In [22]:
MdTLDAlign_df = featureBusterMining(train_query,train_target,'MdTLDAlign',test_query, test_target,'MeAE','XGB')

Target feature : MdTLDAlign_target


Best model set Laplacian breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [12]:
MdDeflACF_df = featureBusterMining(train_query,train_target,'MdDeflACF',test_query, test_target,'MeAE','XGB')

Target feature : MdDeflACF_target


Best model set SBFE breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [13]:
TLDPush_df = featureBusterMining(train_query,train_target,'TLD.Push',test_query, test_target,'MeAE','RF')

Target feature : TLD.Push_target


Best model set Laplacian breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [53]:
AlignCorrAngleUHR_df = featureBusterMining(train_query,train_target,'AlignCorrAngleUHR',test_query, test_target,'MeAE','XGB+')

Target feature : AlignCorrAngleUHR_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [26]:
CorrStigUppYUHR_df = featureBusterMining(train_query,train_target,'CorrStigUppYUHR',test_query, test_target,'MeAE','XGB')

Target feature : CorrStigUppYUHR_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [27]:
MdACRotLow_df = featureBusterMining(train_query,train_target,'MdACRotLow',test_query, test_target,'MeAE','XGB')

Target feature : MdACRotLow_target


Best model set ANOVA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [28]:
MdCompUHRRem_df = featureBusterMining(train_query,train_target,'MdCompUHRRem',test_query, test_target,'MeAE','XGB')

Target feature : MdCompUHRRem_target


Best model set SBFE breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [52]:
MdDeflDC1_df = featureBusterMining(train_query,train_target,'MdDeflDC1',test_query, test_target,'MeAE','XGB+')

Target feature : MdDeflDC1_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [29]:
MdHRSatPar1_df = featureBusterMining(train_query,train_target,'MdHRSatPar1',test_query, test_target,'MeAE','XGB')

Target feature : MdHRSatPar1_target


Best model set RFE breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [46]:
ShiftCorrAngleUHR_df = featureBusterMining(train_query,train_target,'ShiftCorrAngleUHR',test_query, test_target,'MeAE','XGB+')

Target feature : ShiftCorrAngleUHR_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [50]:
MdHRpar4_df = featureBusterMining(train_query,train_target,'MdHRpar4',test_query, test_target,'MeAE','XGB+')

Target feature : MdHRpar4_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [30]:
MdSDeflACF_df = featureBusterMining(train_query,train_target,'MdSDeflACF',test_query, test_target,'MeAE','XGB')

Target feature : MdSDeflACF_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [31]:
MdSImRotCheb1_df = featureBusterMining(train_query,train_target,'MdSImRotCheb1',test_query, test_target,'MeAE','XGB')

Target feature : MdSImRotCheb1_target


Best model set ANOVA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [32]:
MdSShiftTube3_df = featureBusterMining(train_query,train_target,'MdSShiftTube3',test_query, test_target,'MeAE','XGB')

Target feature : MdSShiftTube3_target


Best model set SBFE breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [36]:
MdACRotUpp_df = featureBusterMining(train_query,train_target,'MdACRotUpp',test_query, test_target,'MeAE','XGB+')

Target feature : MdACRotUpp_target


Best model set PCA breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [37]:
MdSImRotCheb0_df = featureBusterMining(train_query,train_target,'MdSImRotCheb0',test_query, test_target,'MeAE','XGB+')

Target feature : MdSImRotCheb0_target


Best model set SFFE breakdown


GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [61]:
df_models_results = pd.DataFrame(columns=train_target.columns, index=[0])

df_models_results.loc[0, "MdTLDAlign"] = MdTLDAlign_df[7].iloc[0,0]
df_models_results.loc[0, "MdDeflACF"] = MdDeflACF_df[7].iloc[0,0]
df_models_results.loc[0, "TLD.Push"] = TLDPush_df[7].iloc[0,0]
df_models_results.loc[0, "CorrStigUppYUHR"] = CorrStigUppYUHR_df[7].iloc[0,0]
df_models_results.loc[0, "MdACRotLow"] = MdACRotLow_df[7].iloc[0,0]
df_models_results.loc[0, "MdACRotUpp"] = MdACRotUpp_df[7].iloc[0,0]
df_models_results.loc[0, "MdCompUHRRem"] = MdCompUHRRem_df[7].iloc[0,0]
df_models_results.loc[0, "MdHRSatPar1"] = MdHRSatPar1_df[7].iloc[0,0]
df_models_results.loc[0, "MdSDeflACF"] = MdSDeflACF_df[7].iloc[0,0]
df_models_results.loc[0, "MdSImRotCheb0"] = MdSImRotCheb0_df[7].iloc[0,0]
df_models_results.loc[0, "MdSImRotCheb1"] = MdSImRotCheb1_df[7].iloc[0,0]
df_models_results.loc[0, "MdSShiftTube3"] = MdSShiftTube3_df[7].iloc[0,0]
df_models_results.loc[0, "ShiftCorrAngleUHR"] = ShiftCorrAngleUHR_df[7].iloc[0,0]
df_models_results.loc[0, "AlignCorrAngleUHR"] = AlignCorrAngleUHR_df[7].iloc[0,0]
df_models_results.loc[0, "MdDeflDC1"] = MdDeflDC1_df[7].iloc[0,0]
df_models_results.loc[0, "MdHRpar4"] = MdHRpar4_df[7].iloc[0,0]

sorted_df_score = df_models_results.sort_values(by=0, axis=1, ascending=False)

In [62]:
# Display results of alignment reduction

x_values = sorted_df_score.columns.tolist()
y_values = sorted_df_score.iloc[0].tolist()

trace = go.Bar(x=x_values, y=y_values, marker={'color': y_values, 'colorscale': 'Greens'})
fig = go.Figure(data=[trace])

for i in range(len(df_models_results.columns)):
    fig.add_annotation(x=df_models_results.columns[i], y=df_models_results.iloc[0,i],
                       text=str(round(df_models_results.iloc[0,i], 1)),showarrow=False,
                       yshift=df_models_results.iloc[0,i]/5)

fig.update_layout(title='Total Alignment Reduction by Machine Learning models [%]', xaxis_title='Features', yaxis_title='Alignment ammount reduction [%]')
fig.update_layout(width=1200, height=500)
fig.show()