In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import os
import sys
warnings.filterwarnings('ignore')

Importation of custom packages

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE')

In [None]:
from Utilities.data_process import scores_to_target
from Utilities.get_data import get_train
from Utilities.get_data import get_test
from Utilities.bench_validation import test_bench

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Clustering')
from clustering_methods import kmeans_clustering

# Importation of the data from the drive

In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
train_data, train_scores = get_train()

# Missing Values

We have initially many samples with missing values

In [None]:
(train_data.isna().sum(axis=1) > 0).sum() / len(train_data)

0.3283751930423474

32% of the samples have at least one feature witha a missing value (!)

## Method to manage those missing values

A first, albeit radical, solution could be to drop of every sample that contains a missing value. In the following lines of code, one function for each method for filling the missing values

In [None]:
def fill_with_knn(train_data, train_scores, normalize = True, n_knn = 5):
  train_data_output = train_data.copy()
  train_data_processed = train_data.copy()
  if(normalize):
    scaler = MinMaxScaler()
    train_data_processed = scaler.fit_transform(train_data)
  imputer = KNNImputer(n_neighbors=n_knn)
  train_data_processed = imputer.fit_transform(train_data)
  train_target = scores_to_target(train_scores)
  if(normalize):
    train_data_processed = scaler.inverse_transform(train_data_processed)
  train_data_output[:] = train_data_processed
  return train_data_output, train_target

In [None]:
def fill_with_BayesianBridge(train_data, train_scores, normalize = True, max_iter = 100, tol = 0.01):
  train_data_output = train_data.copy()
  train_data_processed = train_data.copy()
  if(normalize):
    scaler = MinMaxScaler()
    train_data_processed = scaler.fit_transform(train_data_processed)
  imputer = IterativeImputer(max_iter=max_iter, tol=tol, verbose=1)
  train_data_processed = imputer.fit_transform(train_data_processed)
  if(normalize):
    train_data_processed = scaler.inverse_transform(train_data_processed)
  train_target = scores_to_target(train_scores)
  train_data_output[:] = train_data_processed
  return train_data_output, train_target

In [None]:
def fill_with_median(train_data, train_scores):
  train_data = train_data.fillna(train_data.median())
  train_target = scores_to_target(train_scores)
  return train_data, train_target

In [None]:
def fill_with_mean(train_data, train_scores):
  train_data = train_data.fillna(train_data.mean())
  train_target = scores_to_target(train_scores)
  return train_data, train_target

In [None]:
def fill_with_kmeans(train_data, train_scores, n_clusters = 4, is_median = True, normalize = True):
  train_data_output = train_data.copy()
  train_data_processed = train_data.copy()
  train_data_processed = kmeans_clustering(train_data_processed, n_clusters, normalize = normalize).copy()
  if(is_median):
    for label in train_data_processed['cluster'].unique():
      train_data_processed.loc[train_data_processed['cluster'] == label] = train_data_processed.loc[train_data_processed['cluster'] == label].fillna(train_data_processed.loc[train_data_processed['cluster'] == label].median())
  else:
    for label in train_data_processed['cluster'].unique():
      train_data_processed.loc[train_data_processed['cluster'] == label] = train_data_processed.loc[train_data_processed['cluster'] == label].fillna(train_data_processed.loc[train_data_processed['cluster'] == label].mean())
  train_target = scores_to_target(train_scores)
  train_data_output[:] = train_data_processed
  return train_data_output, train_target

In [None]:
def drop_missing_values(train_data, train_scores):
  train_data = train_data.loc[(train_data.isna().sum(axis=1) == 0)].copy()
  train_scores = train_scores.loc[train_data.index].copy()
  train_target = scores_to_target(train_scores)
  return train_data, train_target

In [None]:
max_depth_options = [3, 5, 7]
learning_rate_options = [0.01, 0.1, 0.3]
n_estimators_options = [100, 200, 300]
subsample_options = [0.6, 0.8, 1.0]
colsample_bytree_options = [0.6, 0.8, 1.0]

In [None]:
def fill_with_xgb(df: pd.DataFrame, max_depth = 3, learning_rate = 0.1, n_estimators = 100, subsample = 1.0, colsample_bytree = 1.0, random_state = 42):
    dataframe = df.copy()
    for col in dataframe.columns:
        nan_idx = np.where(dataframe[col].isna())[0]
        dataframe['NaN_values'] = 0
        dataframe.loc[nan_idx, 'NaN_values'] = 1
        df_train = dataframe[dataframe['NaN_values'] == 0]
        df_test = dataframe[dataframe['NaN_values'] == 1]
        X_train = df_train.drop([col, 'NaN_values'], axis=1)
        y_train = df_train[col]
        X_test = df_test.drop([col, 'NaN_values'], axis=1)
        model_xgb = xgb.XGBRegressor(random_state=random_state, n_estimators = n_estimators, max_depth = max_depth, learning_rate = learning_rate, subsample = subsample, colsample_bytree = colsample_bytree)
        model_xgb.fit(X_train, y_train)
        y_pred = model_xgb.predict(X_test)
        df.loc[nan_idx, col] = y_pred
    return df

In order to compare those different methods, it might be best not to delete some samples. However, it seems even more import not to drop entire features, in doing so it would become harder to compare the different methods with a same fixed model

## Validation of the filling / dropping method

In order to select the best method for handling the missing values we will proceed as follows:
- We select a benchmark model, not fitted, in our case a Logistic Regression.
- For a given method for handling the missing values chosen, we perform a
grid search on a *static* set of parameters. cross validation, we select the parameters once and for all and we use the same to compare the different methods.

In [None]:
def grid_search_knn(train_data, train_scores, verbose = True, n_min = 2, n_max = 20, only_normalize = True, only_median = True):
  fill_metric = ['median'] if(only_median) else ['median', 'mean']
  normalize = [True] if(only_normalize) else [True, False]
  n_neighbors = np.arange(n_min, n_max + 1, 1)
  score_opti = 0
  n_opti = n_min
  metric_opti = 'median'
  for norm in tqdm(normalize):
    for metric in tqdm(fill_metric):
      for n in tqdm(n_neighbors):
        train_data_processed, train_target_processed = fill_with_knn(train_data, train_scores, normalize = norm, n_knn = n)
        scores_cross_val = test_bench(train_data_processed, train_target_processed, verbose = False)
        if(scores_cross_val > score_opti):
          score_opti = scores_cross_val
          metric_opti = metric
          n_opti = n
  if(verbose):
    print(scores_cross_val, metric_opti, n_opti)
  return scores_cross_val, metric_opti, n_opti

In [None]:
def grid_search_k_means(train_data, train_scores, verbose = True, n_min = 2, n_max = 20, normalize = True, only_median = True):
  fill_metric = ['median'] if(only_median) else ['median', 'mean']
  n_clusters = np.arange(n_min, n_max + 1, 1)
  score_opti = 0
  for metric in tqdm(fill_metric):
    for k in tqdm(n_clusters):
      train_data_processed, train_target_processed = fill_with_kmeans(train_data, train_scores, n_clusters = k, is_median = (metric == 'median'), normalize = normalize)
      scores_cross_val = test_bench(train_data_processed, train_target_processed, verbose = False)
      if(scores_cross_val > score_opti):
        score_opti = scores_cross_val
        best_metric = metric
        best_k = k
  if(verbose):
    print(scores_cross_val, best_metric, best_k)
  return scores_cross_val, best_metric, best_k

In [None]:
def grid_search_ridge_regression(train_data, train_scores, verbose = True, normalize = True, only_median = True, n_iter = 100, tol = 0.1, alpha_1 = [1e-6, 1e-5, 1e-4], alpha_2 = [1e-6, 1e-5, 1e-4], lambda_1 = [1e-6, 1e-5, 1e-4], lambda_2 = [1e-6, 1e-5, 1e-4]):
  score_opti = 0
  for alpha_1_value in tqdm(alpha_1):
    for alpha_2_value in tqdm(alpha_2):
      for lambda_1_value in tqdm(lambda_1):
        for lambda_2_value in tqdm(lambda_2):
          train_data_processed, train_target_processed = fill_with_BayesianBridge(train_data, train_scores, normalize = normalize, max_iter = n_iter, tol = tol)
          scores_cross_val = test_bench(train_data_processed, train_target_processed, verbose = False)
          if(scores_cross_val > score_opti):
            score_opti = scores_cross_val
            best_alpha_1 = alpha_1_value
            best_alpha_2 = alpha_2_value
            best_lambda_1 = lambda_1_value
            best_lambda_2 = lambda_2_value
          if(verbose):
            print(scores_cross_val, best_alpha_1, best_alpha_2, best_lambda_1, best_lambda_2)
          return scores_cross_val, best_alpha_1, best_alpha_2, best_lambda_1, best_lambda_2

In [None]:
def grid_search_xgb(train_data, target, verbose = True, max_depth_options =[3, 5, 7], learning_rate_options = [0.01, 0.1, 0.3], n_estimators_options = [100, 200, 300], subsample_options = [1.0], colsample_bytree_options = [1.0]):
  score_opti = 0
  for max_depth in max_depth_options:
      for learning_rate in learning_rate_options:
          for n_estimators in tqdm(n_estimators_options):
              for subsample in tqdm(subsample_options):
                  for colsample_bytree in tqdm(colsample_bytree_options):
                      train_data_processed = fill_with_xgb(train_data, max_depth, learning_rate, n_estimators, subsample, colsample_bytree)
                      scores_cross_val = test_bench(train_data_processed, target, verbose = False)
                      if(scores_cross_val > score_opti):
                          score_opti = scores_cross_val
                          best_max_depth = max_depth
                          best_learning_rate = learning_rate
                          best_n_estimators = n_estimators
                          best_subsample = subsample
                          best_colsample_bytree = colsample_bytree
  if(verbose):
    print(scores_cross_val, best_max_depth, best_learning_rate, best_n_estimators, best_subsample, best_colsample_bytree)
  return scores_cross_val, best_max_depth, best_learning_rate, best_n_estimators, best_subsample, best_colsample_bytree

In [None]:
train_data, train_scores = get_train()
train_data_processed, train_target_processed = fill_with_knn(train_data, train_scores)
scores_cross_val = test_bench(train_data_processed, train_target_processed)

Accuracy: 0.48687360217771214
Std:  0.006315970447003278


In [None]:
'''
train_data, train_scores = get_train()
grid_search_knn(train_data, train_scores)
'''

'\ntrain_data, train_scores = get_train()\ngrid_search_knn(train_data, train_scores)\n'

best knn (0.4891498267278488, 'median', 13)

In [None]:
train_data, train_scores = get_train()
grid_search_k_means(train_data, train_scores)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
  5%|▌         | 1/19 [02:37<47:11, 157.32s/it][A
 11%|█         | 2/19 [05:09<43:39, 154.08s/it][A
 16%|█▌        | 3/19 [07:43<41:08, 154.25s/it][A
 21%|██        | 4/19 [10:16<38:23, 153.54s/it][A
 26%|██▋       | 5/19 [12:50<35:54, 153.88s/it][A
 32%|███▏      | 6/19 [15:22<33:13, 153.36s/it][A
 37%|███▋      | 7/19 [17:55<30:38, 153.23s/it][A
 42%|████▏     | 8/19 [20:31<28:14, 154.05s/it][A
 47%|████▋     | 9/19 [23:09<25:51, 155.14s/it][A
 53%|█████▎    | 10/19 [25:42<23:11, 154.59s/it][A
 58%|█████▊    | 11/19 [28:18<20:38, 154.87s/it][A
 63%|██████▎   | 12/19 [30:54<18:08, 155.48s/it][A
 68%|██████▊   | 13/19 [33:32<15:37, 156.25s/it][A
 74%|███████▎  | 14/19 [36:14<13:09, 157.86s/it][A
 79%|███████▉  | 15/19 [38:49<10:27, 156.97s/it][A
 84%|████████▍ | 16/19 [41:25<07:50, 156.67s/it][A
 89%|████████▉ | 17/19 [43:59<05:11, 155.78s/it][A
 95%|█████████▍| 18/19 [46:34<02:35, 155.68s/

0.49638434372966245 median 13





(0.49638434372966245, 'median', 13)

best k means : (0.49695335031367377, 'median', 10)

In [None]:
'''
train_data, train_scores = get_train()
train_data_processed, train_target_processed = fill_with_mean(train_data, train_scores)
scores_cross_val = test_bench(train_data_processed, train_target_processed)

train_data, train_scores = get_train()
train_data_processed, train_target_processed = fill_with_median(train_data, train_scores)
scores_cross_val = test_bench(train_data_processed, train_target_processed)

train_data_processed, train_target_processed = fill_by_kmeans(train_data, train_scores)
scores_cross_val = test_bench(train_data_processed, train_target_processed)

train_data, train_scores = get_train()
train_data_processed, train_target_processed = drop_missing_values(train_data, train_scores)
scores_cross_val = test_bench(train_data_processed, train_target_processed)
'''

'\ntrain_data, train_scores = get_train()\ntrain_data_processed, train_target_processed = fill_with_mean(train_data, train_scores)\nscores_cross_val = test_bench(train_data_processed, train_target_processed)\n\ntrain_data, train_scores = get_train()\ntrain_data_processed, train_target_processed = fill_with_median(train_data, train_scores)\nscores_cross_val = test_bench(train_data_processed, train_target_processed)\n\ntrain_data_processed, train_target_processed = fill_by_kmeans(train_data, train_scores)\nscores_cross_val = test_bench(train_data_processed, train_target_processed)\n\ntrain_data, train_scores = get_train()\ntrain_data_processed, train_target_processed = drop_missing_values(train_data, train_scores)\nscores_cross_val = test_bench(train_data_processed, train_target_processed)\n'

For some reasons, there is an issue when we try to denormalize the data with the MinMaxScaler after the inputation

In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
train_data, train_scores = get_train()
train_data_processed, train_target_processed = fill_with_BayesianBridge(train_data, train_scores, normalize = True, max_iter = 500, tol = 0.05)
scores_cross_val = test_bench(train_data_processed, train_target_processed)

NameError: name 'fill_with_BayesianBridge' is not defined

In [None]:
train_data, train_scores = get_train()
scores_cross_val, best_alpha_1, best_alpha_2, best_lambda_1, best_lambda_2 = grid_serach_ridge_regression(train_data, train_scores, n_iter = 100, tol = 0.5, alpha_1 = [1e-6, 1e-7, 1e-8], alpha_2 = [1e-6, 1e-7, 1e-8], lambda_1 = [1e-6, 1e-7, 1e-8], lambda_2 = [1e-6, 1e-7, 1e-8])

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/3 [00:00<?, ?it/s][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A

[IterativeImputer] Completing matrix with shape (12303, 280)
[IterativeImputer] Change: 10.13668874859399, scaled tolerance: 0.5 
[IterativeImputer] Change: 3.689775364887116, scaled tolerance: 0.5 
[IterativeImputer] Change: 2.394355418042688, scaled tolerance: 0.5 
[IterativeImputer] Change: 1.7165676969891557, scaled tolerance: 0.5 
[IterativeImputer] Change: 1.3726404029223693, scaled tolerance: 0.5 
[IterativeImputer] Change: 1.2014420109867803, scaled tolerance: 0.5 
[IterativeImputer] Change: 1.0652367532622216, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.9650917500697951, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.8872313849635852, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.8205378439314867, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.7614996252158659, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.7080337787140436, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.659763819555693, scaled tolerance: 0.5 
[IterativeImputer] Change: 0.6165

  0%|          | 0/3 [52:55<?, ?it/s]
  0%|          | 0/3 [52:55<?, ?it/s]
  0%|          | 0/3 [52:55<?, ?it/s]
  0%|          | 0/3 [52:55<?, ?it/s]

0.49638431069398053 1e-06 1e-06 1e-06 1e-06





Opti Fill method

For now the method that score the highest with the benchmark model was the "median k means" method.

In [None]:
train_data, train_scores = get_train()
train_data_processed, train_target_processed = fill_with_kmeans(train_data, train_scores, n_clusters = 13)
scores_cross_val = test_bench(train_data_processed, train_target_processed)

Accuracy: 0.49760342646092043
Std:  0.008139208720353782


Export of the dataset

In [None]:
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data'
os.chdir(file_path)
train_data_processed.to_csv('train_data_processed.csv', index=False)

### xgboost

In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
train_data, train_scores = get_train()
target = scores_to_target(train_scores)
scores_cross_val, best_max_depth, best_learning_rate, best_n_estimators, best_subsample, best_colsample_bytree = grid_search_xgb(train_data, target)
print(scores_cross_val, best_max_depth, best_learning_rate, best_n_estimators, best_subsample, best_colsample_bytree)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [06:45<00:00, 405.13s/it]

100%|██████████| 1/1 [06:45<00:00, 405.15s/it]
 33%|███▎      | 1/3 [06:45<13:30, 405.16s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [08:38<00:00, 518.48s/it]

100%|██████████| 1/1 [08:38<00:00, 518.49s/it]
 67%|██████▋   | 2/3 [15:23<07:51, 471.83s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [10:55<00:00, 655.55s/it]

100%|██████████| 1/1 [10:55<00:00, 655.57s/it]
100%|██████████| 3/3 [26:19<00:00, 526.41s/it]
  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [06:04<00:00, 364.72s/it]

100%|██████████| 1/1 [06:04<00:00, 364.74s/it]
 33%|███▎      | 1/3 [06:04<12:09, 364.75s/it]
  0%|   

0.49589680313706835 3 0.01 100 1.0 1.0
0.49589680313706835 3 0.01 100 1.0 1.0





#Filling the Test data

In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
test_data = get_test()

In [None]:
test_data_processed, train_target_processed = fill_with_kmeans(test_data, train_scores, n_clusters = 13)

In [None]:
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data'
os.chdir(file_path)
test_data_processed.to_csv('test_data_processed.csv', index=False)