In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive inside the Colab runtime; the project paths used by the
# cells below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Work from the project root. The `Utilities` imports below are presumably
# resolved relative to this working directory — TODO confirm.
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE')

# Project-local helpers (submission, target building, data loading, validation).
from Utilities.submit import submit
from Utilities.data_process import scores_to_target
from Utilities.get_data import get_test
from Utilities.get_data import get_train_processed
from Utilities.get_data import get_test_processed
from Utilities.get_data import get_train
from Utilities.bench_validation import test_bench

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Switch to the data folder before calling the loaders.
# NOTE(review): the loaders presumably read files relative to the current
# working directory — confirm in Utilities.get_data.
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')

# Processed train features with their scores, processed test features, and the
# test set returned by `get_test` (kept under a "pre_processed" name).
train_data, train_scores = get_train_processed()
test_data = get_test_processed()
test_data_pre_processed = get_test()
# Target used by the validation bench, derived from the training scores.
target = scores_to_target(train_scores)

# Feature Engineering

In [5]:
# Snapshot the feature names once and derive the count from that snapshot.
list_columns = list(train_data.columns)
n_columns = len(list_columns)
print(f"We have {n_columns} columns")

We have 280 columns


In [6]:
# Bare expression: renders the list of feature names as the cell output.
list_columns

['HOME_TEAM_SHOTS_TOTAL_season_sum',
 'HOME_TEAM_SHOTS_INSIDEBOX_season_sum',
 'HOME_TEAM_SHOTS_OFF_TARGET_season_sum',
 'HOME_TEAM_SHOTS_ON_TARGET_season_sum',
 'HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum',
 'HOME_TEAM_PASSES_season_sum',
 'HOME_TEAM_SUCCESSFUL_PASSES_season_sum',
 'HOME_TEAM_SAVES_season_sum',
 'HOME_TEAM_CORNERS_season_sum',
 'HOME_TEAM_FOULS_season_sum',
 'HOME_TEAM_YELLOWCARDS_season_sum',
 'HOME_TEAM_REDCARDS_season_sum',
 'HOME_TEAM_OFFSIDES_season_sum',
 'HOME_TEAM_ATTACKS_season_sum',
 'HOME_TEAM_PENALTIES_season_sum',
 'HOME_TEAM_SUBSTITUTIONS_season_sum',
 'HOME_TEAM_BALL_SAFE_season_sum',
 'HOME_TEAM_DANGEROUS_ATTACKS_season_sum',
 'HOME_TEAM_INJURIES_season_sum',
 'HOME_TEAM_GOALS_season_sum',
 'HOME_TEAM_GAME_WON_season_sum',
 'HOME_TEAM_GAME_DRAW_season_sum',
 'HOME_TEAM_GAME_LOST_season_sum',
 'HOME_TEAM_SHOTS_TOTAL_season_average',
 'HOME_TEAM_SHOTS_INSIDEBOX_season_average',
 'HOME_TEAM_SHOTS_OFF_TARGET_season_average',
 'HOME_TEAM_SHOTS_ON_TARGET_season_a

# Dropping correlated features

In [7]:
def drop_corrolated_features(train_data, corr_matrix, threshold = 0.85):
  """Drop every feature that is highly correlated with an earlier feature.

  Columns of ``corr_matrix`` are scanned in order. A column is flagged when
  its absolute correlation with at least one column positioned *before* it is
  strictly greater than ``threshold``. The first feature of each correlated
  group is therefore kept, and NaN correlations never flag a column.

  Parameters
  ----------
  train_data : pd.DataFrame
      Frame to prune. It is left untouched and must contain every flagged
      column (``DataFrame.drop`` raises ``KeyError`` otherwise).
  corr_matrix : pd.DataFrame
      Square correlation matrix whose rows follow the same order as its
      columns, e.g. the output of ``DataFrame.corr()``. It may come from a
      different frame than ``train_data`` (e.g. pruning a test set with the
      training correlations).
  threshold : float, default 0.85
      Absolute-correlation cut-off (strict ``>`` comparison).

  Returns
  -------
  pd.DataFrame
      Independent copy of ``train_data`` without the flagged columns.
  """
  abs_corr = np.abs(corr_matrix.to_numpy(dtype=float))
  # Strict lower triangle: feature i is only compared with features j < i.
  # One vectorized mask replaces the O(n^2) scalar `.iloc` lookups.
  flagged = np.tril(abs_corr > threshold, k=-1).any(axis=1)
  # A list (instead of a set) keeps the original column order.
  features_to_drop = list(corr_matrix.columns[flagged])
  # Explicit copy so the result never shares memory with the caller's frame.
  train_data_processed = train_data.drop(columns = features_to_drop).copy()
  return train_data_processed

In [8]:
# First pass with a hand-picked threshold of 0.8 (overrides the 0.85 default).
train_data_processed = drop_corrolated_features(train_data, train_data.corr(), threshold = 0.8)

After dropping the correlated features, we get the following:

In [9]:
# Refresh the feature-name snapshot for the pruned frame and report its size.
list_columns = list(train_data_processed.columns)
n_columns = len(list_columns)
print(f"After dropping the correlated features we have {n_columns} columns")

After dropping the correlated features we have 161 columns


In [10]:
def grid_search_drop_corrolated_features(train_data, threshold_min, threshold_max, n_thresholds = 10):
  """Search the correlation threshold that gives the best validation score.

  ``n_thresholds`` evenly spaced thresholds between ``threshold_min`` and
  ``threshold_max`` (both included) are tried. For each one the correlated
  features are dropped from ``train_data`` and the pruned frame is scored
  with ``test_bench`` against the notebook-level ``target``.

  Returns
  -------
  tuple
      ``(best_score, best_threshold)``. Both stay at ``0`` when no candidate
      scores strictly above ``0``.
  """
  # The correlation matrix does not depend on the threshold: compute it once.
  correlations = train_data.corr()
  best_score, best_threshold = 0, 0
  for candidate in tqdm(np.linspace(threshold_min, threshold_max, n_thresholds)):
    pruned = drop_corrolated_features(train_data, correlations, threshold = candidate)
    cv_score = test_bench(pruned, target, verbose = False)
    if cv_score > best_score:
      best_score, best_threshold = cv_score, candidate
  return best_score, best_threshold

In [11]:
# 20 thresholds between 0.7 and 0.9. NOTE: the recorded run of this cell was
# interrupted (KeyboardInterrupt), so it did not assign these two variables.
score_opti, threshold_opti = grid_search_drop_corrolated_features(train_data, 0.7, 0.9, n_thresholds=20)

  0%|          | 0/20 [00:38<?, ?it/s]


KeyboardInterrupt: 

We now save the pruned train and test datasets.

In [12]:
# Hard-coded (score, threshold) pair — presumably recorded from an earlier
# complete run of the grid search; TODO confirm provenance.
score_opti, threshold_opti = (0.4958156014311057, 0.8789473684210527)
# Compute the training correlation matrix once and reuse it for both splits,
# so train and test lose exactly the same columns.
train_corr_matrix = train_data.corr()
train_data_processed = drop_corrolated_features(train_data, train_corr_matrix, threshold = threshold_opti)
test_data_processed = drop_corrolated_features(test_data, train_corr_matrix, threshold = threshold_opti)

In [13]:
# Persist the pruned frames as CSV files. The file names are relative, so
# they are written into `file_path` thanks to the chdir below.
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data'
os.chdir(file_path)
# index=False: the row index is not written to the CSV files.
train_data_processed.to_csv('train_data_features_and_nan_processed.csv', index=False)
test_data_processed.to_csv('test_data_features_and_nan_processed.csv', index=False)

# Locally Linear Embedding

In [14]:
from sklearn.manifold import LocallyLinearEmbedding

In [15]:
def drop_linear_embedding(train_data, reduc = 10, n_neighbors = 10):
  """Reduce the number of features with a Locally Linear Embedding.

  Parameters
  ----------
  train_data : pd.DataFrame
      Frame to embed.
  reduc : float, default 10
      Percentage of columns to remove: the embedding keeps
      ``int(n_columns * (1 - reduc / 100))`` components.
  n_neighbors : int, default 10
      Forwarded to ``LocallyLinearEmbedding``.

  Returns
  -------
  pd.DataFrame
      Embedded data wrapped in a new frame built directly from the
      ``fit_transform`` output (the labels of ``train_data`` are not reused).
  """
  n_col_target = int(len(train_data.columns) * (1- 0.01*reduc))
  lle = LocallyLinearEmbedding(n_components=n_col_target, n_neighbors=n_neighbors)
  return pd.DataFrame(lle.fit_transform(train_data))

In [16]:
def grid_search_linear_embedding(train_data, reduc_min, reduc_max, n_reduc, n_neighbors_min, n_neighbors_max):
  """Grid-search the LLE reduction percentage and neighbourhood size.

  ``n_reduc`` evenly spaced reduction percentages between ``reduc_min`` and
  ``reduc_max`` are combined with every integer ``n_neighbors`` from
  ``n_neighbors_min`` to ``n_neighbors_max`` (both included). Each embedding
  is scored with ``test_bench`` against the notebook-level ``target``.

  Returns
  -------
  tuple
      ``(best_score, best_reduc, best_n_neighbors)``. When no combination
      scores strictly above ``0`` the result is
      ``(0, reduc_min, n_neighbors_min)``.
  """
  best_score, best_reduc, best_n_neighbors = 0, reduc_min, n_neighbors_min
  neighbor_grid = range(n_neighbors_min, n_neighbors_max+1)
  # The progress bar tracks the outer (reduction) loop only.
  for reduc in tqdm(np.linspace(reduc_min, reduc_max, n_reduc)):
    for n_neighbors in neighbor_grid:
      embedded = drop_linear_embedding(train_data, reduc = reduc, n_neighbors = n_neighbors)
      cv_score = test_bench(embedded, target, verbose = False)
      if cv_score > best_score:
        best_score, best_reduc, best_n_neighbors = cv_score, reduc, n_neighbors
  return best_score, best_reduc, best_n_neighbors

# Combination drop

In [17]:
# n_neighbors_min == n_neighbors_max == 10, so only the reduction percentage
# varies here (5 values between 1 and 20).
grid_search_linear_embedding(train_data, reduc_min = 1, reduc_max = 20, n_reduc = 5, n_neighbors_min = 10, n_neighbors_max=10)

100%|██████████| 5/5 [27:32<00:00, 330.59s/it]


(0.4094165568230245, 5.75, 10)

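Locally linear embedding does not seem to work well: its best cross-validation score (≈ 0.409) is below the ≈ 0.496 obtained by dropping correlated features.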