In [None]:
# Install scikit-optimize (provides skopt.BayesSearchCV, imported below).
# `%pip` installs into the environment of the running kernel.
%pip install -q scikit-optimize



In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import os
import sys
from skopt import BayesSearchCV
warnings.filterwarnings('ignore')

In [None]:
# Make Google Drive visible to the Colab runtime under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the project root (presumably so the local `Utilities` package is importable).
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE')

# Project helpers: submission writer, label conversion, data loaders, validation bench.
from Utilities.submit import submit
from Utilities.data_process import scores_to_target
from Utilities.get_data import (
    get_test_data_normalized_features_low_importance_dropped,
    get_test_data_non_normalized_features_low_importance_dropped,
    get_train_data_normalized_features_low_importance_dropped,
    get_train_data_non_normalized_features_low_importance_dropped,
    get_train_processed,
    get_train,
    get_train_data_features_and_nan_processed,
    get_test,
    get_test_data_features_and_nan_processed,
    get_test_data_normalized_features_low_importance_dropped_and_correlated_features,
    get_train_data_normalized_features_low_importance_dropped_and_correlated_features,
)
from Utilities.bench_validation import test_bench

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Loaders are always called from the data folder in this notebook
# (presumably they read files through relative paths — TODO confirm).
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')

# Training features (normalized, low-importance features dropped) and the associated scores.
train_data, train_scores = get_train_data_normalized_features_low_importance_dropped()
# Class labels derived from the scores; used as the target of the logistic-regression searches.
target = scores_to_target(train_scores)

In [None]:
# Train / test variants where correlated features are dropped as well.
# NOTE(review): this rebinds `train_scores` but `target` is not recomputed; the cells
# that fit on `train_data_dropped` with `target` assume both loaders return the same
# rows in the same order — confirm.
train_data_dropped, train_scores = get_train_data_normalized_features_low_importance_dropped_and_correlated_features()
test_data_dropped = get_test_data_normalized_features_low_importance_dropped_and_correlated_features()

# Logistic Regression

In [None]:
# Bayesian search over the L1 regularisation strength of a logistic regression,
# fitted on `train_data` / `target`.
param_space = {
    'C': (0.03, 0.04, 'log-uniform'),  # (low, high, prior): continuous range sampled by the search
    'penalty': ['l1'],
    'solver': ['saga']
}

bayes_search = BayesSearchCV(
    # The saga solver shuffles the data, so the estimator is seeded as well as the
    # search; otherwise two runs of this cell can report different scores.
    LogisticRegression(max_iter=200, random_state=42),
    search_spaces=param_space,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42
)
bayes_search.fit(train_data, target)
print(bayes_search.best_score_)
print(bayes_search.best_params_)

0.49817246608061366
OrderedDict([('C', 0.03270861845887186), ('penalty', 'l1'), ('solver', 'saga')])


In [None]:
# Fine grid search of C in a narrow window [C - eps, C + eps].
# Value kept from an earlier run: C=0.0030026315789473683
# NOTE(review): this window (around 0.003) is ten times smaller than the range explored
# by the Bayesian search above (0.03 - 0.04) — confirm the order of magnitude is intended.
C = 0.003
n_c = 30
eps = 0.0001
param_grid = {
    'C': np.linspace(C-eps, C+eps, n_c),  # regularisation strengths to evaluate
    'penalty': ['l1'],       # penalty type
    'solver': ['saga'] # solver compatible with the chosen penalty
}

# The saga solver shuffles the data; seeding it makes the search reproducible.
model = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(train_data, target)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best score:", grid_search.best_score_)
print("Best params :", best_params)
print("Best Model :", best_model)

Best params : {'C': 0.0031, 'penalty': 'l1', 'solver': 'saga'}
Best Model : LogisticRegression(C=0.0031, max_iter=1000, penalty='l1', solver='saga')


In [None]:
# Display the selected C at full precision (the printed summary above shows it rounded).
best_params['C']

0.0031111111111111114

In [None]:
max_iter = 1000

# Refit a logistic regression with the hyper-parameters selected by the grid search.
# `multi_class` is no longer passed: it is deprecated in recent scikit-learn, and with
# the saga solver the default already fits a multinomial model on a multi-class target.
# The solver is seeded because saga shuffles the data.
model_opti = LogisticRegression(
    C=best_params['C'],
    penalty=best_params['penalty'],
    solver=best_params['solver'],
    max_iter=max_iter,
    random_state=42,
)
model_opti.fit(train_data, target)
# Accuracy on the data the model was fitted on (optimistic; not a validation score).
model_opti.score(train_data, target)

In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
# `model_opti` was fitted on the normalized / low-importance-dropped training features,
# so the predictions must be made on the test set prepared the same way.
submission = submit(model_opti, get_test_data_normalized_features_low_importance_dropped(), get_test())

In [None]:
# Persist the logistic-regression submission in the shared submissions folder.
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Submissions'
os.chdir(file_path)
submission.to_csv(os.path.join(file_path, 'submission_6.csv'), index=False)

### With the dropped data

In [None]:
# Same Bayesian search as for the first dataset, on `train_data_dropped`.
param_space = {
    'C': (0.03, 0.04, 'log-uniform'),  # (low, high, prior): continuous range sampled by the search
    'penalty': ['l1'],
    'solver': ['saga']
}

bayes_search = BayesSearchCV(
    # The saga solver shuffles the data, so the estimator is seeded as well as the
    # search; otherwise two runs of this cell can report different scores.
    LogisticRegression(max_iter=200, random_state=42),
    search_spaces=param_space,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42
)
bayes_search.fit(train_data_dropped, target)
print(bayes_search.best_score_)
print(bayes_search.best_params_)

0.4963845089080716
OrderedDict([('C', 0.03686993972814559), ('penalty', 'l1'), ('solver', 'saga')])


In [None]:
# Fine grid search of C in a narrow window [C - eps, C + eps], on `train_data_dropped`.
# Value kept from an earlier run: C=0.0030026315789473683
# NOTE(review): this window (around 0.003) is ten times smaller than the range explored
# by the Bayesian search above (0.03 - 0.04), and the recorded score here (0.435) is
# well below the Bayesian one (0.496) — confirm the order of magnitude is intended.
C = 0.003
n_c = 30
eps = 0.0001
param_grid = {
    'C': np.linspace(C-eps, C+eps, n_c),  # regularisation strengths to evaluate
    'penalty': ['l1'],       # penalty type
    'solver': ['saga'] # solver compatible with the chosen penalty
}

# The saga solver shuffles the data; seeding it makes the search reproducible.
model = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(train_data_dropped, target)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best score:", grid_search.best_score_)
print("Best params :", best_params)
print("Best Model :", best_model)

Best score: 0.43534097779011105
Best params : {'C': 0.0029000000000000002, 'penalty': 'l1', 'solver': 'saga'}
Best Model : LogisticRegression(C=0.0029000000000000002, max_iter=1000, penalty='l1',
                   solver='saga')


# Random Forest

In [None]:
# Load the inputs of this search explicitly: `train_data_nan_filled` and
# `train_scores_nan_filled` are not defined anywhere earlier in the notebook, so the
# cell could not run on a fresh kernel.
# NOTE(review): assumes the "nan filled" data is what
# get_train_data_features_and_nan_processed() returns — confirm.
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
train_data_nan_filled, train_scores_nan_filled = get_train_data_features_and_nan_processed()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],          # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],          # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],            # Minimum samples required at each leaf node
    'max_features': ['sqrt', 'log2']          # Number of features to consider when looking for the best split
}

# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',       # Use accuracy as the scoring metric
    cv=5,                     # 5-fold cross-validation
    verbose=1,
    n_jobs=-1                 # Use all available cores
)

# Fit the grid search to the data
grid_search.fit(train_data_nan_filled, scores_to_target(train_scores_nan_filled))

# Access the best model and parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)
best_rf = grid_search.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Validation Score: 0.4927262696438423


# Xgboost


In [None]:
# Loaders are called from the data folder (presumably relative paths — TODO confirm).
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')

# Rebinds `train_data` / `train_scores` to the "features and NaN processed" training set.
train_data, train_scores = get_train_data_features_and_nan_processed()
# Labels shifted by +1 (presumably from {-1, 0, 1} to {0, 1, 2} so they are valid
# class indices for the boosted models — TODO confirm the range of scores_to_target).
target = scores_to_target(train_scores) + 1

In [None]:
# Base three-class XGBoost estimator reused by the hyper-parameter searches below.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    random_state=42,
)

In [None]:
# Randomized search over XGBoost hyper-parameters on `train_data_dropped`.
param_grid = {
  'n_estimators': [200],             # Number of boosting rounds
  'max_depth': [15],                 # Maximum depth of trees
  'learning_rate': [0.0001, 0.05],   # Step size shrinkage
  'subsample': [0.1, 0.2, 0.5],      # Subsample ratio of training instances
  'colsample_bytree': [0.6, 1.0],    # Subsample ratio of columns when constructing each tree
  'gamma': [0, 0.2, 0.5],            # Minimum loss reduction required to make a further partition
  'reg_alpha': [0, 0.1],             # L1 regularization term on weights
  'reg_lambda': [0.1, 1.0]           # L2 regularization term on weights
}

randomized_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,                  # Number of random combinations to try
    scoring='accuracy',         # Metric for evaluation
    cv=5,                       # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV.
# `target` already carries the +1 label shift applied when it was built for XGBoost;
# adding 1 again would move the labels outside the 3 classes the model is configured for.
# NOTE(review): `target` comes from a different loader than `train_data_dropped`;
# this assumes both hold the same rows in the same order — confirm.
randomized_search.fit(train_data_dropped, target)

# Print the best parameters and best score
print("Best parameters found: ", randomized_search.best_params_)
print("Best cross-validation score: ", randomized_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found:  {'subsample': 0.1, 'reg_lambda': 1.0, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 15, 'learning_rate': 0.0001, 'gamma': 0, 'colsample_bytree': 0.6}
Best cross-validation score:  0.4927257410729329


In [None]:
# Exhaustive grid search over a shallower XGBoost configuration on `train_data_dropped`.
param_grid = {
  'n_estimators': [150, 200],        # Number of boosting rounds
  'max_depth': [3],                  # Maximum depth of trees
  'learning_rate': [0.05],           # Step size shrinkage
  'subsample': [0.6, 0.8, 1.0],      # Subsample ratio of training instances
  'colsample_bytree': [0.4, 0.6],    # Subsample ratio of columns when constructing each tree
  'gamma': [0.005, 0.1, 0.15],       # Minimum loss reduction required to make a further partition
  'reg_alpha': [0, 0.1],             # L1 regularization term on weights
  'reg_lambda': [0.5, 1.0]           # L2 regularization term on weights
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',         # Metric for evaluation
    cv=5,                       # 5-fold cross-validation
    verbose=1,
    n_jobs=-1                   # Use all available cores
)

# `target` already carries the +1 label shift applied when it was built for XGBoost;
# adding 1 again would move the labels outside the 3 classes the model is configured for.
grid_search.fit(train_data_dropped, target)

# Report the results of THIS search (grid_search), not those of the earlier
# randomized search, which would simply repeat the previous cell's output.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best parameters found:  {'subsample': 0.1, 'reg_lambda': 1.0, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 15, 'learning_rate': 0.0001, 'gamma': 0, 'colsample_bytree': 0.6}
Best cross-validation score:  0.4927257410729329


In [None]:
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
# Final XGBoost model with hand-picked hyper-parameters.
model_opti = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42, subsample = 0.8, reg_lambda = 1.5, reg_alpha = 0.5, n_estimators = 200, max_depth = 3, learning_rate = 0.05, gamma = 0.1, colsample_bytree=0.6)
# `target` already carries the +1 label shift applied when it was built for XGBoost;
# shifting it a second time would desynchronise the labels from the increment of 1
# passed to `submit` below.
model_opti.fit(train_data, target)
# The last argument is the label increment (presumably removed again by `submit` — TODO confirm).
submission = submit(model_opti, get_test_data_features_and_nan_processed(), get_test(), 1)

In [None]:
# Persist the XGBoost submission in the shared submissions folder.
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Submissions'
os.chdir(file_path)
submission.to_csv(os.path.join(file_path, 'submission_7.csv'), index=False)

### With the dropped data

# LightGBM

In [None]:
# LightGBM only performs row bagging when a bagging frequency is set; with the default
# `subsample_freq=0` the `subsample` values in the grid would have no effect and a third
# of the grid dimension would be wasted. `verbose=-1` silences the per-fit info logs.
lgb_model = lgb.LGBMClassifier(random_state=42, subsample_freq=1, verbose=-1)

# Parameter grid
param_grid = {
    'num_leaves': [15, 31, 63],         # Controls tree complexity
    'max_depth': [-1, 5, 10],           # Maximum depth of the tree
    'learning_rate': [0.01, 0.05],      # Step size for optimization
    'n_estimators': [50, 100, 200],     # Number of boosting iterations
    'subsample': [0.6, 0.8, 1.0],       # Fraction of data for bagging
    'colsample_bytree': [0.6, 0.8, 1.0] # Fraction of features for tree
}

# Grid search
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='accuracy',   # Metric to optimize
    cv=5,                 # 5-fold cross-validation
    verbose=1,
    n_jobs=-1             # Use all available cores
)

# Fit the GridSearchCV
grid_search.fit(train_data_dropped, target)

# Display the best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1536
[LightGBM] [Info] Number of data points in the train set: 12303, number of used features: 140
[LightGBM] [Info] Start training from score -1.185158
[LightGBM] [Info] Start training from score -1.351077
[LightGBM] [Info] Start training from score -0.831626
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 50, 'num_leaves': 31, 'subsample': 0.6}
Best Cross-Validated Accuracy: 0.4927


In [None]:
# LightGBM only performs row bagging when a bagging frequency is set; with the default
# `subsample_freq=0` the `subsample` values in the grid would have no effect and a third
# of the grid dimension would be wasted. `verbose=-1` silences the per-fit info logs.
lgb_model = lgb.LGBMClassifier(random_state=42, subsample_freq=1, verbose=-1)

# Parameter grid (smaller trees, fixed number of boosting rounds)
param_grid = {
    'num_leaves': [5, 15],          # Controls tree complexity
    'max_depth': [-1, 3],           # Maximum depth of the tree
    'learning_rate': [0.01, 0.05],  # Step size for optimization
    'n_estimators': [200],          # Number of boosting iterations
    'subsample': [0.6, 0.8, 1.0],   # Fraction of data for bagging
    'colsample_bytree': [0.6, 0.8]  # Fraction of features for tree
}

# Grid search
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='accuracy',   # Metric to optimize
    cv=5,                 # 5-fold cross-validation
    verbose=1,
    n_jobs=-1             # Use all available cores
)

# Fit the GridSearchCV
grid_search.fit(train_data_dropped, target)

# Display the best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1536
[LightGBM] [Info] Number of data points in the train set: 12303, number of used features: 140
[LightGBM] [Info] Start training from score -1.185158
[LightGBM] [Info] Start training from score -1.351077
[LightGBM] [Info] Start training from score -0.831626
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 15, 'subsample': 0.6}
Best Cross-Validated Accuracy: 0.4913


# Neural Networks

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [None]:
# Rebuild the shifted labels from `train_scores` instead of incrementing `target` in
# place: `target = target + 1` adds one more shift every time the cell is executed (and
# one on top of the shift already applied when `target` was built for XGBoost), which
# pushes the labels outside the 3 classes of the softmax output.
# NOTE(review): assumes scores_to_target returns labels in {-1, 0, 1} — confirm.
target = scores_to_target(train_scores) + 1
X_train, X_val, y_train, y_val = train_test_split(train_data, target, test_size=0.2, random_state=42)

In [None]:
def create_model(input_shape):
    """Build and compile a dense softmax classifier with three output classes.

    Args:
        input_shape: Number of input features (an int such as ``X.shape[1]``),
            or an explicit shape given as a tuple/list.

    Returns:
        A compiled ``Sequential`` model: five ReLU hidden layers
        (512, 256, 128, 64, 32 units), each followed by a 0.3 dropout, and a
        3-unit softmax output. It is compiled with the Adam optimizer, the
        sparse categorical cross-entropy loss (integer class labels) and the
        accuracy metric.
    """
    # Honour the argument instead of reading the global `X_train`, so the network
    # can be built for any dataset and does not depend on notebook state.
    if isinstance(input_shape, (tuple, list)):
        feature_shape = tuple(input_shape)
    else:
        feature_shape = (int(input_shape),)

    layers = []
    for position, units in enumerate((512, 256, 128, 64, 32)):
        if position == 0:
            # Only the first layer declares the input shape; deeper layers infer it.
            layers.append(Dense(units, activation='relu', input_shape=feature_shape))
        else:
            layers.append(Dense(units, activation='relu'))
        layers.append(Dropout(0.3))
    layers.append(Dense(3, activation='softmax'))  # 3 output units for 3 classes

    model = Sequential(layers)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Shared objects for the cross-validated training loop.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = list()

In [None]:
# Start from an empty list so that re-running this cell does not mix the accuracies
# of several runs into the average.
fold_accuracies = []

# Plain arrays make the fold selection purely positional: indexing `target` with
# `target[train_index]` is label-based when `target` is a pandas Series.
features = np.asarray(train_data)
labels = np.asarray(target)

for train_index, val_index in kf.split(features):
    # Split data
    X_train, X_val = features[train_index], features[val_index]
    y_train, y_val = labels[train_index], labels[val_index]

    # Create a new instance of the model
    model = create_model(features.shape[1])

    # Train the model. `early_stopping` monitors `val_loss`, which only exists when
    # validation data is supplied; a slice of the training fold is reserved for it so
    # the held-out fold stays untouched for the evaluation below.
    model.fit(
        X_train,
        y_train,
        epochs=10,  # Adjust epochs as needed
        batch_size=32,
        validation_split=0.1,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Evaluate the model on the validation set
    val_predictions = np.argmax(model.predict(X_val), axis=-1)
    val_accuracy = accuracy_score(y_val, val_predictions)
    fold_accuracies.append(val_accuracy)
    print(f"Validation Accuracy: {val_accuracy}")

print("Average Validation Accuracy:", np.mean(fold_accuracies))

In [None]:
from Utilities.submit import submit
from Utilities.get_data import get_test
from Utilities.get_data import get_test_processed

# Every other loader call in this notebook is made from the data folder.
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
test_data = get_test_processed()
test_data_pre_processed = get_test()

# Train the network on the full training set before submitting: a freshly created
# model only has randomly initialised weights.
# NOTE(review): `test_data` must have the same feature columns as `train_data`, and
# `submit` must accept a Keras model (whose predict returns class probabilities) — confirm.
nn_model = create_model(train_data.shape[1])
nn_model.fit(train_data, target, epochs=10, batch_size=32, verbose=0)
submission = submit(nn_model, test_data, test_data_pre_processed)

In [None]:
# Build the network for the training feature count and fit it before predicting;
# an unfitted model would only output the result of its random initial weights.
model_submission = create_model(train_data.shape[1])
model_submission.fit(train_data, target, epochs=10, batch_size=32, verbose=0)
# Predict on the processed test features (`test_data`), not on the raw frame, and keep
# the most probable class for each row.
# NOTE(review): assumes `test_data` has the same feature columns as `train_data` — confirm.
predictions = np.argmax(model_submission.predict(test_data), axis=-1)

In [None]:
# One-hot encode the predicted class (0 -> AWAY_WINS, 1 -> DRAW, 2 -> HOME_WINS) on the
# index of the raw test frame. The columns are built directly because chained
# assignment (`frame[col][mask] = 1`) may write to a temporary copy instead of the frame.
submission = pd.DataFrame(
    {
        'AWAY_WINS': (predictions == 0).astype(int),
        'DRAW': (predictions == 1).astype(int),
        'HOME_WINS': (predictions == 2).astype(int),
    },
    index=test_data_pre_processed.index,
)
submission.head()

In [None]:
# Turn the index into a regular column so it is written to the CSV
# (presumably the match identifier — TODO confirm).
# NOTE: this cell rebinds `submission` from itself, so running it twice changes the result.
submission = submission.reset_index()
submission.head()

In [None]:
# Persist the neural-network submission in the shared submissions folder.
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Submissions'
os.chdir(file_path)
submission.to_csv(os.path.join(file_path, 'submission_4.csv'), index=False)

The neural network's results were well below the accuracy obtained with the random forest or the logistic regression.

# Model Ensembling

Optimal models for the non-dropped dataset:

In [None]:
# Base learners with the hyper-parameters retained for the non-dropped dataset.
logistic_regression_opti = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.0031,
    max_iter=1000,
)
xgboost_opti = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    random_state=42,
    n_estimators=200,
    max_depth=15,
    learning_rate=0.0001,
    gamma=0,
    subsample=0.2,
    colsample_bytree=0.6,
    reg_alpha=0,
    reg_lambda=1.0,
)
lightboost_opti = lgb.LGBMClassifier(
    random_state=42,
    n_estimators=200,
    num_leaves=15,
    max_depth=-1,
    learning_rate=0.01,
    subsample=0.6,
    colsample_bytree=1.0,
)

Optimal models for the dropped dataset:

In [None]:
# Base learners with the hyper-parameters retained for the dropped dataset.
# Running this cell replaces the three estimators defined for the non-dropped dataset.
# The saga solver shuffles the data, so the logistic regression is seeded to keep the
# ensemble reproducible; the XGBoost model is seeded like its non-dropped counterpart.
logistic_regression_opti = LogisticRegression(C=0.03686993972814559, max_iter=1000, penalty='l1', solver='saga', random_state=42)
xgboost_opti = xgb.XGBClassifier(**{'subsample': 0.1, 'reg_lambda': 1.0, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 15, 'learning_rate': 0.0001, 'gamma': 0, 'colsample_bytree': 0.6, 'random_state': 42})
lightboost_opti = lgb.LGBMClassifier(**{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 15, 'subsample': 0.6, 'random_state':42})

In [None]:
from sklearn.ensemble import StackingClassifier

# Stack the three tuned base learners; a class-balanced L2 logistic regression
# combines their outputs.
stacking_model = StackingClassifier(
    estimators=list(zip(
        ('logistic', 'xgb', 'lgb'),
        (logistic_regression_opti, xgboost_opti, lightboost_opti),
    )),
    final_estimator=LogisticRegression(
        penalty='l2',
        C=0.01,
        solver='lbfgs',
        class_weight='balanced',
        max_iter=1000,
    ),
    n_jobs=-1,
)

In [None]:
# Hyper-parameters of the stacking ensemble: regularisation of the meta-model and the
# number of internal folds used to build its training features.
# The grid is split per penalty because the meta-model's `lbfgs` solver only supports
# the L2 penalty: every L1 candidate would fail to fit and silently receive no score.
# L1 candidates therefore switch the meta-model to `saga`.
meta_C_values = [0.001, 0.01, 0.1, 0.3]  # Regularization for meta-model
stacking_cv_values = [3, 5]              # Cross-validation splits for stacking
param_grid = [
    {
        'final_estimator__C': meta_C_values,
        'final_estimator__penalty': ['l2'],
        'final_estimator__solver': ['lbfgs'],
        'cv': stacking_cv_values,
    },
    {
        'final_estimator__C': meta_C_values,
        'final_estimator__penalty': ['l1'],
        'final_estimator__solver': ['saga'],
        'cv': stacking_cv_values,
    },
]

# Grid search with cross-validation
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='accuracy',   # Use accuracy as the metric
    verbose=2,
    n_jobs=-1             # Use all available cores
)

# Fit grid search
grid_search.fit(train_data_dropped, target)

# Keep the best model (refitted on the whole training set by GridSearchCV)
best_stacking_model = grid_search.best_estimator_

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'cv': 5, 'final_estimator__C': 0.001, 'final_estimator__penalty': 'l2'}
Best Cross-Validated Accuracy: 0.4818


In [None]:
# Refit the selected ensemble on the whole dropped training set and report its
# accuracy on that same data (a training score, not a validation score).
print(best_stacking_model.fit(train_data_dropped, target).score(train_data_dropped, target))

0.5071933674713485


In [None]:
# List the distinct classes predicted on the training set; the recorded output shows
# only two of the three classes.
print(np.unique(best_stacking_model.predict(train_data_dropped)))

[0. 2.]


In [None]:
# Refit the ensemble, then list the distinct classes it predicts on the training set.
print(np.unique(best_stacking_model.fit(train_data_dropped, target).predict(train_data_dropped)))

[0. 2.]


In [None]:
# Raw predictions of the ensemble on the training set.
best_stacking_model.predict(train_data_dropped)

array([1., 3., 1., ..., 1., 1., 1.])

In [None]:
# Distinct predicted labels. The recorded values (1, 3) differ from the earlier (0, 2),
# which suggests `target` was shifted between runs — NOTE(review): confirm.
np.unique(best_stacking_model.predict(train_data_dropped))

array([1., 3.])

In [None]:
# Loaders are called from the data folder (presumably relative paths — TODO confirm).
os.chdir('/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Data')
# Raw test frame, passed to `submit` next to the processed features.
test_data_init = get_test()
# Test features prepared like `train_data_dropped` (same loader family).
test_data = get_test_data_normalized_features_low_importance_dropped_and_correlated_features()

In [None]:
# Build the submission from the stacked model's predictions.
# NOTE(review): `target_increment=1` presumably undoes a +1 label shift; it is only
# correct if `target` was shifted exactly once when the model was fitted — confirm.
submission = submit(best_stacking_model, test_data, test_data_init, target_increment=1)

In [None]:
# Preview the first rows of the submission before writing it to disk.
submission.head()

Unnamed: 0,ID,HOME_WINS,DRAW,AWAY_WINS
0,12303,1,0,0
1,12304,0,0,1
2,12305,1,0,0
3,12306,1,0,0
4,12307,0,0,1


In [None]:
# Persist the stacking submission in the shared submissions folder.
file_path = '/content/drive/My Drive/QRT_FOOT_DATA_CHALLENGE/Utilities/Submissions'
os.chdir(file_path)
submission.to_csv(os.path.join(file_path, 'submission_12.csv'), index=False)

# Model ensembling with dataset with dropped correlated features