MAIN Notebook to compare different model results

1. Loading the dataset

In [1]:
# 1.1. Enable the IPython autoreload extension so that edits to imported
# modules are picked up without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
# 1.2. Import the packages
from pathlib import Path

import altair as alt
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# 1.3. Read the processed training/validation splits and the cleaned test set
# from Parquet into DataFrames. Files are read in the order listed and bound
# to the names on the left in that same order.
X_train, X_val, y_train, y_val, X_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed/X_train.parquet',
        '../data/processed/X_val.parquet',
        '../data/processed/y_train.parquet',
        '../data/processed/y_val.parquet',
        '../data/processed/test_cleaned.parquet',
    )
)

In [4]:
# Take the 'player_id' column out of the test features and keep it aside.
# DataFrame.pop mutates X_test in place, so an unguarded pop raises KeyError
# when this cell is re-run. The guard makes the cell idempotent: on a re-run
# X_test no longer has the column and the previously extracted
# player_id_test is simply kept.
if 'player_id' in X_test.columns:
    player_id_test = X_test.pop('player_id')
elif 'player_id_test' not in globals():
    # Fresh kernel and the column is genuinely missing: fail loudly here
    # rather than with a NameError in a later cell.
    raise KeyError("'player_id' column not found in X_test")

In [5]:
# Convert the single-column label DataFrame to a Series.
# The isinstance guard keeps this cell idempotent: once y_val is already a
# Series, applying the 2-D indexer `.iloc[:, 0]` again would raise an
# IndexingError ("Too many indexers").
if isinstance(y_val, pd.DataFrame):
    y_val = y_val.iloc[:, 0]
y_val.shape

(9324,)

In [6]:
# Convert the single-column label DataFrame to a Series.
# The isinstance guard keeps this cell idempotent: once y_train is already a
# Series, applying the 2-D indexer `.iloc[:, 0]` again would raise an
# IndexingError ("Too many indexers").
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.iloc[:, 0]
y_train.shape

(37294,)

2. Load the saved models, generate predictions, and evaluate the performance of each

In [7]:
# 2.1 Load the previously saved models from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
# (The 'RamdomForest_Model' spelling is kept because later cells use it.)
RamdomForest_Model, GradientBoost_Model = map(
    joblib.load,
    ('../models/randomForest.joblib', '../models/GB.joblib'),
)

In [8]:
# 2.2 Random forest: keep column 1 of predict_proba, i.e. the probability of
# the positive class (class 1), for the training and validation features.
y_train_probs_rf, y_val_probs_rf = (
    RamdomForest_Model.predict_proba(features)[:, 1]
    for features in (X_train, X_val)
)

In [9]:
# 2.3 Gradient boosting: keep column 1 of predict_proba, i.e. the probability
# of the positive class (class 1), for the training and validation features.
y_train_probs_gb, y_val_probs_gb = [
    GradientBoost_Model.predict_proba(split)[:, 1]
    for split in (X_train, X_val)
]

In [10]:
# 2.4 Random forest: compute the predicted probabilities for the test data.
# Column 1 of predict_proba is kept (presumably the positive class, as in the
# train/validation cells; confirm against the model's classes_).
# X_test no longer contains 'player_id' here; it was popped in an earlier cell.
y_test_probs_rf = RamdomForest_Model.predict_proba(X_test)[:, 1]

In [11]:
# 2.5 Gradient boosting: compute the predicted probabilities for the test data.
# Column 1 of predict_proba is kept (presumably the positive class, as in the
# train/validation cells; confirm against the model's classes_).
# X_test no longer contains 'player_id' here; it was popped in an earlier cell.
y_test_probs_gb = GradientBoost_Model.predict_proba(X_test)[:, 1]

In [12]:
# 2.6 Attach the 'player_id' column to the predicted probabilities.
# y_test_probs_rf / y_test_probs_gb hold one probability per X_test row, in
# X_test row order. Each is wrapped in a DataFrame that reuses
# player_id_test's index: pd.concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 RangeIndex would mis-pair ids and probabilities (and introduce
# NaN rows) whenever the test data's index is not the default one.
y_test_probs_df_rf = pd.DataFrame(
    y_test_probs_rf, columns=['drafted'], index=player_id_test.index
)
y_test_probs_df_gb = pd.DataFrame(
    y_test_probs_gb, columns=['drafted'], index=player_id_test.index
)

# Concatenate player_id_test and the probability frames along the columns,
# giving the two columns 'player_id' and 'drafted'.
Results_final_rf = pd.concat([player_id_test, y_test_probs_df_rf], axis=1)
Results_final_gb = pd.concat([player_id_test, y_test_probs_df_gb], axis=1)

In [14]:
# 2.7 Save the test-set probabilities as CSV files.

# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before writing.
OUTPUT_DIR = Path('../data/external/final')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Results_final_rf / Results_final_gb are already DataFrames, so they are
# written directly; index=False keeps the row index out of the files.
Results_final_rf.to_csv(OUTPUT_DIR / 'Results_final_rf.csv', index=False)
Results_final_gb.to_csv(OUTPUT_DIR / 'Results_final_gb.csv', index=False)
