In [19]:
import pandas as pd
import numpy as np
from src import configuration as config
from sklearn.model_selection import train_test_split

DATA_DIR = config.DATA_RAW_DIR

df_train = config.load_dataset(DATA_DIR / "dataset_rank_train.csv")
# split the data
df_train, df_test = train_test_split(df_train, test_size=0.25, random_state=42)

# print the shape of the data
print(f"df_train shape: {df_train.shape}")
print(f"df_test shape: {df_test.shape}")
print(df_train.head(2))

df_train shape: (27040, 7)
df_test shape: (9014, 7)
       dataset model tuning scoring     encoder  cv_score  rank
22557       29   DTC   full     AUC  CV10RGLMME  0.837250   0.0
18217    40981    LR     no      F1          CE  0.780664  28.0


In [12]:
factors = ['dataset', 'model', 'tuning', 'scoring']
new_index =  ['encoder']

In [26]:
# Here we essentially remove all the encoders and make sure that the factors are unique meaning that we have no duplicates

print(f"df_train shape: {df_train.shape}")
print(f"Initial df_train: \n {df_train.head(2)}")

# Select the factors and encoder columns from the training data and group by the factors
grouped_train = df_train[factors + ["encoder"]].groupby(factors)
print(f"After the groupby: \n {grouped_train.head(2)}")

# Aggregate the grouped data by replacing all values with NaN
aggregated_train = grouped_train.agg(lambda x: np.nan)
print(f"Before the reset:\n {aggregated_train.head(2)}")
print(f"aggregated_train shape: {aggregated_train.shape}")

# Reset the index of the aggregated data to make the factors columns regular columns
X_train = aggregated_train.reset_index()[factors]
print(f"After the reset:\n {X_train.head(2)}")
print(f"X_train shape: {X_train.shape}")

# Do the same for the test data
grouped_test = df_test[factors + ["encoder"]].groupby(factors)
aggregated_test = grouped_test.agg(lambda x: np.nan)
X_test = aggregated_test.reset_index()[factors]

df_train shape: (27040, 7)
Initial df_train: 
        dataset model tuning scoring     encoder  cv_score  rank
22557       29   DTC   full     AUC  CV10RGLMME  0.837250   0.0
18217    40981    LR     no      F1          CE  0.780664  28.0
After the groupby: 
        dataset model tuning scoring     encoder
22557       29   DTC   full     AUC  CV10RGLMME
18217    40981    LR     no      F1          CE
21858      981    LR     no      F1       CV2TE
13962     1169   DTC  model      F1     BUCV2TE
1680     40981   DTC  model     AUC        ME1E
...        ...   ...    ...     ...         ...
17699    42344   SVC     no     ACC         CBE
3767     42343   KNC  model     AUC          TE
1137     42750   KNC  model      F1       DTEM5
13089    42344   SVC     no      F1         CBE
17130    42343   SVC     no     ACC          OE

[2317 rows x 5 columns]
Before the reset:
                               encoder
dataset model tuning scoring         
3       DTC   full   ACC          NaN
      

In [14]:
from src.features import pairwise_utils as pu

# join to ensure X_train and y_train's indices are ordered the same
y_train = pd.merge(X_train,
                   pu.get_pairwise_target(df_train, features=factors, target="rank", column_to_compare="encoder"),
                   on=factors, how="left").drop(factors, axis=1).fillna(0)

In [18]:
# shape of y_train
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_train: \n {y_train.head(2)}")

X_train shape: (1160, 4)
y_train shape: (1160, 992)
y_train: 
    (CV10RGLMME, CV2TE)  (CV2TE, CV10RGLMME)  (CV10RGLMME, ME1E)  \
0                  0.0                  0.0                 0.0   
1                  0.0                  0.0                 0.0   

   (ME1E, CV10RGLMME)  (CV10RGLMME, PBTE01)  (PBTE01, CV10RGLMME)  \
0                 0.0                   1.0                   0.0   
1                 0.0                   1.0                   0.0   

   (CV10RGLMME, DTEM5)  (DTEM5, CV10RGLMME)  (CV10RGLMME, CV10TE)  \
0                  0.0                  0.0                   0.0   
1                  0.0                  0.0                   0.0   

   (CV10TE, CV10RGLMME)  ...  (BUCV5RGLMME, DTEM2)  (DTEM2, BUCV5RGLMME)  \
0                   0.0  ...                   0.0                   0.0   
1                   0.0  ...                   0.0                   0.0   

   (BUCV5RGLMME, PBTE0001)  (PBTE0001, BUCV5RGLMME)  (BUCV5RGLMME, CBE)  \
0              

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
# import mutli output random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from category_encoders import OneHotEncoder
from src.features import encoder_utils as eu
import src.pipeline.evaluation.evaluation_utils as er

# create a random forest classifier
rf = RandomForestClassifier()

# create a multi-output classifier using the random forest
multi_rf = MultiOutputClassifier(rf)

dummy_pipe = Pipeline([("encoder", eu.NoY(OneHotEncoder())), ("model", multi_rf)])
y_pred = pd.DataFrame(dummy_pipe.fit(X_train, y_train).predict(X_test), columns=y_train.columns, index=X_test.index)
df_pred = pd.merge(df_test,
                   pu.join_pairwise2rankings(X_test, y_pred, factors),
                   on=factors + ["encoder"], how="inner")

rankings_test = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank")
rankings_pred = er.get_rankings(df_pred, factors=factors, new_index=new_index, target="rank_pred")
print(er.average_spearman(rankings_test, rankings_pred))

  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))


0.00038541406487959
