In [1]:
import os

# Expose only the GPU with index 2 to CUDA. This is set here, ahead of the
# cell that imports torch, so the variable is already in the environment
# when torch is loaded.
os.environ.update(CUDA_VISIBLE_DEVICES='2')

In [1]:
import numpy as np
import pandas as pd
import pickle
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, ndcg_score
from sklearn.model_selection import train_test_split
import torch

In [2]:
def _cuda_unavailable():
    """Stand-in for ``torch.cuda.is_available`` that always answers False."""
    return False


# Replace torch's CUDA probe so that every later availability check in this
# kernel reports that no GPU can be used.
torch.cuda.is_available = _cuda_unavailable

In [3]:
def open_file(file):
    """Return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle)

In [4]:
class Data_From_File():
    """Load a pickled ranking dataset and expose its three splits as arrays.

    The pickle must contain an indexable object whose items 0, 1 and 2 can each
    be converted to a ``pandas.DataFrame`` with the columns ``'fl_features'``,
    ``'labels'`` and ``'query_id'``.  The items are used as the train, test and
    validation split, in that order.
    NOTE(review): the (train, test, vali) order is what ``__init__`` assumes;
    confirm it matches how the file was written.

    Each row is treated as one query: ``'labels'`` must hold a sized,
    array-like entry per row (one value per document).
    Presumably ``'fl_features'`` holds the matching
    ``(n_documents, n_features)`` matrix - TODO confirm.

    Attributes set by ``__init__``:
        data_full: the raw unpickled object.
        data_train, data_test, data_vali: the three splits as DataFrames.
        X_*, y_*, q_*: features, scaled labels and query ids of each split,
            flattened to one entry per document when ``full`` is true.
    """

    @staticmethod
    def open_file(file):
        """Return the object stored in the pickle file at path ``file``."""
        # SECURITY: unpickling can execute arbitrary code embedded in the file;
        # only load files from a trusted source.
        with open(file, 'rb') as f:
            data = pickle.load(f)
        return data

    @staticmethod
    def split(data):
        """Wrap the first three items of ``data`` in DataFrames.

        Returns:
            A 3-tuple of DataFrames built from ``data[0]``, ``data[1]`` and
            ``data[2]``.
        """
        return pd.DataFrame(data[0]), pd.DataFrame(data[1]), pd.DataFrame(data[2])

    @staticmethod
    def get_data_from_pd(data, need = True):
        """Extract per-query features, scaled labels and query ids from a split.

        Args:
            data: DataFrame with ``'fl_features'``, ``'labels'`` and
                ``'query_id'`` columns.
            need: when true, also build the per-document query-id lists.

        Returns:
            ``(X, y, q)`` where ``X`` is the ``'fl_features'`` column as an
            array, ``y`` is the ``'labels'`` column divided by 4, and ``q`` is
            a list holding, for every row, that row's query id repeated once
            per label (an empty list when ``need`` is false).
        """
        X = data['fl_features'].to_numpy()
        # Divide every label entry by 4.
        # NOTE(review): presumably grades run 0-4, so this maps them onto
        # [0, 1] - confirm against the data.
        y = data['labels'].to_numpy() / 4
        q = []
        if need:
            # Walk the two columns positionally instead of looking rows up by
            # label: this works for any DataFrame index (not just 0..n-1) and
            # avoids one Series lookup per row.
            q = [
                [query_id] * len(labels)
                for query_id, labels in zip(data['query_id'], data['labels'])
            ]

        return X, y, q

    @staticmethod
    def get_doc_query(X, y, q):
        """Flatten per-query arrays into per-document arrays.

        Args:
            X: sequence of per-query feature matrices.
            y: sequence of per-query label arrays.
            q: sequence of per-query query-id lists; may be empty (as returned
                by ``get_data_from_pd(..., need=False)``).

        Returns:
            ``(Xfull, yfull, qfull)``: the row-wise stack of ``X`` and the
            concatenations of ``y`` and ``q``.  ``qfull`` is an empty array
            when ``q`` is empty.
        """
        # np.concatenate rejects an empty sequence, so handle "no query ids"
        # explicitly instead of raising.
        qfull = np.concatenate(q) if len(q) > 0 else np.array([])
        yfull = np.concatenate(y)
        Xfull = np.vstack(X)

        return Xfull, yfull, qfull

    def __init__(self, file, pool = False, full = True):
        """Load ``file`` and populate the split attributes.

        Args:
            file: path of the pickle file to read.
            pool: currently unused; kept so existing calls keep working.  The
                ``Pool`` construction it used to guard was disabled.
            full: when true, flatten every split to one entry per document via
                ``get_doc_query``; otherwise keep the per-query arrays.
        """
        self.data_full = self.open_file(file)
        self.data_train, self.data_test, self.data_vali = self.split(self.data_full)

        self.get_data_test()
        self.get_data_train()
        self.get_data_vali()

        if full:
            self.X_train, self.y_train, self.q_train = self.get_doc_query(self.X_train, self.y_train, self.q_train)
            self.X_test, self.y_test, self.q_test = self.get_doc_query(self.X_test, self.y_test, self.q_test)
            self.X_vali, self.y_vali, self.q_vali = self.get_doc_query(self.X_vali, self.y_vali, self.q_vali)

    def get_data_train(self):
        """(Re)extract the per-query train arrays, store and return them."""
        self.X_train, self.y_train, self.q_train = self.get_data_from_pd(self.data_train)
        return self.X_train, self.y_train, self.q_train

    def get_data_test(self):
        """(Re)extract the per-query test arrays, store and return them."""
        self.X_test, self.y_test, self.q_test = self.get_data_from_pd(self.data_test)
        return self.X_test, self.y_test, self.q_test

    def get_data_vali(self):
        """(Re)extract the per-query validation arrays, store and return them."""
        self.X_vali, self.y_vali, self.q_vali = self.get_data_from_pd(self.data_vali)
        return self.X_vali, self.y_vali, self.q_vali


In [5]:
# Read the pickled splits and flatten them to per-document arrays
# (Data_From_File flattens by default).
dataset = Data_From_File(file='datas_full.pkl')

In [6]:
# Maps a scaled relevance score (0.0, 0.25, ..., 1.0) to its integer class
# index 0..4. Every grade / 4 is exactly representable as a float, so the
# keys are identical to the literal values.
score_to_class = {grade / 4: grade for grade in range(5)}

In [7]:
# Element-wise lookup of each scaled score in score_to_class.
to_class = np.vectorize(score_to_class.get)

# Features are used as loaded; labels are converted from scaled scores to
# integer class indices for the classifier.
X_train = dataset.X_train
X_val = dataset.X_vali
y_train = to_class(dataset.y_train)
y_val = to_class(dataset.y_vali)

In [8]:
# All TabNet settings gathered in one mapping so they can be inspected or
# tweaked before the classifier is built.
tabnet_params = {
    "n_d": 8,
    "n_a": 8,
    "n_steps": 3,
    "gamma": 1.3,
    "n_independent": 2,
    "n_shared": 2,
    # Adam with a step-wise learning-rate decay.
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": {"lr": 2e-2},
    "scheduler_fn": torch.optim.lr_scheduler.StepLR,
    "scheduler_params": {"step_size": 10, "gamma": 0.9},
    "mask_type": 'sparsemax',  # "sparsemax" or "entmax"
}
model = TabNetClassifier(**tabnet_params)



In [9]:
def calculate_ndcg(y_true, y_pred_proba):
    """NDCG of one query, ranking documents by their predicted class.

    Args:
        y_true: true relevance of each document of the query.
        y_pred_proba: per-document class probabilities, one row per document.

    Returns:
        The value of ``ndcg_score`` for this single query.
    """
    # The most probable class index, divided by 4.0, is the ranking score.
    predicted_class = np.argmax(y_pred_proba, axis=1)
    y_pred_score = predicted_class / 4.0
    # ndcg_score expects one row per query, hence the extra list level.
    return ndcg_score(y_true=[y_true], y_score=[y_pred_score])

In [None]:
max_epochs = 10
batch_size = 1024
eval_interval = 2  # run the NDCG evaluation every `eval_interval` epochs
max_docs_per_query = 100  # cap on documents scored per query in the NDCG evaluation
ndcg_rng = np.random.default_rng(0)  # seeded so the document subsample is reproducible

# Train one epoch per `fit` call so that custom evaluation can run in between.
for epoch in range(1, max_epochs + 1):
    print(f"Epoch {epoch}/{max_epochs}")

    model.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_val, y_val)],
        eval_name=["validation"],
        eval_metric=["accuracy"],
        max_epochs=1,
        batch_size=batch_size,
        drop_last=True,  # the final, incomplete batch of each epoch is skipped
        # Without warm_start every `fit` call re-initialises the network, so
        # nothing would accumulate across iterations. The first epoch starts
        # from scratch, which keeps a re-run of this cell reproducible.
        # NOTE(review): `fit` presumably still rebuilds the optimizer and LR
        # scheduler on each call, so the StepLR schedule would never advance -
        # confirm against the installed pytorch-tabnet version.
        warm_start=epoch > 1,
    )

    val_preds = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_preds)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    if epoch % eval_interval == 0:
        print("Calculating NDCG on test set...")

        ndcg_scores = []
        for query_id, group in dataset.data_test.groupby("query_id"):
            # Each row stores the documents of one query as arrays, so stack
            # them into a 2-D (documents x features) matrix and a flat label
            # vector, the same way Data_From_File.get_doc_query does.
            test_features = np.vstack(group['fl_features'].tolist())
            true_labels = np.concatenate(group['labels'].tolist())

            # Bound the evaluation cost per query with a random subsample.
            if len(true_labels) > max_docs_per_query:
                keep = ndcg_rng.choice(len(true_labels), size=max_docs_per_query, replace=False)
                test_features = test_features[keep]
                true_labels = true_labels[keep]

            # NDCG is not meaningful for a single document, so skip such queries.
            if len(true_labels) < 2:
                continue

            pred_proba = model.predict_proba(test_features)

            # Raw grades are passed as-is: NDCG is normalised by the ideal
            # ordering, so rescaling the grades would not change the result.
            ndcg = calculate_ndcg(true_labels, pred_proba)
            ndcg_scores.append(ndcg)

        if ndcg_scores:
            avg_ndcg = np.mean(ndcg_scores)
            print(f"NDCG on Test Set (Epoch {epoch}): {avg_ndcg:.4f}")
        else:
            print(f"NDCG on Test Set (Epoch {epoch}): no query with at least 2 documents")



Epoch 1/10


KeyboardInterrupt: 