In [87]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LayerNormalization, RNN, GRUCell, SpatialDropout1D
from sklearn.model_selection import train_test_split
# from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import time
from datetime import datetime
from sklearn.metrics import accuracy_score, jaccard_score, classification_report, hamming_loss
import tensorflow_ranking as tfr

# Vocabulary size (most frequent words kept); used as the Embedding layer's input dimension.
MAX_NB_WORDS = 60000
# Number of tokens per input sequence; used as the Embedding layer's input_length.
MAX_SEQUENCE_LENGTH = 250
# Size of each embedding vector; used as the Embedding layer's output dimension.
EMBEDDING_DIM = 100


In [2]:

def load_gru():
    """Load the saved GRU feature/label arrays and split them into train and test parts.

    Reads ``X_gru.npy`` and ``y.npy`` from ``../../vectorised_data/`` and holds
    out 20% of the rows for testing, using a fixed random seed.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    features = np.load('../../vectorised_data/X_gru.npy')
    labels = np.load('../../vectorised_data/y.npy')

    # Fixed random_state keeps the split identical from run to run.
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
    )

    print('Loaded data')

    return train_x, test_x, train_y, test_y

def load_data(gru=False,w2v=False):
    """Return the train/test split for the requested feature representation.

    Parameters:
        gru (bool): When truthy, delegate to ``load_gru()`` and return its result.
        w2v (bool): Accepted but not read by this function.

    Returns:
        The value of ``load_gru()`` when ``gru`` is truthy, otherwise ``None``
        (the non-GRU loading path below is commented out).
    """
    if not gru:
        # Disabled sparse-matrix loading path, kept for reference:
        # X = scipy.sparse.load_npz('vectorised_data/X.npz')
        # y = np.load('vectorised_data/y.npy')
        #
        # # split data into train and test
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        #
        # print('Loaded data')
        #
        # return X_train, X_test, y_train, y_test
        return None

    return load_gru()

In [95]:
class RankGRU:
    """Stacked-GRU text model that emits 20 independent sigmoid scores per input.

    ``predict`` returns (and stores in ``self.preds``) the raw scores so that
    callers can apply their own thresholds; ``write_metrics`` converts the
    stored scores to 0/1 indicators before computing classification metrics.
    """

    def __init__(self, load_models=False):
        """Set hyper-parameters and build (or load) the Keras model.

        Parameters:
            load_models (bool): Forwarded to ``create_model``; when true a saved
                model is loaded instead of building a new one.
        """
        self.preds = None
        self.params = {
            'units': 128,
            'dropout': 0.2,
            'layers': 2,
            'batch_size': 64,
            'epochs': 5,
            'lr': 0.001,
        }
        self.epochs = self.params['epochs']
        self.batch_size = self.params['batch_size']
        # Wall-clock durations (seconds) of the last fit() / predict() call.
        self.train_time = 0
        self.predict_time = 0
        self.create_model(load_models)

    def create_model(self, load_models=False):
        """Restrict TensorFlow to the first GPU (if any) and create ``self.model``.

        Parameters:
            load_models (bool): When true, load ``./pretrained/binary_gru.keras``;
                otherwise build and compile a fresh Sequential model.
        """
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            try:
                tf.config.set_visible_devices(gpus[0], 'GPU')
                logical_gpus = tf.config.list_logical_devices('GPU')
                print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
            except RuntimeError as e:
                # Visible devices must be set before GPUs have been initialized
                print(e)

        if load_models:
            # NOTE(review): this path differs from the one save_model() writes to
            # ('./src/gru/pretrained/binary_gru.keras') — confirm which working
            # directory each is meant to be run from.
            self.model = tf.keras.models.load_model('./pretrained/binary_gru.keras')
            # summary() prints by itself and returns None, so it is not wrapped in print().
            self.model.summary()
        else:
            model = Sequential()
            model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
            model.add(SpatialDropout1D(self.params['dropout']))
            last_layer = self.params['layers'] - 1
            for i in range(self.params['layers']):
                # Every GRU except the last returns the full sequence so the next GRU can consume it.
                model.add(GRU(self.params['units'], return_sequences=i != last_layer, recurrent_dropout=self.params['dropout']))
                model.add(LayerNormalization())
            model.add(Dense(20, activation='sigmoid'))
            optimizer = tf.keras.optimizers.Adam(learning_rate=self.params['lr'])
            # NOTE(review): CategoricalAccuracy compares argmax positions; with sigmoid
            # outputs and binary_crossentropy a BinaryAccuracy metric may be what is
            # intended — confirm before relying on the logged accuracy.
            model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[tf.keras.metrics.CategoricalAccuracy()])
            model.summary()
            self.model = model

    def fit(self, X, y):
        """Train the model on ``X``/``y`` and record the elapsed time in ``self.train_time``.

        Uses 10% of the data for validation and stops early when ``val_loss``
        has not improved by at least 0.0001 for 3 epochs.
        """
        st = time.time()
        print("Fitting model...")

        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

        self.train_time = time.time() - st
        print("Done fitting model")
        print(f"Train time: {self.train_time}")

    def predict(self, X):
        """Run the model on ``X``; store and return its raw output.

        The result is kept un-rounded in ``self.preds`` and the elapsed time is
        stored in ``self.predict_time``.
        """
        st = time.time()
        print("Predicting...")

        self.preds = self.model.predict(X)

        self.predict_time = time.time() - st
        print(f"Predict time: {self.predict_time}")
        return self.preds

    @staticmethod
    def _binarize(scores, threshold=0.5):
        """Return an int array with 1 where ``scores >= threshold`` and 0 elsewhere."""
        return (np.asarray(scores) >= threshold).astype(int)

    def write_metrics(self, y_test, threshold=0.5):
        """Write classification metrics for the stored predictions to a timestamped file.

        The stored scores are converted to 0/1 indicators first, because the
        scikit-learn classification metrics used here do not accept continuous
        predictions against 0/1 indicator targets.

        Parameters:
            y_test: True 0/1 label indicators matching the stored predictions.
            threshold (float): Scores greater than or equal to this value count
                as a positive prediction. Defaults to 0.5.

        Raises:
            ValueError: If ``predict`` has not been called yet.
        """
        if self.preds is None:
            raise ValueError('No predictions available: call predict() before write_metrics().')

        y_pred = self._binarize(self.preds, threshold)

        # Compute everything before opening the file so a metric failure does
        # not leave a half-written report behind.
        accuracy = accuracy_score(y_test, y_pred)
        hamming_score = 1 - hamming_loss(y_test, y_pred)
        jaccard = jaccard_score(y_test, y_pred, average="micro")
        report = classification_report(y_test, y_pred, zero_division=True)

        file_name = f'binary_gru_{datetime.now().strftime("%Y%m%d%H%M")}.txt'

        file_path = f'./src/gru/metrics/{file_name}'

        with open(file_path, 'w') as f:
            f.write(f'Predict time: {self.predict_time}\n')
            f.write(f'Accuracy: {accuracy}\n')
            f.write(f'Hamming Score: {hamming_score}\n')
            f.write(f'Jaccard Score: {jaccard}\n')
            f.write('Classification Report:\n')
            f.write(f'{report}\n')

    def save_model(self):
        """Save the Keras model to ``./src/gru/pretrained/binary_gru.keras``."""
        self.model.save('./src/gru/pretrained/binary_gru.keras')

class RankGRURunner:
    """Convenience wrapper that loads the GRU data splits and drives a RankGRU model."""

    def __init__(self, load_models=False):
        """Store the ``load_models`` flag; the four data-split attributes start as ``None``."""
        self.load_models = load_models
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None

    def load_data(self):
        """Fill the split attributes from the module-level ``load_data(gru=True)``."""
        splits = load_data(gru=True)
        self.X_train, self.X_test, self.y_train, self.y_test = splits

    def init_model(self):
        """Create a RankGRU and keep it in ``self.model``."""
        self.model = RankGRU(load_models=self.load_models)

    def _prepare(self):
        """Load the data, then create the model (shared by training and inference)."""
        self.load_data()
        self.init_model()

    def run_training(self):
        """Load data, build the model, fit on the training split and save the model."""
        self._prepare()

        estimator = self.model
        estimator.fit(self.X_train, self.y_train)
        estimator.save_model()

    def run_inference(self):
        """Load data, build the model, predict on the test split and write metrics."""
        self._prepare()

        estimator = self.model
        estimator.predict(self.X_test)
        estimator.write_metrics(self.y_test)


    

In [96]:


# Build a fresh (untrained) model rather than loading a saved one.
gru = RankGRU(load_models=False)
# Load the GRU feature/label arrays, already split into train and test parts.
X_train, X_test, y_train, y_test = load_data(gru=True)
# Keep only the first 1000 training rows; the test split is left untouched.
X_train = X_train[:1000]
y_train = y_train[:1000]


4 Physical GPUs, 1 Logical GPU


Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 250, 100)          6000000   
                                                                 
 spatial_dropout1d_15 (Spat  (None, 250, 100)          0         
 ialDropout1D)                                                   
                                                                 
 gru_30 (GRU)                (None, 250, 128)          88320     
                                                                 
 layer_normalization_30 (La  (None, 250, 128)          256       
 yerNormalization)                                               
                                                                 
 gru_31 (GRU)                (None, 128)               99072     
                                                                 
 layer_normalization_31 (La  (None, 128)             

In [97]:
# Train on the reduced training subset; the elapsed time is printed and stored in gru.train_time.
gru.fit(X_train, y_train)

Fitting model...
Epoch 1/5


2023-11-18 22:41:25.724132: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-18 22:41:26.264682: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55746fb12f90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-18 22:41:26.264746: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 Ti, Compute Capability 7.5
2023-11-18 22:41:26.270625: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-18 22:41:26.387809: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Done fitting model
Train time: 112.97887563705444


In [101]:
# Score the training rows themselves; these raw (un-rounded) scores feed get_thresholds below.
preds = gru.predict(X_train)

Predicting...
Predict time: 5.801921367645264


In [102]:
def get_thresholds(y_prob, y):
    """Pick, for every sample, the score that best separates its true labels.

    Each of a sample's own scores is tried as a cut-off. Labels scoring
    strictly above the cut-off count as predicted, the rest as not predicted.
    The cut-off with the fewest disagreements with the true labels is kept
    (the first such score on ties).

    Parameters:
        y_prob: Iterable of 1-D NumPy score arrays, one per sample.
        y: Iterable of 1-D NumPy 0/1 label arrays aligned with ``y_prob``.

    Returns:
        list: One selected score per sample.
    """
    def _best_cut(scores, labels):
        # Row i of each mask evaluates scores[i] as the candidate cut-off.
        candidates = scores[:, None]
        false_pos = (scores > candidates) & (labels == 0)
        false_neg = (scores <= candidates) & (labels == 1)
        mistakes = np.sum(false_pos | false_neg, axis=1)
        return scores[np.argmin(mistakes)]

    return [_best_cut(scores, labels) for scores, labels in zip(y_prob, y)]


In [103]:
# One target cut-off per training sample, chosen from that sample's own scores using its true labels.
thresholds = get_thresholds(preds, y_train)

In [104]:
import xgboost as xgb

In [105]:
# Regressor used to predict a per-sample cut-off from the model's score vector.
# NOTE(review): n_jobs=39 is hard-coded — presumably tuned to the original machine; confirm.
reg = xgb.XGBRegressor(verbosity=2, tree_method="hist", n_jobs=39)

In [107]:
# Learn the mapping from score vectors to cut-offs; the trailing ';' suppresses the cell's repr output.
reg.fit(preds, thresholds);

In [109]:
# Model scores for the held-out split (this rebinds `preds`, which previously held the training scores).
preds = gru.predict(X_test)
# NOTE(review): these cut-offs are derived from y_test, but nothing below in this cell reads them,
# and the assignment overwrites the training-set `thresholds` — re-running the reg.fit cell after
# this one would fit the regressor on test data.
thresholds = get_thresholds(preds, y_test)
# Per-sample cut-off predicted by the regressor from the score vector.
y_thresh = reg.predict(preds)
# A label is predicted when its score reaches the sample's cut-off.
# NOTE(review): this comparison is inclusive (>=) while get_thresholds scores candidates with a strict `>`.
y_pred = (preds >= y_thresh[:, None]).astype(int)

Predicting...
Predict time: 1.2277288436889648


In [110]:
# Display the thresholded 0/1 predictions for the test split.
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [1]:
import numpy as np

def transform_labels(y):
    """
    Spread a unit of target mass evenly over each row's positive labels.

    Every entry equal to 1 is replaced by ``1 / k``, where ``k`` is the number
    of entries equal to 1 in that row; every other entry becomes 0. A row with
    no entries equal to 1 stays all zeros.

    Parameters:
    y (numpy.ndarray or array-like): Input with shape (num_samples, num_classes).
        The input is not modified.

    Returns:
    numpy.ndarray: Float array with the same shape as ``y``.
    """
    labels = np.asarray(y)

    # Only exact 1s count as positive labels.
    positive = labels == 1

    # Positives per row, kept as a column so it broadcasts across the row.
    counts = positive.sum(axis=1, keepdims=True)

    # Clamp to 1 so rows without positives divide 0 by 1 instead of by 0.
    return positive.astype(float) / np.maximum(counts, 1)

# Example usage: two samples over 20 classes, with 7 and 8 positive labels respectively,
# so the positives become 1/7 and 1/8 in the printed result.
y = np.array([[1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1],
              [0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0]])

transformed_y = transform_labels(y)
print(transformed_y)


[[0.14285714 0.         0.14285714 0.         0.         0.14285714
  0.         0.14285714 0.         0.         0.         0.
  0.14285714 0.         0.         0.14285714 0.         0.
  0.         0.14285714]
 [0.         0.125      0.         0.         0.125      0.
  0.125      0.         0.         0.125      0.         0.125
  0.         0.125      0.         0.         0.125      0.
  0.125      0.        ]]
