In [1]:
# Install neccessary libraries
! pip install tensorflow numpy pandas transformers scikit-learn



In [2]:
# library imports
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.metrics import recall_score, precision_score
import pickle
import numpy as np
import pandas as pd

2024-05-31 12:50:27.475566: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-31 12:50:27.475943: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-31 12:50:27.480109: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-31 12:50:27.512226: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the dataset from the required file path
df = pd.read_csv('/home/saiganesh/Desktop/classification_dataset.csv')

# Define a function to update the category labels
def update(cat):
    """
    Update category labels.
    
    Args:
    - cat (str): Category label to be updated.
    
    Returns:
    - int: Updated category label (0 for "Biology", 1 for "Finance", unchanged otherwise).
    """
    if cat == "Biology":
        return 0
    elif cat == "Finance":
        return 1
    elif cat == "Java":
        return 2
    return cat

# Apply the update function to the 'Course' column in the DataFrame
df.loc[:, "Course"] = df["Course"].apply(update)

# df.head()

# Assign the 'Question' column to variable x and the updated 'Course' column to variable y
x = df['Question']
y = df['Course'].astype(np.int64)  # Update function is returning int, but TensorFlow v2 dataset accepts only int64


In [5]:
# Define the model name and maximum sequence length
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 20

# x is a Pandas DataFrame containing Questions, converting it to a list
review = x.to_list()

# Initializing a DistilBERT tokenizer with the specified model name
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Tokenizing the Questions using the tokenizer, ensuring all inputs have the same length
# max_length specifies the maximum length of each input sequence after tokenization
# truncation=True ensures that sequences longer than max_length are truncated
# padding=True ensures that sequences shorter than max_length are padded with the appropriate token
inputs = tkzr(review, max_length=MAX_LEN, truncation=True, padding=True)

# print(f'review: \'{review}\'')
# print(f'input ids: {inputs["input_ids"]}')
# print(f'attention mask: {inputs["attention_mask"]}')

In [8]:
def construct_encodings(x, tkzr, max_len, trucation=True, padding=True):
    """
    Construct token encodings for the input data using the provided tokenizer.

    Args:
    - x (list): List of input data (e.g., reviews).
    - tkzr (PreTrainedTokenizer): Tokenizer object to tokenize the input data.
    - max_len (int): Maximum length of each tokenized sequence.
    - truncation (bool): Whether to truncate sequences longer than max_len (default=True).
    - padding (bool): Whether to pad sequences shorter than max_len (default=True).

    Returns:
    - encodings (dict): Token encodings generated by the tokenizer.
    """
    # Tokenize the input data using the provided tokenizer and specified parameters
    encodings = tkzr(x, max_length=max_len, truncation=True, padding=True)
    
    return encodings

encodings = construct_encodings(x.to_list(), tkzr, max_len=MAX_LEN)

In [9]:
def construct_tfdataset(encodings, y=None):
    """
    Construct a TensorFlow dataset from token encodings and labels (if provided).

    Args:
    - encodings (dict): Token encodings generated by a tokenizer.
    - y (array-like, optional): Labels associated with the token encodings.

    Returns:
    - tfdataset (tf.data.Dataset): TensorFlow dataset containing token encodings and labels (if provided).
    """
    if y is not None:
        # If labels are provided, create a TensorFlow dataset with token encodings and labels
        return tf.data.Dataset.from_tensor_slices((dict(encodings), y))
    else:
        # If labels are not provided (e.g., for inference/predictions), create a TensorFlow dataset with only token encodings
        return tf.data.Dataset.from_tensor_slices(dict(encodings))

tfdataset = construct_tfdataset(encodings, y)

2024-05-31 12:52:16.780888: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [10]:
# Define the test split ratio and batch size
TEST_SPLIT = 0.2
BATCH_SIZE = 2

# Calculate the size of the training set based on the test split ratio
train_size = int(len(x) * (1 - TEST_SPLIT))

# Shuffle the dataset
tfdataset = tfdataset.shuffle(len(x))

# Split the shuffled dataset into training and testing sets
tfdataset_train = tfdataset.take(train_size)
tfdataset_test = tfdataset.skip(train_size)

# Batch the training and testing datasets
tfdataset_train = tfdataset_train.batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)


In [11]:
# Define the number of epochs
N_EPOCHS = 2

# Initialize the DistilBERT model for sequence classification
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# Define the optimizer and loss function
optimizer = optimizers.Adam(learning_rate=2e-5)
loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model with the optimizer, loss function, and metrics
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Train the model
model.fit(tfdataset_train, batch_size=BATCH_SIZE, epochs=N_EPOCHS)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

ValueError: Could not interpret optimizer identifier: <keras.src.optimizers.adam.Adam object at 0x7097c4e96cc0>

In [None]:
# Evaluate the model on the test dataset
benchmarks = model.evaluate(tfdataset_test, return_dict=True, batch_size=BATCH_SIZE)

# Print the evaluation metrics
print(benchmarks)

In [None]:
# Initialize an empty list to store the true labels from the test dataset
y_test = []

# Iterate through the test dataset to extract true labels
for _, label in tfdataset_test.unbatch():
    y_test.append(label.numpy())

# Convert the list of true labels to a NumPy array
y_test = np.array(y_test)

# Initialize an empty list to store the predicted labels
y_pred = []

# Iterate through batches of the test dataset to predict labels
for batch in tfdataset_test:
    # Extract inputs from the batch (excluding labels)
    inputs, _ = batch
    # Use the model to predict labels for the inputs
    preds = model.predict(inputs).logits
    # Extract the predicted labels and append to the list
    y_pred.extend(np.argmax(preds, axis=-1))

# Convert the list of predicted labels to a NumPy array
y_pred = np.array(y_pred)

# Print true and predicted labels
print("True labels (y_test):", y_test)
print("Predicted labels (y_pred):", y_pred)

In [None]:
# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

print("Recall:", recall)
print("Precision:", precision)

In [None]:
def create_predictor(model, model_name, max_len):
    """
    Create a predictor function for making predictions using a pre-trained model.

    Args:
    - model (TFDistilBertForSequenceClassification): Pre-trained model for sequence classification.
    - model_name (str): Name of the pre-trained model.
    - max_len (int): Maximum length of input sequences.

    Returns:
    - predict_proba (function): Predictor function that takes input text and returns the probability of the positive class i.e. label with 1.
    """
    # Initialize a DistilBERT tokenizer with the specified model name
    tkzr = DistilBertTokenizer.from_pretrained(model_name)
    
    def predict_proba(text):
        """
        Predict the probability of the positive class for the input text.

        Args:
        - text (str): Input text for prediction.

        Returns:
        - float: Probability of the positive class.
        """
        # Tokenize the input text
        x = [text]
        encodings = construct_encodings(x, tkzr, max_len=max_len)
        tfdataset = construct_tfdataset(encodings)
        tfdataset = tfdataset.batch(1)

        # Make predictions using the pre-trained model
        logits = model.predict(tfdataset).logits
        probabilities = tf.nn.softmax(logits, axis=-1).numpy()
        
        # Assuming binary classification, return probability of positive class (class 1)
        # Adjust this according to your specific classification task
        positive_probability = probabilities[:, 1]
        return positive_probability[0]

    return predict_proba

# Create a predictor function using the trained model, tokenizer, and max_len
clf = create_predictor(model, MODEL_NAME, MAX_LEN)

# Test the predictor function with an example input
print(clf('who is the father of biology?'))

In [None]:
# Save the pre-trained model to the specified directory
model.save_pretrained('./model/clf')

# Save the model information (model name and maximum sequence length) to a pickle file
import pickle
with open('./model/info.pkl', 'wb') as f:
    pickle.dump((MODEL_NAME, MAX_LEN), f)

In [None]:
# Load the pre-trained model from the specified directory
new_model = TFDistilBertForSequenceClassification.from_pretrained('./model/clf')

# Load the model information (model name and maximum sequence length) from the pickle file
model_name, max_len = pickle.load(open('./model/info.pkl', 'rb'))

# Create a predictor function using the loaded model and information
clf = create_predictor(new_model, model_name, max_len)

# Test the predictor function with an example input
print(clf('what are financial markets'))