In [2]:
# !pip install datasets joblib
import numpy as np
import pandas as pd
import tensorflow.lite as tflite
from datasets import load_dataset
from transformers import AutoTokenizer
from joblib import Parallel, delayed

  from .autonotebook import tqdm as notebook_tqdm


# code for production

In [23]:
# Locations of the saved tokenizer directory and the converted TFLite model file.
tokenizer_path = r'C:\Users\tarun\Desktop\hugging-face-deployment\Youtube-Video-Comments-Sentiment-Analysis\model\saved_tokenizer'
tflite_model_path = r'C:\Users\tarun\Desktop\hugging-face-deployment\Youtube-Video-Comments-Sentiment-Analysis\model\model_float16.tflite'

# Number of rows grouped into one batch by the evaluation cells further down.
BATCH_SIZE = 16

# Restore the tokenizer from its saved directory.
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Build the TFLite interpreter for the model file and allocate its tensors.
interpreter = tflite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Tensor metadata used by the prediction function to feed inputs and read the output.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Function to prepare inputs and get predictions from TFLite model
def _resolve_input_detail(tensor_key, fallback_position):
    """Return the entry of `input_details` that should receive `tensor_key`.

    The entry whose tensor name contains `tensor_key` wins; if no name matches,
    the entry at `fallback_position` (the position this notebook originally
    hard-coded) is used so existing models keep working.
    """
    for detail in input_details:
        if tensor_key in detail.get('name', ''):
            return detail
    return input_details[fallback_position]


def tflite_predict_batch(text_batch):
    """Predict a class index for every text in `text_batch` with the TFLite model.

    Args:
        text_batch: A list of strings (a single string is also accepted by the
            tokenizer call and yields a one-element result).

    Returns:
        A list with one predicted index per input text, taken as the argmax
        along axis 1 of the interpreter's first output tensor. An empty batch
        returns an empty list without touching the tokenizer or interpreter.
    """
    # Nothing to score: skip tokenization and inference entirely.
    if not isinstance(text_batch, str) and len(text_batch) == 0:
        return []

    # Tokenize batch input text (padded / truncated to 55 tokens per row)
    inputs = fine_tuned_tokenizer(text_batch, return_tensors="np", padding="max_length", truncation=True, max_length=55)

    # Pair every tokenizer output with the interpreter tensor it feeds. Tensors
    # are matched by name so a different input ordering in the model file does
    # not silently swap the feeds, and each array is cast to the dtype the
    # interpreter declares instead of a hard-coded int64.
    feeds = []
    for tensor_key, fallback_position in (("input_ids", 1), ("attention_mask", 0), ("token_type_ids", 2)):
        detail = _resolve_input_detail(tensor_key, fallback_position)
        values = inputs[tensor_key].astype(detail.get('dtype', np.int64))
        feeds.append((detail['index'], values))

    output_index = output_details[0]['index']
    sample_count = feeds[0][1].shape[0]

    # Run inference one row at a time; each feed keeps a leading batch axis of 1.
    results = []
    for i in range(sample_count):
        for tensor_index, values in feeds:
            interpreter.set_tensor(tensor_index, values[i:i + 1])
        interpreter.invoke()

        output = interpreter.get_tensor(output_index)
        results.append(np.argmax(output, axis=1)[0])

    return results

In [24]:
# Example usage
# The only predictor defined in this notebook is batch-oriented, so the single
# comment is wrapped in a list and the one result is unpacked.
text = "i just feel extremely comfortable with the group of people that i dont even need to hide myself"
predicted_sentiment = tflite_predict_batch([text])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: 1


# getting the dataset and accuracy

In [25]:
# Load the two evaluation sources: the 'split' configuration of dair-ai/emotion
# and the default configuration of SetFit/emotion.
dataset1 = load_dataset('dair-ai/emotion', 'split')
dataset2 = load_dataset('SetFit/emotion')

Repo card metadata block was not found. Setting CardData to empty.


In [26]:
# Splits taken from dair-ai/emotion (dataset1).
test_dataset1 = dataset1['test']
validation_dataset = dataset1['validation']

# Split taken from SetFit/emotion (dataset2).
test_dataset2 = dataset2['test']

In [27]:
def _split_to_records(split):
    """Convert a dataset split into a list of per-row dicts by way of pandas."""
    frame = split.to_pandas()
    return frame.to_dict(orient='records')


# One list of row dicts per evaluated split.
test_dict1 = _split_to_records(test_dataset1)
test_dict2 = _split_to_records(test_dataset2)
valid_dict = _split_to_records(validation_dataset)

In [28]:
# Function to process a batch of rows
def process_batch(batch_rows):
    """Score one batch of rows against their true labels.

    Args:
        batch_rows: A sequence of dicts, each with a 'text' and a 'label' key.

    Returns:
        A tuple `(correct_count, batch_length)`: how many predictions equal the
        row's label, and how many rows the batch holds.
    """
    batch_texts = [record['text'] for record in batch_rows]
    expected_labels = [record['label'] for record in batch_rows]

    predicted_labels = tflite_predict_batch(batch_texts)

    # Count the positions where the prediction equals the true label.
    hits = 0
    for predicted, expected in zip(predicted_labels, expected_labels):
        if predicted == expected:
            hits += 1

    return hits, len(batch_rows)

In [29]:
# Use parallel processing with joblib
def parallel_predict(dataset_dict, batch_size, n_jobs):
    """Evaluate the model over a dataset in fixed-size batches via joblib.

    Args:
        dataset_dict: A list of row dicts, as consumed by `process_batch`.
        batch_size: Number of rows per batch; the last batch may be shorter.
        n_jobs: Forwarded to `joblib.Parallel`.

    Returns:
        A tuple `(correct_predictions, total_samples)` summed over all batches.
    """
    # One delayed `process_batch` call per consecutive slice of the dataset.
    batch_starts = range(0, len(dataset_dict), batch_size)
    tasks = (
        delayed(process_batch)(dataset_dict[start:start + batch_size])
        for start in batch_starts
    )

    per_batch = list(Parallel(n_jobs=n_jobs)(tasks))

    # Aggregate the per-batch (correct, total) pairs.
    correct_predictions = sum(correct for correct, _ in per_batch)
    total_samples = sum(total for _, total in per_batch)

    return correct_predictions, total_samples

In [30]:
# Evaluate on the test split of dair-ai/emotion (test_dict1); the message names
# the split actually scored rather than "training".
correct_predictions, total_samples = parallel_predict(test_dict1, BATCH_SIZE, n_jobs=1)
accuracy = correct_predictions / total_samples
print(f"Accuracy of the TFLite model on the dair-ai/emotion test dataset: {accuracy:.4f}")

Accuracy of the TFLite model on the training dataset: 0.9110


In [31]:
# Evaluate on the test split of SetFit/emotion (test_dict2); the message names
# the split actually scored rather than "training".
correct_predictions, total_samples = parallel_predict(test_dict2, BATCH_SIZE, n_jobs=1)
accuracy = correct_predictions / total_samples
print(f"Accuracy of the TFLite model on the SetFit/emotion test dataset: {accuracy:.4f}")

Accuracy of the TFLite model on the training dataset: 0.9110


In [32]:
# Evaluate on the validation split of dair-ai/emotion (valid_dict); the message
# names the split actually scored rather than "training".
correct_predictions, total_samples = parallel_predict(valid_dict, BATCH_SIZE, n_jobs=1)
accuracy = correct_predictions / total_samples
print(f"Accuracy of the TFLite model on the dair-ai/emotion validation dataset: {accuracy:.4f}")

Accuracy of the TFLite model on the training dataset: 0.9230
