# Benchmark No Fine-tuning

In [39]:
import pandas as pd
import ast
import numpy as np
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score, classification_report
import tensorflow as tf

# Display the entire sorted DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

# Load data
merged_test_df = pd.read_csv('/Users/cyrusaghaee/DS 207/Final Project/merged_test_df.csv')

# Function to process the merged dataframe
def process_merged_df(merged_df):
    # Convert string representations of lists back to actual lists
    merged_df['FTMNT_MAKE'] = merged_df['FTMNT_MAKE'].apply(ast.literal_eval)

    # Group the data by description and aggregate the compatible makes into lists
    grouped = merged_df.groupby('processed_x_string')['FTMNT_MAKE'].apply(list).reset_index()
    grouped['FTMNT_MAKE'] = grouped['FTMNT_MAKE'].apply(lambda x: list(set(sum(x, []))))

    # Define X and Y
    X = grouped['processed_x_string']
    Y = grouped['FTMNT_MAKE'].apply(lambda x: ', '.join(x))
    
    return X, Y

# Process the test dataset
X_test, Y_test = process_merged_df(merged_test_df)

# Initialize the tokenizer and model with pre-trained T5
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = TFT5ForConditionalGeneration.from_pretrained('t5-base')

# Tokenize the test inputs and labels
def tokenize_data(data, labels, tokenizer, max_length=512):
    input_encodings = tokenizer(
        list(data),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    
    label_encodings = tokenizer(
        list(labels),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    
    return input_encodings, label_encodings

test_encodings, test_labels = tokenize_data(X_test, Y_test, tokenizer)

batch_size = 16

test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': test_labels['input_ids']}
)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Function to evaluate the pre-trained model on the test set
def evaluate_pretrained_model(model, tokenizer, dataset):
    all_preds = []
    all_labels = []

    with tf.device('/CPU:0'):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Generate predictions
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]

            all_preds.extend(preds)
            all_labels.extend(labels)

    # Convert to binary format for multi-label evaluation
    unique_labels = list(set([label for sublist in all_labels for label in sublist]))
    label_to_index = {label: i for i, label in enumerate(unique_labels)}
    
    y_true_bin = np.zeros((len(all_labels), len(unique_labels)))
    y_pred_bin = np.zeros((len(all_preds), len(unique_labels)))
    
    for i, labels in enumerate(all_labels):
        for label in labels:
            if label in label_to_index:
                y_true_bin[i, label_to_index[label]] = 1
    
    for i, preds in enumerate(all_preds):
        for pred in preds:
            if pred in label_to_index:
                y_pred_bin[i, label_to_index[pred]] = 1

    precision = precision_score(y_true_bin, y_pred_bin, average='weighted', zero_division=np.nan)
    recall = recall_score(y_true_bin, y_pred_bin, average='weighted', zero_division=np.nan)
    accuracy = accuracy_score(y_true_bin, y_pred_bin)
    fbeta = fbeta_score(y_true_bin, y_pred_bin, beta=0.2, average='weighted', zero_division=np.nan)
    
    # Generate classification report
    report = classification_report(y_true_bin, y_pred_bin, target_names=unique_labels, zero_division=np.nan)
    
    # Add accuracy to the report
    report_with_accuracy = f"Accuracy: {accuracy:.4f}\n\n{report}"

    return precision, recall, accuracy, fbeta, report_with_accuracy

# Evaluate the pre-trained model on the test set
precision, recall, accuracy, fbeta, report = evaluate_pretrained_model(model, tokenizer, test_dataset)
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test Accuracy: {accuracy}")
print(f"Test F-beta Score: {fbeta}")
print("Classification Report with Accuracy:\n", report)


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Test Precision: 1.0
Test Recall: 0.001177856301531213
Test Accuracy: 0.002
Test F-beta Score: 0.022565247039861135
Classification Report with Accuracy:
 Accuracy: 0.0020

                         precision    recall  f1-score   support

                renault        nan      0.00      0.00         1
                    geo        nan      0.00      0.00         3
             mitsubishi        nan      0.00      0.00        12
                 datsun        nan      0.00      0.00         1
                  acura        nan      0.00      0.00        14
                 desoto        nan      0.00      0.00         1
                   saab        nan      0.00      0.00         8
                    bmw        nan      0.00      0.00        27
                  lexus        nan      0.00      0.00        12
                 hummer        nan      0.00      0.00         5
                  tesla        nan      0.00      0.00         2
               chrysler        nan      0.00    

# Build Model w/ Early Stopping

In [60]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, TFT5ForConditionalGeneration, AdamWeightDecay
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.metrics import precision_score, accuracy_score, fbeta_score
import tensorflow as tf  # Import TensorFlow


# Display the entire sorted DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

# Load data
merged_train_df = pd.read_csv('/Users/cyrusaghaee/DS 207/Final Project/merged_train_df.csv')
merged_val_df = pd.read_csv('/Users/cyrusaghaee/DS 207/Final Project/merged_val_df.csv')
merged_test_df = pd.read_csv('/Users/cyrusaghaee/DS 207/Final Project/merged_test_df.csv')

# Function to process the merged dataframe
def process_merged_df(merged_df):
    # Convert string representations of lists back to actual lists
    merged_df['FTMNT_MAKE'] = merged_df['FTMNT_MAKE'].apply(ast.literal_eval)

    # Group the data by description and aggregate the compatible makes into lists
    grouped = merged_df.groupby('processed_x_string')['FTMNT_MAKE'].apply(list).reset_index()
    grouped['FTMNT_MAKE'] = grouped['FTMNT_MAKE'].apply(lambda x: list(set(sum(x, []))))

    # Define X and Y
    X = grouped['processed_x_string']
    Y = grouped['FTMNT_MAKE'].apply(lambda x: ', '.join(x))
    
    return X, Y

# Process each dataset
X_train, Y_train = process_merged_df(merged_train_df)
X_val, Y_val = process_merged_df(merged_val_df)
X_test, Y_test = process_merged_df(merged_test_df)

# Initialize the tokenizer and model with T5
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = TFT5ForConditionalGeneration.from_pretrained('t5-base')

# Tokenize the inputs and labels
def tokenize_data(data, labels, tokenizer, max_length=512):
    input_encodings = tokenizer(
        list(data),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    
    label_encodings = tokenizer(
        list(labels),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    
    return input_encodings, label_encodings

train_encodings, train_labels = tokenize_data(X_train, Y_train, tokenizer)
val_encodings, val_labels = tokenize_data(X_val, Y_val, tokenizer)
test_encodings, test_labels = tokenize_data(X_test, Y_test, tokenizer)

batch_size = 16

train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': train_labels['input_ids']}
)).shuffle(1000).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': val_encodings['input_ids'], 'attention_mask': val_encodings['attention_mask'], 'labels': val_labels['input_ids']}
)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': test_labels['input_ids']}
)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Compile the model
optimizer = AdamWeightDecay(learning_rate=3e-5)

model.compile(
    optimizer=optimizer,
    loss=SparseCategoricalCrossentropy(from_logits=True)
)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


# Train the Model

In [59]:
# Custom training loop with early stopping
epochs = 100
patience = 3
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    
    # Training step
    train_loss = 0
    for batch in train_dataset:
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        with tf.GradientTape() as tape:
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels, training=True)
            loss = outputs.loss
            train_loss += loss

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss /= len(train_dataset)
    print(f'Training Loss: {train_loss}')
    
    # Validation step
    val_loss = 0
    for batch in val_dataset:
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels, training=False)
        loss = outputs.loss
        val_loss += loss
    
    val_loss /= len(val_dataset)
    print(f'Validation Loss: {val_loss}')
    
    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        model.save_weights('./fine_tuned_t5v5.weights.h5')
        model.save_pretrained('./fine_tuned_t5v5')
        tokenizer.save_pretrained('./fine_tuned_t5v5')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Load the best model weights
model.load_weights('./fine_tuned_t5v5.weights.h5')



Epoch 1/100
Training Loss: [0.6445512]
Validation Loss: [0.10926604]




Epoch 2/100
Training Loss: [0.09909403]
Validation Loss: [0.04561129]




Epoch 3/100
Training Loss: [0.05306148]
Validation Loss: [0.03937169]




Epoch 4/100
Training Loss: [0.0447378]
Validation Loss: [0.03555105]




Epoch 5/100
Training Loss: [0.0405241]
Validation Loss: [0.03252235]




Epoch 6/100
Training Loss: [0.03655076]
Validation Loss: [0.03012721]




Epoch 7/100
Training Loss: [0.03470323]
Validation Loss: [0.02848039]




Epoch 8/100
Training Loss: [0.03230544]
Validation Loss: [0.02708086]




Epoch 9/100
Training Loss: [0.03100815]
Validation Loss: [0.02611739]




Epoch 10/100
Training Loss: [0.0293803]
Validation Loss: [0.02616521]
Epoch 11/100
Training Loss: [0.02803011]
Validation Loss: [0.02411519]




Epoch 12/100
Training Loss: [0.02692658]
Validation Loss: [0.02385205]




Epoch 13/100
Training Loss: [0.02606193]
Validation Loss: [0.02290814]




Epoch 14/100
Training Loss: [0.02502922]
Validation Loss: [0.02368581]
Epoch 15/100
Training Loss: [0.02425716]
Validation Loss: [0.02264706]




Epoch 16/100
Training Loss: [0.02375524]
Validation Loss: [0.02174065]




Epoch 17/100
Training Loss: [0.02246848]
Validation Loss: [0.02135939]




Epoch 18/100
Training Loss: [0.02159163]
Validation Loss: [0.02068828]




Epoch 19/100
Training Loss: [0.02060034]
Validation Loss: [0.02082878]
Epoch 20/100
Training Loss: [0.0205283]
Validation Loss: [0.01963025]




Epoch 21/100
Training Loss: [0.01983201]
Validation Loss: [0.01928295]




Epoch 22/100
Training Loss: [0.01955421]
Validation Loss: [0.019761]
Epoch 23/100
Training Loss: [0.0190254]


2024-08-05 21:11:00.530878: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Validation Loss: [0.01922299]
Epoch 24/100
Training Loss: [0.01829443]
Validation Loss: [0.01837045]




Epoch 25/100
Training Loss: [0.01745993]
Validation Loss: [0.01776347]




Epoch 26/100
Training Loss: [0.01679077]
Validation Loss: [0.0171078]




Epoch 27/100
Training Loss: [0.0163037]
Validation Loss: [0.01735291]
Epoch 28/100
Training Loss: [0.01571654]
Validation Loss: [0.01729555]
Epoch 29/100
Training Loss: [0.01495163]
Validation Loss: [0.01589193]




Epoch 30/100
Training Loss: [0.01477762]
Validation Loss: [0.01640935]
Epoch 31/100
Training Loss: [0.01440358]
Validation Loss: [0.01596942]
Epoch 32/100
Training Loss: [0.01408947]
Validation Loss: [0.01540047]




Epoch 33/100
Training Loss: [0.01311158]
Validation Loss: [0.01522288]




Epoch 34/100
Training Loss: [0.01297749]
Validation Loss: [0.01532779]
Epoch 35/100
Training Loss: [0.01263092]
Validation Loss: [0.014961]




Epoch 36/100
Training Loss: [0.01214198]
Validation Loss: [0.0143023]




Epoch 37/100
Training Loss: [0.01126035]
Validation Loss: [0.01347507]




Epoch 38/100
Training Loss: [0.01129726]
Validation Loss: [0.01401551]
Epoch 39/100
Training Loss: [0.01016763]
Validation Loss: [0.01352121]
Epoch 40/100
Training Loss: [0.00986454]
Validation Loss: [0.01378261]
Early stopping triggered.




# Load Saved Model

In [61]:
# Paths to the saved models and tokenizer
weights3_path = './fine_tuned_t5v3.weights.h5'
model3_path = './fine_tuned_t5v3'

# Load the tokenizer
tokenizer3 = T5Tokenizer.from_pretrained(model3_path)

# Load the model architecture and weights
model3 = TFT5ForConditionalGeneration.from_pretrained(model3_path)
model3.load_weights(weights3_path)



# Paths to the saved models and tokenizer
weights2_path = './fine_tuned_t5v2.weights.h5'
model2_path = './fine_tuned_t5v2'

# Load the tokenizer
tokenizer2 = T5Tokenizer.from_pretrained(model2_path)

# Load the model architecture and weights
model2 = TFT5ForConditionalGeneration.from_pretrained(model2_path)
model2.load_weights(weights2_path)



# Paths to the saved models and tokenizer
weights4_path = './fine_tuned_t5v4.weights.h5'
model4_path = './fine_tuned_t5v4'

# Load the tokenizer
tokenizer4 = T5Tokenizer.from_pretrained(model4_path)

# Load the model architecture and weights
model4 = TFT5ForConditionalGeneration.from_pretrained(model4_path)
model4.load_weights(weights4_path)



# Paths to the saved models and tokenizer
weights5_path = './fine_tuned_t5v5.weights.h5'
model5_path = './fine_tuned_t5v5'

# Load the tokenizer
tokenizer5 = T5Tokenizer.from_pretrained(model5_path)

# Load the model architecture and weights
model5 = TFT5ForConditionalGeneration.from_pretrained(model5_path)
model5.load_weights(weights5_path)



All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ./fine_tuned_t5v3.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ./fine_tuned_t5v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ./fine_tuned_t5v4.
If your task is similar to the task the model of the checkpoi

('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json')

# Evaluation + Classification Report

In [46]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score, classification_report


# Function to evaluate the model
def evaluate_model_with_report(model, tokenizer, dataset):
    total_loss = 0
    all_preds = []
    all_labels = []

    with tf.device('/CPU:0'):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=batch_labels, training=False)
            loss = outputs.loss

            total_loss += loss.numpy()

            # Decode predictions and true labels
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]

            all_preds.extend(preds)
            all_labels.extend(labels)

    avg_loss = total_loss / len(dataset)

    # Convert to binary format for multi-label evaluation
    unique_labels = list(set([label for sublist in all_labels for label in sublist]))
    label_to_index = {label: i for i, label in enumerate(unique_labels)}
    
    y_true_bin = np.zeros((len(all_labels), len(unique_labels)))
    y_pred_bin = np.zeros((len(all_preds), len(unique_labels)))
    
    for i, labels in enumerate(all_labels):
        for label in labels:
            if label in label_to_index:
                y_true_bin[i, label_to_index[label]] = 1
    
    for i, preds in enumerate(all_preds):
        for pred in preds:
            if pred in label_to_index:
                y_pred_bin[i, label_to_index[pred]] = 1

    precision = precision_score(y_true_bin, y_pred_bin, average='weighted', zero_division=np.nan)
    recall = recall_score(y_true_bin, y_pred_bin, average='weighted', zero_division=np.nan)
    accuracy = accuracy_score(y_true_bin, y_pred_bin)
    fbeta = fbeta_score(y_true_bin, y_pred_bin, beta=0.2, average='weighted', zero_division=np.nan)
    
    # Generate classification report
    report = classification_report(y_true_bin, y_pred_bin, target_names=unique_labels, zero_division=np.nan)
    
    # Add accuracy to the report
    report_with_accuracy = f"Accuracy: {accuracy:.4f}\n\n{report}"

    return avg_loss, precision, recall, accuracy, fbeta, report_with_accuracy




In [16]:
# Evaluate on the validation dataset
val_loss2, val_precision2, val_recall2, val_accuracy2, val_fbeta2, val_report2 = evaluate_model_with_report(model2, tokenizer2, val_dataset)
print(f"Validation Loss: {val_loss2}, Precision: {val_precision2}, Recall: {val_recall2}, Accuracy: {val_accuracy2}, Fbeta: {val_fbeta2}")
print("Classification Report:\n", val_report2)



Validation Loss: [0.05998851], Precision: 0.8377230771805018, Recall: 0.627039627039627, Accuracy: 0.7276341948310139, Fbeta: 0.8038365351276481
Classification Report:
                          precision    recall  f1-score   support

                renault       0.50      1.00      0.67         1
                    geo        nan      0.00      0.00         3
             mitsubishi       1.00      0.42      0.59        12
                 datsun       1.00      1.00      1.00         2
                  acura       0.80      0.57      0.67        14
                 desoto        nan      0.00      0.00         1
                   saab       0.83      0.62      0.71         8
                    bmw       1.00      0.89      0.94        27
                  lexus       1.00      0.62      0.76        13
                 hummer       1.00      0.40      0.57         5
                  tesla       1.00      1.00      1.00         1
               chrysler       0.62      0.62      

2024-08-04 12:48:40.899909: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [17]:
# Evaluate on the validation dataset
val_loss3, val_precision3, val_recall3, val_accuracy3, val_fbeta3, val_report3 = evaluate_model_with_report(model3, tokenizer3, val_dataset)
print(f"Validation Loss: {val_loss3}, Precision: {val_precision3}, Recall: {val_recall3}, Accuracy: {val_accuracy3}, Fbeta: {val_fbeta3}")
print("Classification Report:\n", val_report3)



Validation Loss: [0.04869421], Precision: 0.7905955033822475, Recall: 0.6165501165501166, Accuracy: 0.6918489065606361, Fbeta: 0.7500269388457076
Classification Report:
                          precision    recall  f1-score   support

                renault       1.00      1.00      1.00         1
                    geo        nan      0.00      0.00         3
             mitsubishi       1.00      0.42      0.59        12
                 datsun       1.00      0.50      0.67         2
                  acura       1.00      0.50      0.67        14
                 desoto        nan      0.00      0.00         1
                   saab       1.00      0.38      0.55         8
                    bmw       1.00      0.93      0.96        27
                  lexus       1.00      0.46      0.63        13
                 hummer        nan      0.00      0.00         5
                  tesla       1.00      1.00      1.00         1
               chrysler       0.88      0.62     

In [47]:
# Evaluate on the validation dataset
val_loss4, val_precision4, val_recall4, val_accuracy4, val_fbeta4, val_report4 = evaluate_model_with_report(model4, tokenizer4, val_dataset)
print(f"Validation Loss: {val_loss4}, Precision: {val_precision4}, Recall: {val_recall4}, Accuracy: {val_accuracy4}, Fbeta: {val_fbeta4}")
print("Classification Report:\n", val_report4)




Validation Loss: [0.01674004], Precision: 0.8116222228327404, Recall: 0.6596736596736597, Accuracy: 0.7475149105367793, Fbeta: 0.7801629621126409
Classification Report:
 Accuracy: 0.7475

                         precision    recall  f1-score   support

                renault       1.00      1.00      1.00         1
                    geo        nan      0.00      0.00         3
             mitsubishi       1.00      0.50      0.67        12
                 datsun       1.00      0.50      0.67         2
                  acura       0.64      0.64      0.64        14
                 desoto        nan      0.00      0.00         1
                   saab       1.00      0.38      0.55         8
                    bmw       1.00      0.89      0.94        27
                  lexus       1.00      0.46      0.63        13
                 hummer       0.00      0.00      0.00         5
                  tesla       1.00      1.00      1.00         1
               chrysler       0

In [62]:
# Evaluate on the validation dataset
val_loss5, val_precision5, val_recall5, val_accuracy5, val_fbeta5, val_report5 = evaluate_model_with_report(model5, tokenizer5, val_dataset)
print(f"Validation Loss: {val_loss5}, Precision: {val_precision5}, Recall: {val_recall5}, Accuracy: {val_accuracy5}, Fbeta: {val_fbeta5}")
print("Classification Report:\n", val_report5)



Validation Loss: [0.01347506], Precision: 0.8476508115962548, Recall: 0.6853146853146853, Accuracy: 0.7713717693836978, Fbeta: 0.8144640016475208
Classification Report:
 Accuracy: 0.7714

                         precision    recall  f1-score   support

                renault       0.50      1.00      0.67         1
                    geo        nan      0.00      0.00         3
             mitsubishi       1.00      0.50      0.67        12
                 datsun       1.00      0.50      0.67         2
                  acura       0.91      0.71      0.80        14
                 desoto        nan      0.00      0.00         1
                   saab       1.00      0.38      0.55         8
                    bmw       1.00      0.89      0.94        27
                  lexus       1.00      0.46      0.63        13
                 hummer        nan      0.00      0.00         5
                  tesla       1.00      1.00      1.00         1
               chrysler       0

# Evaluate Test Set

In [52]:
# Evaluate the model on the test set
avg_loss, precision, recall, accuracy, fbeta, report = evaluate_model_with_report(model2, tokenizer2, test_dataset)
print(f"Test Loss: {avg_loss}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F-beta Score: {fbeta}")
print("Classification Report with Accuracy:\n", report)



Test Loss: [0.06417125]
Precision: 0.7945594479708957
Recall: 0.5948174322732627
Accuracy: 0.666
F-beta Score: 0.7628654683468066
Classification Report with Accuracy:
 Accuracy: 0.6660

                         precision    recall  f1-score   support

                renault        nan      0.00      0.00         1
                    geo       1.00      0.33      0.50         3
             mitsubishi       0.67      0.50      0.57        12
                 datsun       0.00      0.00      0.00         1
                  acura       0.88      0.50      0.64        14
                 desoto        nan      0.00      0.00         1
                   saab       0.60      0.38      0.46         8
                    bmw       1.00      0.89      0.94        27
                  lexus       1.00      0.58      0.74        12
                 hummer       0.75      0.60      0.67         5
                  tesla       1.00      1.00      1.00         2
               chrysler       0.5

In [51]:
# Evaluate the model on the test set
avg_loss, precision, recall, accuracy, fbeta, report = evaluate_model_with_report(model3, tokenizer3, test_dataset)
print(f"Test Loss: {avg_loss}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F-beta Score: {fbeta}")
print("Classification Report with Accuracy:\n", report)



Test Loss: [0.05161441]
Precision: 0.773308569203846
Recall: 0.6018845700824499
Accuracy: 0.654
F-beta Score: 0.7361419631351723
Classification Report with Accuracy:
 Accuracy: 0.6540

                         precision    recall  f1-score   support

                renault        nan      0.00      0.00         1
                    geo       1.00      0.33      0.50         3
             mitsubishi       0.88      0.58      0.70        12
                 datsun        nan      0.00      0.00         1
                  acura       1.00      0.50      0.67        14
                 desoto        nan      0.00      0.00         1
                   saab       1.00      0.25      0.40         8
                    bmw       1.00      0.89      0.94        27
                  lexus       0.80      0.67      0.73        12
                 hummer       1.00      0.20      0.33         5
                  tesla       1.00      1.00      1.00         2
               chrysler       0.57

In [50]:
# Evaluate the model on the test set
avg_loss, precision, recall, accuracy, fbeta, report = evaluate_model_with_report(model4, tokenizer4, test_dataset)
print(f"Test Loss: {avg_loss}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F-beta Score: {fbeta}")
print("Classification Report with Accuracy:\n", report)



Test Loss: [0.0177298]
Precision: 0.7562733613202286
Recall: 0.6348645465253239
Accuracy: 0.678
F-beta Score: 0.7222478438589839
Classification Report with Accuracy:
 Accuracy: 0.6780

                         precision    recall  f1-score   support

                renault        nan      0.00      0.00         1
                    geo        nan      0.00      0.00         3
             mitsubishi       1.00      0.67      0.80        12
                 datsun        nan      0.00      0.00         1
                  acura       0.52      0.79      0.63        14
                 desoto        nan      0.00      0.00         1
                   saab       1.00      0.25      0.40         8
                    bmw       1.00      0.89      0.94        27
                  lexus       0.89      0.67      0.76        12
                 hummer       1.00      0.40      0.57         5
                  tesla       1.00      1.00      1.00         2
               chrysler       0.63

In [63]:
# Evaluate the model on the test set
avg_loss, precision, recall, accuracy, fbeta, report = evaluate_model_with_report(model5, tokenizer5, test_dataset)
print(f"Test Loss: {avg_loss}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F-beta Score: {fbeta}")
print("Classification Report with Accuracy:\n", report)



Test Loss: [0.01306892]
Precision: 0.8053516509328672
Recall: 0.6725559481743227
Accuracy: 0.708
F-beta Score: 0.7807955459845115
Classification Report with Accuracy:
 Accuracy: 0.7080

                         precision    recall  f1-score   support

                renault        nan      0.00      0.00         1
                    geo       1.00      0.33      0.50         3
             mitsubishi       0.89      0.67      0.76        12
                 datsun       1.00      1.00      1.00         1
                  acura       0.60      0.64      0.62        14
                 desoto        nan      0.00      0.00         1
                   saab       1.00      0.38      0.55         8
                    bmw       1.00      0.89      0.94        27
                  lexus       0.90      0.75      0.82        12
                 hummer       1.00      0.40      0.57         5
                  tesla       1.00      1.00      1.00         2
               chrysler       0.5

## Evaluate Model Performance

In [8]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score

def evaluate_metrics(model, tokenizer, dataset):
    try:
        return _evaluate_metrics_on_device(model, tokenizer, dataset, device='/GPU:0')
    except:
        print("Error encountered on GPU, falling back to CPU.")
        return _evaluate_metrics_on_device(model, tokenizer, dataset, device='/CPU:0')

def _evaluate_metrics_on_device(model, tokenizer, dataset, device):
    total_loss = 0
    all_preds = []
    all_labels = []

    with tf.device(device):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=batch_labels, training=False)
            loss = outputs.loss

            total_loss += loss.numpy()

            # Decode predictions and true labels
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]

            all_preds.extend(preds)
            all_labels.extend(labels)

    avg_loss = total_loss / len(dataset)

    # Convert to binary format for multi-label evaluation
    unique_labels = list(set([label for sublist in all_labels for label in sublist]))
    label_to_index = {label: i for i, label in enumerate(unique_labels)}
    
    y_true_bin = np.zeros((len(all_labels), len(unique_labels)))
    y_pred_bin = np.zeros((len(all_preds), len(unique_labels)))
    
    for i, labels in enumerate(all_labels):
        for label in labels:
            if label in label_to_index:
                y_true_bin[i, label_to_index[label]] = 1
    
    for i, preds in enumerate(all_preds):
        for pred in preds:
            if pred in label_to_index:
                y_pred_bin[i, label_to_index[pred]] = 1

    precision = precision_score(y_true_bin, y_pred_bin, average='weighted', zero_division=np.nan)
    recall = recall_score(y_true_bin, y_pred_bin, average='weighted', zero_division=np.nan)
    accuracy = accuracy_score(y_true_bin, y_pred_bin)
    fbeta = fbeta_score(y_true_bin, y_pred_bin, beta=0.2, average='weighted', zero_division=np.nan)
    
    return avg_loss, precision, recall, accuracy, fbeta


# Create Test Predictions

In [153]:
# Function to generate predictions on the test set
def generate_predictions(model, tokenizer, dataset):
    all_descriptions = []
    all_labels = []
    all_predictions = []

    with tf.device('/CPU:0'):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            # Generate predictions
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in pred_ids.numpy()]
            labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch_labels.numpy()]
            descriptions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in inputs.numpy()]

            all_descriptions.extend(descriptions)
            all_labels.extend(labels)
            all_predictions.extend(preds)

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Description': all_descriptions,
        'Labels': all_labels,
        'Predictions': all_predictions
    })

    return df

# Generate predictions on the test set
preds_df = generate_predictions(model2, tokenizer2, test_dataset)

print(preds_df.head())



  Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           \
0                                                                                       brand: ["caili's company"], make: mazda, model: 3, year: ['2010'], fitment: ['direct replacement'], type: ['seal'], manufacturer part number: ['g560-25-744'], part title: ['original equip data g560-25-744'], part brands: ['original equipment data'], subtype: ['drive axle shaft oil slinger'], part types: ['axle shaft'], category: 4.0, item_title: g560-25-744 for 2010 mazda 3 bl transaxle case half shaft 

In [148]:
# Adjust pandas display settings for better readability
pd.set_option('display.max_colwidth', 500)  # Adjust the value to control the width of each column
pd.set_option('display.colheader_justify', 'left')  # Justify column headers to the left
predictions_df = preds_df.copy()
print(predictions_df.shape)
predictions_df.head(50)

(3, 4)


Unnamed: 0,Description,Labels,Predictions,has_desc
0,This is a sample cleaned_desc: item1,"[label1, label2]","[label1, label3]",1
1,Another description without the keyword,[label3],"[label2, label3]",0
2,This one has cleaned_desc: item2,"[label2, label3, label4]","[label4, label2]",1


In [37]:
predictions_df[['Predictions']].to_csv('pretrained_y_preds.csv', index=False)

## Classification Report

In [None]:
import pandas as pd
from sklearn.metrics import classification_report

def create_classification_report_dataframe(model, tokenizer, dataset):
    try:
        return _create_classification_report_dataframe_on_device(model, tokenizer, dataset, device='/GPU:0')
    except:
        print("Error encountered on GPU, falling back to CPU.")
        return _create_classification_report_dataframe_on_device(model, tokenizer, dataset, device='/CPU:0')

def _create_classification_report_dataframe_on_device(model, tokenizer, dataset, device):
    all_preds = []
    all_labels = []

    with tf.device(device):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=batch_labels, training=False)

            # Decode predictions and true labels
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]

            all_preds.extend(preds)
            all_labels.extend(labels)

    # Convert to binary format for multi-label evaluation
    unique_labels = list(set([label for sublist in all_labels for label in sublist]))
    label_to_index = {label: i for i, label in enumerate(unique_labels)}
    
    y_true_bin = np.zeros((len(all_labels), len(unique_labels)))
    y_pred_bin = np.zeros((len(all_preds), len(unique_labels)))
    
    for i, labels in enumerate(all_labels):
        for label in labels:
            if label in label_to_index:
                y_true_bin[i, label_to_index[label]] = 1
    
    for i, preds in enumerate(all_preds):
        for pred in preds:
            if pred in label_to_index:
                y_pred_bin[i, label_to_index[pred]] = 1

    report = classification_report(y_true_bin, y_pred_bin, target_names=unique_labels, zero_division=0, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    return report_df


In [None]:
class_report_val = create_classification_report_dataframe(model, tokenizer, val_dataset)

## Predictions vs Actuals 

In [29]:
import pandas as pd

def create_evaluation_dataframe(model, tokenizer, dataset):
    try:
        return _create_evaluation_dataframe_on_device(model, tokenizer, dataset, device='/GPU:0')
    except:
        print("Error encountered on GPU, falling back to CPU.")
        return _create_evaluation_dataframe_on_device(model, tokenizer, dataset, device='/CPU:0')

def _create_evaluation_dataframe_on_device(model, tokenizer, dataset, device):
    all_descriptions = []
    all_labels = []
    all_predictions = []

    with tf.device(device):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=batch_labels, training=False)

            # Decode input descriptions, predictions, and true labels
            descriptions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in inputs.numpy()]
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in pred_ids.numpy()]
            labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch_labels.numpy()]

            all_descriptions.extend(descriptions)
            all_labels.extend(labels)
            all_predictions.extend(preds)

    # Create a DataFrame
    df = pd.DataFrame({
        'Description': all_descriptions,
        'Labels': all_labels,
        'Predictions': all_predictions
    })

    return df





In [78]:
# Use the function to create the DataFrame
evaluation_df = create_evaluation_dataframe(model, tokenizer, val_dataset)

# Display the DataFrame
print(evaluation_df.head())

NameError: name 'create_evaluation_dataframe' is not defined

# Analyze Results

In [83]:
# Display the entire sorted DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

pred_test_df = pd.read_csv('/Users/cyrusaghaee/DS 207/Final Project/pred_test_df.csv')

pred_test_df.shape

(500, 2)

In [87]:
predictions_df.head()

Unnamed: 0,Description,Labels,Predictions
0,"brand: [""caili's company""], make: mazda, model: 3, year: ['2010'], fitment: ['direct replacement'], type: ['seal'], manufacturer part number: ['g560-25-744'], part title: ['original equip data g560-25-744'], part brands: ['original equipment data'], subtype: ['drive axle shaft oil slinger'], part types: ['axle shaft'], category: 4.0, item_title: g560-25-744 for 2010 mazda 3 bl transaxle case half shaft oil seal",mazda,mazda
1,"brand: ['2006 mini mini cooper stock number: 00017948'], make: mini, model: mini cooper, year: ['2006', '2006'], fitment: ['direct replacement'], manufacturer part number: ['quality,oem,recycled,part,1-year,3-year_ &,lifetime,warranties', 'true', 'alternator'], sku: ['00017948'], category: 0.0, item_title: 2006 mini cooper alternator 96k 957948, cleaned_desc: powered by frooition about us shipping returns contact us payment faq shop categories all categories helpful links add to favorite sellers view feedback contact seller visit seller's ebay shop about seller page 2006 mini cooper alternator 96k 957948 item description 30 day guarantee to ensure you are satisfied with your purchase. please make sure you purchase the correct part the first time, the part must match the year, make and model described below. it must also match the engine size, gear ratio or trim level described in fitment below. make 100% sure it fits before purchase. if in doubt, please email before purchasing. ebay compatibility is not always 100% accurate make sure to confirm with our data below. this alternator fits the following vehicles: 2005-2008 mini cooper 110 amp our inventory notes on exact part: quality,oem,recycled,part,1-year,3-year_ &,lifetime,warranties please make sure part will fit your vehicle before buying! questions ebay message us! this is a proven used alternator. it is being pulled from the vehicle pictured a 2006 mini mini cooper with 96,000 miles. this alternator has been: tested and verified to be in good working order. removed with and will include the pulley. properly boxed for safe shipment to you. no core charge! shipping shipping payment contact us / help shipping shipping • this item will be shipped throughout the continental us via fedex ground. • we do not ship to ak, pr, or hi. • international shipping is not available at this time. • we do not ship to po or apo boxes. your order must have a physical address. orders with only po boxes will not ship until a physical address is provided. • tracking information is sent at time of shipment in most cases",mini,mini
2,"brand: ['4 seasons', 'fan motor', 'four seasons'], model: four, year: ['1995', '1996', '1997', '1998', '1999', '2000', '2001'], type: ['motor only', 'oe mount', '4 pole'], manufacturer part number: ['35161'], part title: ['exhaust pipe flange gasket walker 35161', 'exhaust header-base, gas, natural 35161 fits 77-78 pontiac firebird 6.6l-v8', 'exhaust tail pipe tip magnaflow ny 35161', 'carquest/fel-pro 35161 gaskets and sealing systems - engine water pump gasket', 'engine cooling fan motor uni-select 35161 fits 95-01 jeep cherokee 4.0l-l6', 'auto plus/four seasons 35161 - a/c condenser fan motor', 'imco 35161', 'drive axle shaft assembly-axle kit usa standard differential 35161', 'engine cooling fan motor cooling depot 35161', 'exhaust tail pipe tip magnaflow ca 35161', 'engine cooling fan motor parts master 35161 fits 95-01 jeep cherokee 4.0l-l6', 'advance 35161 engine cooling fan motor', 'power steering pump seal kit carquest 35161','murray 35161 engine cooling fan motor', 'engine cooling fan motor carquest 35161', 'exhaust tail pipe tip magnaflow 49 state 35161', 'usa standard gear 35161 service kits - cv axle shaft kit', ""engine cooling fan motor o'reilly 35161 fits 95-01 jeep cherokee"", 'engine cooling fan motor factory air 35161 fits 95-01 jeep cherokee 4.0l-l6', 'coil spring suplex 35161',",jeep,jeep
3,"brand: ['4 seasons', 'four seasons'], make: hyundai kia, model: sonata optima, year: ['2013', '2011', '2012', '2014', '2013', '2011', '2012', '2014', '2011', '2012', '2013', '2014'], oem part number: ['bp37m5'], product name: ['auxiliary fan assembly'], manufacturer part number: ['radiator fan assembly. -- usa built', 'bp37m5'], sku: ['400:bp37m5'], category: 1.0, item_title: auxiliary fan assembly for 11-14 hyundai kia sonata optima 24l 4 cyl gas bp37m5","kia, hyundai",hyundai
4,"brand: ['4 seasons', 'four seasons'], type: ['a/c clutch control relay harness connector'], oem part number: ['89051437'], manufacturer part number: ['37220'], sku: ['fs:37220'], part title: ['power steering pressure line hose assembly gates 37220', 'icp/cpi 37220', 'disc brake rotor set-vented sport cross-drilled (pair) brembo 37220', 'auto extra 37220', 'advance 37220 shock absorber', 'a/c clutch control relay harness connector cooling depot 37220', 'auto plus/monroe/expert series 37220 - suspension shock absorber, front', 'federated 37220', 'tail light cover-taillight cover lund 37220', 'a/c clutch control relay harness connector parts master 37220', 'a/c clutch control relay harness connector factory air 37220', ""o'reilly 37220 shock absorber"", 'auto plus/four seasons 37220 - a/c compressor clutch connector', 'trumark 37220', 'power steering pressure line hose assembly-pressure line assembly carquest 37220', 'hvac blower relay harness connector-engine cooling fan motor relay connector', 'hvac harness connector murray 37220','suspension shock absorber-oespectrum light truck shock absorber canadian tire', 'carquest/monroe 37220 shocks and struts - suspension shock absorber, front', 'power steering pressure line hose assembly-pressure line assembly advance 37220', 'carquest power steering 37220 - power steering pressure hose', 'pb/monroe 37220 shocks and struts - suspension shock absorber, front','suspension shock absorber-oespectrum light truck shock absorber advance 37220', 'visteon 37220', 'auto plus/monroe 372","chevrolet, buick, pontiac, cadillac, chrysler, oldsmobile","dodge, chrysler"


In [158]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score, classification_report

# Function to evaluate the model and return incorrect predictions
def evaluate_model_and_find_incorrect_chevrolet(model, tokenizer, dataset):
    all_descriptions = []
    all_labels = []
    all_predictions = []
    incorrect_chevrolet_cases = []

    with tf.device('/CPU:0'):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=batch_labels, training=False)
            loss = outputs.loss

            # Decode input descriptions, predictions, and true labels
            descriptions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in inputs.numpy()]
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]

            all_descriptions.extend(descriptions)
            all_labels.extend(labels)
            all_predictions.extend(preds)

            # Identify incorrect predictions for 'chevrolet'
            for desc, label, pred in zip(descriptions, labels, preds):
                if ('chevrolet' in label and 'chevrolet' not in pred) or ('chevrolet' in pred and 'chevrolet' not in label):
                    incorrect_chevrolet_cases.append((desc, label, pred))

    # Create a DataFrame with all predictions
    all_results_df = pd.DataFrame({
        'Description': all_descriptions,
        'Labels': all_labels,
        'Predictions': all_predictions
    })

    # Create a DataFrame for incorrect 'chevrolet' predictions
    incorrect_chevrolet_df = pd.DataFrame(incorrect_chevrolet_cases, columns=['Description', 'Labels', 'Predictions'])

    return all_results_df, incorrect_chevrolet_df

# Assuming test_dataset is defined similarly to train_dataset and val_dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': test_labels['input_ids']}
)).batch(8)

# Evaluate the model and find incorrect predictions
all_results_df, incorrect_chevrolet_df = evaluate_model_and_find_incorrect_chevrolet(model2, tokenizer2, test_dataset)

# Display the DataFrame with incorrect predictions for 'chevrolet'
print(incorrect_chevrolet_df.head())




  Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           \
0  brand: ['4 seasons', 'four seasons'], type: ['a/c clutch control relay harness connector'], oem part number: ['89051437'], manufacturer part number: ['37220'], sku: ['fs:37220'], part title: ['power steering pressure line hose assembly gates 37220', 'icp/cpi 37220', 'disc brake rotor set-vented sport cross-drilled (pair) brembo 37220', 'auto extra 37220', 'advance 37220 shock absorber', 'a/c clutch control relay harness connector cooling depot 37220', 'auto plus/monroe/expert series 372

In [169]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score, classification_report

# Function to evaluate the model and return predictions
def evaluate_model_with_predictions(model, tokenizer, dataset):
    all_descriptions = []
    all_labels = []
    all_predictions = []

    with tf.device('/CPU:0'):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            # Generate predictions
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]
            descriptions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in inputs.numpy()]

            all_descriptions.extend(descriptions)
            all_labels.extend(labels)
            all_predictions.extend(preds)

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Description': all_descriptions,
        'Labels': all_labels,
        'Predictions': all_predictions
    })

    return df

# Function to calculate metrics and create DataFrames
def calculate_metrics_and_create_dataframes(df):
    # Add the has_desc column
    df['has_desc'] = df['Description'].apply(lambda x: 1 if 'cleaned_desc:' in x else 0)

    # Separate DataFrames based on has_desc
    df_has_desc_1 = df[df['has_desc'] == 1]
    df_has_desc_0 = df[df['has_desc'] == 0]

    # Define a function to calculate individual label metrics
    def calculate_label_metrics(df):
        all_labels = list(set([label for labels in df['Labels'] for label in labels]))
        label_metrics = []

        for label in all_labels:
            y_true = [1 if label in labels else 0 for labels in df['Labels']]
            y_pred = [1 if label in preds else 0 for preds in df['Predictions']]

            precision = precision_score(y_true, y_pred, zero_division=np.nan)
            recall = recall_score(y_true, y_pred, zero_division=np.nan)
            accuracy = accuracy_score(y_true, y_pred)

            label_metrics.append({
                'Label': label,
                'Precision': precision,
                'Recall': recall,
                'Accuracy': accuracy
            })

        return pd.DataFrame(label_metrics)

    # Calculate individual label metrics for both DataFrames
    label_metrics_df_1 = calculate_label_metrics(df_has_desc_1)
    label_metrics_df_0 = calculate_label_metrics(df_has_desc_0)

    # Calculate overall weighted metrics
    y_true_1 = np.array([[1 if label in labels else 0 for label in label_metrics_df_1['Label']] for labels in df_has_desc_1['Labels']])
    y_pred_1 = np.array([[1 if label in preds else 0 for label in label_metrics_df_1['Label']] for preds in df_has_desc_1['Predictions']])
    
    y_true_0 = np.array([[1 if label in labels else 0 for label in label_metrics_df_0['Label']] for labels in df_has_desc_0['Labels']])
    y_pred_0 = np.array([[1 if label in preds else 0 for label in label_metrics_df_0['Label']] for preds in df_has_desc_0['Predictions']])
    
    overall_metrics_1 = {
        'Precision': precision_score(y_true_1, y_pred_1, average='weighted', zero_division=np.nan),
        'Recall': recall_score(y_true_1, y_pred_1, average='weighted', zero_division=np.nan),
        'Accuracy': accuracy_score(y_true_1, y_pred_1)
    }

    overall_metrics_0 = {
        'Precision': precision_score(y_true_0, y_pred_0, average='weighted', zero_division=np.nan),
        'Recall': recall_score(y_true_0, y_pred_0, average='weighted', zero_division=np.nan),
        'Accuracy': accuracy_score(y_true_0, y_pred_0)
    }

    return label_metrics_df_1, label_metrics_df_0, overall_metrics_1, overall_metrics_0

# Assuming test_dataset is defined similarly to train_dataset and val_dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': test_labels['input_ids']}
)).batch(16)

# Generate predictions on the test set
predictions_df = evaluate_model_with_predictions(model5, tokenizer5, test_dataset)

# Calculate metrics and create DataFrames
label_metrics_df_1, label_metrics_df_0, overall_metrics_1, overall_metrics_0 = calculate_metrics_and_create_dataframes(predictions_df)



In [170]:
# Display the results


print("\nOverall metrics for has_desc = 1:")
print(overall_metrics_1)


print("\nOverall metrics for has_desc = 0:")
print(overall_metrics_0)



Overall metrics for has_desc = 1:
{'Precision': 0.9530465078593421, 'Recall': 0.8267716535433071, 'Accuracy': 0.7816091954022989}

Overall metrics for has_desc = 0:
{'Precision': 0.7652799476382725, 'Recall': 0.554016620498615, 'Accuracy': 0.6416464891041163}


In [173]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score, classification_report

# Function to evaluate the model and return predictions
def evaluate_model_with_predictions(model, tokenizer, dataset):
    all_descriptions = []
    all_labels = []
    all_predictions = []

    with tf.device('/CPU:0'):
        for batch in dataset:
            inputs = batch['input_ids']
            attention_mask = batch['attention_mask']
            batch_labels = batch['labels']

            # Ensure decoder_input_ids are created correctly
            decoder_input_ids = model._shift_right(batch_labels)
            
            # Generate predictions
            pred_ids = model.generate(input_ids=inputs, attention_mask=attention_mask)
            preds = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in pred_ids.numpy()]
            labels = [set(tokenizer.decode(ids, skip_special_tokens=True).split(', ')) for ids in batch_labels.numpy()]
            descriptions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in inputs.numpy()]

            all_descriptions.extend(descriptions)
            all_labels.extend(labels)
            all_predictions.extend(preds)

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Description': all_descriptions,
        'Labels': all_labels,
        'Predictions': all_predictions
    })

    return df

# Function to calculate metrics and create DataFrames
def calculate_metrics_and_create_dataframes(df):
    # Add the has_desc column
    df['has_desc'] = df['Description'].apply(lambda x: 1 if 'make:' in x else 0)

    # Separate DataFrames based on has_desc
    df_has_desc_1 = df[df['has_desc'] == 1]
    df_has_desc_0 = df[df['has_desc'] == 0]

    # Define a function to calculate individual label metrics
    def calculate_label_metrics(df):
        all_labels = list(set([label for labels in df['Labels'] for label in labels]))
        label_metrics = []

        for label in all_labels:
            y_true = [1 if label in labels else 0 for labels in df['Labels']]
            y_pred = [1 if label in preds else 0 for preds in df['Predictions']]

            precision = precision_score(y_true, y_pred, zero_division=np.nan)
            recall = recall_score(y_true, y_pred, zero_division=np.nan)
            accuracy = accuracy_score(y_true, y_pred)

            label_metrics.append({
                'Label': label,
                'Precision': precision,
                'Recall': recall,
                'Accuracy': accuracy
            })

        return pd.DataFrame(label_metrics)

    # Calculate individual label metrics for both DataFrames
    label_metrics_df_1 = calculate_label_metrics(df_has_desc_1)
    label_metrics_df_0 = calculate_label_metrics(df_has_desc_0)

    # Calculate overall weighted metrics
    y_true_1 = np.array([[1 if label in labels else 0 for label in label_metrics_df_1['Label']] for labels in df_has_desc_1['Labels']])
    y_pred_1 = np.array([[1 if label in preds else 0 for label in label_metrics_df_1['Label']] for preds in df_has_desc_1['Predictions']])
    
    y_true_0 = np.array([[1 if label in labels else 0 for label in label_metrics_df_0['Label']] for labels in df_has_desc_0['Labels']])
    y_pred_0 = np.array([[1 if label in preds else 0 for label in label_metrics_df_0['Label']] for preds in df_has_desc_0['Predictions']])
    
    overall_metrics_1 = {
        'Precision': precision_score(y_true_1, y_pred_1, average='weighted', zero_division=np.nan),
        'Recall': recall_score(y_true_1, y_pred_1, average='weighted', zero_division=np.nan),
        'Accuracy': accuracy_score(y_true_1, y_pred_1)
    }

    overall_metrics_0 = {
        'Precision': precision_score(y_true_0, y_pred_0, average='weighted', zero_division=np.nan),
        'Recall': recall_score(y_true_0, y_pred_0, average='weighted', zero_division=np.nan),
        'Accuracy': accuracy_score(y_true_0, y_pred_0)
    }

    return label_metrics_df_1, label_metrics_df_0, overall_metrics_1, overall_metrics_0

# Assuming test_dataset is defined similarly to train_dataset and val_dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': test_labels['input_ids']}
)).batch(16)

# Generate predictions on the test set
predictions_df = evaluate_model_with_predictions(model5, tokenizer5, test_dataset)

# Calculate metrics and create DataFrames
label_metrics_df_1, label_metrics_df_0, overall_metrics_1, overall_metrics_0 = calculate_metrics_and_create_dataframes(predictions_df)



In [179]:
predictions_df.head(500)

Unnamed: 0,Description,Labels,Predictions,has_desc
0,"brand: [""caili's company""], make: mazda, model: 3, year: ['2010'], fitment: ['direct replacement'], type: ['seal'], manufacturer part number: ['g560-25-744'], part title: ['original equip data g560-25-744'], part brands: ['original equipment data'], subtype: ['drive axle shaft oil slinger'], part types: ['axle shaft'], category: 4.0, item_title: g560-25-744 for 2010 mazda 3 bl transaxle case half shaft oil seal",{mazda},{mazda},1
1,"brand: ['2006 mini mini cooper stock number: 00017948'], make: mini, model: mini cooper, year: ['2006', '2006'], fitment: ['direct replacement'], manufacturer part number: ['quality,oem,recycled,part,1-year,3-year_ &,lifetime,warranties', 'true', 'alternator'], sku: ['00017948'], category: 0.0, item_title: 2006 mini cooper alternator 96k 957948, cleaned_desc: powered by frooition about us shipping returns contact us payment faq shop categories all categories helpful links add to favorite sel...",{mini},{mini},1
2,"brand: ['4 seasons', 'fan motor', 'four seasons'], model: four, year: ['1995', '1996', '1997', '1998', '1999', '2000', '2001'], type: ['motor only', 'oe mount', '4 pole'], manufacturer part number: ['35161'], part title: ['exhaust pipe flange gasket walker 35161', 'exhaust header-base, gas, natural 35161 fits 77-78 pontiac firebird 6.6l-v8', 'exhaust tail pipe tip magnaflow ny 35161', 'carquest/fel-pro 35161 gaskets and sealing systems - engine water pump gasket', 'engine cooling fan motor u...",{jeep},{jeep},0
3,"brand: ['4 seasons', 'four seasons'], make: hyundai kia, model: sonata optima, year: ['2013', '2011', '2012', '2014', '2013', '2011', '2012', '2014', '2011', '2012', '2013', '2014'], oem part number: ['bp37m5'], product name: ['auxiliary fan assembly'], manufacturer part number: ['radiator fan assembly. -- usa built', 'bp37m5'], sku: ['400:bp37m5'], category: 1.0, item_title: auxiliary fan assembly for 11-14 hyundai kia sonata optima 24l 4 cyl gas bp37m5","{kia, hyundai}","{kia, hyundai}",1
4,"brand: ['4 seasons', 'four seasons'], type: ['a/c clutch control relay harness connector'], oem part number: ['89051437'], manufacturer part number: ['37220'], sku: ['fs:37220'], part title: ['power steering pressure line hose assembly gates 37220', 'icp/cpi 37220', 'disc brake rotor set-vented sport cross-drilled (pair) brembo 37220', 'auto extra 37220', 'advance 37220 shock absorber', 'a/c clutch control relay harness connector cooling depot 37220', 'auto plus/monroe/expert series 37220 - ...","{chevrolet, buick, pontiac, cadillac, chrysler, oldsmobile}","{chevrolet, pontiac, cadillac,, buick}",0
5,"brand: ['4 seasons', 'four seasons'], type: ['engine coolant outlet flange'], oem part number: ['06a121132'], manufacturer part number: ['85336'], sku: ['fs:85336'], part title: [""radiator coolant hose flange-coolant flange o'reilly 85336"", 'radiator coolant hose flange-coolant flange factory air 85336', 'trumark 85336', 'engine coolant outlet flange-coolant flange carquest 85336', 'engine coolant outlet flange-coolant flange advance 85336', 'engine coolant outlet flange-coolant flange 4 sea...",{volkswagen},"{chevrolet, pontiac, cadillac,, buick}",0
6,"brand: ['4 seasons', 'four seasons'], type: ['engine coolant thermostat/water outlet assembly'], oem part number: ['4893926ag'], manufacturer part number: ['86269'], sku: ['fs:86269'], part title: ['engine coolant thermostat / water outlet assembly 4 seasons 86269', 'engine coolant thermostat / water outlet assembly uni-select 86269','spark plug wire set-vin: s, 2bbl mighty 86269', 'auto extra 86269','spark plug wire set-vin: h prenco 86269', ""engine coolant thermostat / water outlet assembl...","{jeep, ram, dodge}","{chevrolet, pontiac, cadillac,, buick}",0
7,"brand: ['ac delco genuine gm'], make: gmc v3500, model: v3500, year: ['1989', '1990', '1991', '1989', '1990', '1991', '1989', '1990', '1991'], fitment: ['genuine gm'], type: ['distributor cap'], oem part number: ['65gvns'], sku: ['400:65gvns'], category: 3.0, item_title: ac delco genuine gm distributor cap fits gmc v3500 1989-1991 65gvns",{gmc},{gmc},1
8,"brand: ['ac delco gold (professional)'], make: gmc r2500 suburban, model: r2500 suburban, year: ['1989', '1990', '1991', '1989', '1990', '1991', '1989', '1990', '1991'], fitment: ['gold (professional)'], type: ['radiator hose'], oem part number: ['28ydth'], sku: ['400:28ydth'], category: 1.0, item_title: upper ac delco radiator hose fits gmc r2500 suburban 1989-1991 74l v8 28ydth",{gmc},{gmc},1
9,"brand: ['ac delco professional'], make: toyota 4runner, model: 4runner, year: ['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2003', '2004', '2005', '2006', '2007', '2008', '2009'], fitment: ['professional -- new; interference engine application'], type: ['timing belt kit'], oem part number: ['42bbnc'], sku: ['400:42bbnc'], category: 2.0, item_title: ac delco timing belt kit fits toyota 4runner 2003-2009 47l v8 gas 42bbnc",{toyota},{toyota},1


In [175]:
print("\nOverall metrics for has_desc = 1:")
print(overall_metrics_1)


print("\nOverall metrics for has_desc = 0:")
print(overall_metrics_0)


Overall metrics for has_desc = 1:
{'Precision': 0.9414331736482472, 'Recall': 0.810126582278481, 'Accuracy': 0.8206521739130435}

Overall metrics for has_desc = 0:
{'Precision': 0.5172607619678126, 'Recall': 0.4155405405405405, 'Accuracy': 0.3939393939393939}
