## Imports

In [2]:
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import init_empty_weights
from transformers import BitsAndBytesConfig
import torch
import csv
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

## Loading Model Llama-3.1-8B-Instruct and Tokenizer

In [3]:
# Define the model name and cache directory
model_name = "meta-llama/Llama-3.1-8B-Instruct"
cache_dir = "/scratch/gilbreth/anand173/model_cache"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Enable double quantization for memory savings
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
)

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Load the model with 4-bit quantization and device map
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Automatically allocate model layers across GPU/CPU
    cache_dir=cache_dir,
)

# Ensure the pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print("Model and tokenizer loaded successfully!")

Loading tokenizer...
Loading model...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!


Model and tokenizer load times depend on the hardware used. In our case, we have a NVIDIA A30 with 24gb RAM hosted on gilbreth

## Accuracy against gold standard function

In [8]:
def evaluate_predictions(actual_df, prediction_df):
    """
    Evaluate the performance of predicted labels against actual labels.

    Args:
        actual_df (pd.DataFrame): DataFrame containing the actual labels with "ReviewText" and "label".
        prediction_df (pd.DataFrame): DataFrame containing the predicted labels with "ReviewText" and "PredictedLabel".

    Returns:
        dict: A dictionary containing accuracy, precision, recall, and F1 score.
    """
    # Combine the actual and predicted DataFrames on "ReviewText"
    comparison_df = pd.merge(
        actual_df, 
        prediction_df, 
        on="ReviewText", 
        how="inner"
    )

    if comparison_df.empty:
        raise ValueError("No matching ReviewText found between actual and predicted data.")

    # Check for matches between 'label' and 'PredictedLabel'
    comparison_df["Match"] = comparison_df["label"] == comparison_df["PredictedLabel"]

    # Calculate accuracy
    accuracy = comparison_df["Match"].mean()

    # Convert labels to numeric for precision, recall, and F1 score
    y_true = comparison_df["label"]
    y_pred = comparison_df["PredictedLabel"]

    # Calculate evaluation metrics
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    # Print classification report for more insights (optional)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    # Return the evaluation metrics
    return {
        "accuracy": accuracy * 100,
        "precision": precision * 100,
        "recall": recall * 100,
        "f1_score": f1 * 100  # Return as percentage for consistency
    }

---

## Predicting fit

- Correct Size/Just Right converted to Correct Size
- Nulls removed

### Fit - Zero Shot

In [11]:
# Define the file path for the input reviews
input_file = "fit.csv"
output_file = "results/fit_predictions_zero_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction: Classify the following product reviews into one of the categories: "Correct Size," "Wrong Size," or "No Comment." Respond only with the category name.
                    ### Review:
                    {review}
                    ### Response:
                    """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=2,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\n Runtime: {runtime:.2f} seconds \n\n")

Review: You will have to remove the window which is very easy and be very careful.  drop the window down to allow access to both screws holding the glass.  Lift the window up a bit and then drop the front of the windows down into the door slowly and then raise the rear of the glass up and you will start to lift the glass at a angle upwards and out of the door frame.  You will then be able to access the three screws to remove the door handle and you will only have to remove the cable pin from the door once you have access to the back of the handle.The process to remove the door handles is very very easy and you will need a P2 and P3 screw driver and a 10mm socket with extension.
Predicted Label: "Correct
Review:  It does what it is supposed to! Sure it does not come with any instructions, why take a star away for that? Once I got the power connected to it the correct way, the unit kicks on and off solidly. There is some audible  #34;clicking #34; when the relay switches on and off, so d

In [34]:
input_file = "fit.csv"  # Original file with actual labels
predictions_file = "results/fit_predictions_zero_shot_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# # Check unique values in the 'label' column
# unique_labels = df_predictions['PredictedLabel'].unique()
# print(f"Unique labels: {unique_labels}")

# # Count occurrences of each label
# label_counts = df_predictions['PredictedLabel'].value_counts()

# # Display the counts
# print("Counts of each category:")
# print(label_counts)

# Define a mapping for correcting inconsistent labels
label_mapping = {
    '"Correct': 'Correct Size',
    'Correct Size': 'Correct Size',
    '"Wrong': 'Wrong Size',
    'Wrong Size': 'Wrong Size',
    '"No': 'No Comment',
    'No Comment': 'No Comment',
    "I'm": 'Wrong Size',  # Assuming "I'm" falls under Wrong Size
    'Incorrect Size': 'Wrong Size'  # Assuming 'Incorrect Size' is equivalent to 'Wrong Size'
}

# Replace incorrect labels with standardized ones
df_predictions['PredictedLabel'] = df_predictions['PredictedLabel'].replace(label_mapping)

# # Verify the changes by checking unique values and counts
# print("Unique standardized labels:")
# print(df_predictions['PredictedLabel'].unique())

# # Count occurrences of standardized labels
# standardized_label_counts = df_predictions['PredictedLabel'].value_counts()
# print("\nCounts of standardized categories:")
# print(standardized_label_counts)


# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")


Classification Report:
              precision    recall  f1-score   support

Correct Size       0.37      0.90      0.53       659
  No Comment       0.74      0.04      0.07      1375
  Wrong Size       0.31      0.82      0.45       221

    accuracy                           0.37      2255
   macro avg       0.47      0.59      0.35      2255
weighted avg       0.59      0.37      0.24      2255

Accuracy: 36.72%
Precision: 59.00%
Recall: 36.72%
F1 Score: 24.32


Inference:

- Runtime : Approximately 8 minutes
---

### Fit - One shot with 1 example per category

- Added context for the categories in the prompt
- Provided 1 example for each category adding more context for the LLM model

In [41]:
# Define the file path for the input reviews
input_file = "fit.csv"
output_file = "results/fit_predictions_one_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
    Classify the following review into one of the categories: "Correct Size," "Wrong Size," or "No Comment."
    Strictly respond with the category name only.

    ### Categories:
    Correct Size: The product fits as expected and performs its intended function without issues.
    Wrong Size: The product does not fit or requires modifications to work correctly.
    No Comment: The review does not mention size or fitting issues.

    ### Examples:
    Correct Size: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
    Wrong Size: It was not the exact match. I had to rewire the battery in order to make it work. It was a toy for my Lil man. I am glad that I was able to to make it work.  But make sure you can iuse it.
    No Comment: I would recommend this product.  It lasts long and works fine.  Did the job for me.  It was a good price.
    
    ### Review:
    {review}
    ### Response:
    """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=2,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\n Runtime: {runtime:.2f} seconds \n\n")

### Instruction:
    Classify the following review into one of the categories: "Correct Size," "Wrong Size," or "No Comment."
    Strictly respond with the category name only.

    ### Categories:
    Correct Size: The product fits as expected and performs its intended function without issues.
    Wrong Size: The product does not fit or requires modifications to work correctly.
    No Comment: The review does not mention size or fitting issues.

    ### Examples:
    Correct Size: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
    Wrong Size: It was not the exact match. I had to rewire the battery in order to make it work. It was a toy for my Lil man. I am glad that I was able to to make it work.  But make sure you can iuse it.
    No Comment: I would recommend this product.  It lasts long and works fine.  Did the job for me.  It was a good price.
    
    ### Review:
    You will have to remove the

In [43]:
input_file = "fit.csv"  # Original file with actual labels
predictions_file = "results/fit_predictions_one_shot_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")


Classification Report:
              precision    recall  f1-score   support

Correct Size       0.38      0.82      0.52       659
  No Comment       0.86      0.01      0.02      1375
  Wrong Size       0.24      0.90      0.38       221

    accuracy                           0.33      2255
   macro avg       0.49      0.58      0.31      2255
weighted avg       0.66      0.33      0.20      2255

Accuracy: 33.35%
Precision: 65.75%
Recall: 33.35%
F1 Score: 19.99


### Fit - Few shot with 2 examples each

In [46]:
# Define the file path for the input reviews
input_file = "fit.csv"
output_file = "results/fit_predictions_two_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
    Classify the following review into one of the categories: "Correct Size," "Wrong Size," or "No Comment."
    Respond only with the category name.

    ### Categories:
    Correct Size: The product fits as expected and performs its intended function without issues.
    Wrong Size: The product does not fit or requires modifications to work correctly.
    No Comment: The review does not mention size or fitting issues.

    ### Examples:
    Correct Size: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
    Correct Size: I have even used this to start my dodge 2500 which has a heavy duty battery for starting and it worked great. The light pulls out to expose a 12v car adapter socket.
    Wrong Size: It was not the exact match. I had to rewire the battery in order to make it work. It was a toy for my Lil man. I am glad that I was able to to make it work.  But make sure you can iuse it.
    Wrong Size: two different ends on cables. doesn't make sense. had to change the end on one side to fit it to the battery.
    No Comment: I would recommend this product.  It lasts long and works fine.  Did the job for me.  It was a good price.
    No Comment: Got the product and was as advertised. Have not yet had time to install. Does not look cheap and product was not damaged.
    
    ### Review:
    {review}
    ### Response:
    """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=2,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"Runtime: {runtime:.2f} seconds")

Review: You will have to remove the window which is very easy and be very careful.  drop the window down to allow access to both screws holding the glass.  Lift the window up a bit and then drop the front of the windows down into the door slowly and then raise the rear of the glass up and you will start to lift the glass at a angle upwards and out of the door frame.  You will then be able to access the three screws to remove the door handle and you will only have to remove the cable pin from the door once you have access to the back of the handle.The process to remove the door handles is very very easy and you will need a P2 and P3 screw driver and a 10mm socket with extension.
Predicted Label: Wrong Size
Review:  It does what it is supposed to! Sure it does not come with any instructions, why take a star away for that? Once I got the power connected to it the correct way, the unit kicks on and off solidly. There is some audible  #34;clicking #34; when the relay switches on and off, so

In [54]:
input_file = "fit.csv"  # Original file with actual labels
predictions_file = "results/fit_predictions_two_shot_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# # Check unique values in the 'label' column
unique_labels = df_predictions['PredictedLabel'].unique()
print(f"Unique labels: {unique_labels}")

# Count occurrences of each label
label_counts = df_predictions['PredictedLabel'].value_counts()

# # Display the counts
print("Counts of each category:")
print(label_counts)

# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")

Unique labels: ['Wrong Size' 'Correct Size' 'No Comment']
Counts of each category:
PredictedLabel
Correct Size    1179
Wrong Size      1033
No Comment        43
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

Correct Size       0.41      0.74      0.53       659
  No Comment       0.91      0.03      0.06      1375
  Wrong Size       0.20      0.92      0.32       221

    accuracy                           0.32      2255
   macro avg       0.51      0.56      0.30      2255
weighted avg       0.69      0.32      0.22      2255

Accuracy: 32.28%
Precision: 69.28%
Recall: 32.28%
F1 Score: 21.98


### Fit - Few shot with 3 examples

In [50]:
# Define the file path for the input reviews
input_file = "fit.csv"
output_file = "results/fit_predictions_three_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
    Classify the following review into one of the categories: "Correct Size," "Wrong Size," or "No Comment."
    Respond only with the category name.

    ### Categories:
    Correct Size: The product fits as expected and performs its intended function without issues.
    Wrong Size: The product does not fit or requires modifications to work correctly.
    No Comment: The review does not mention size or fitting issues.

    ### Examples:
    Correct Size: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
    Correct Size: I have even used this to start my dodge 2500 which has a heavy duty battery for starting and it worked great. The light pulls out to expose a 12v car adapter socket.
    Correct Size: Perfect fit and the price is right. So far so good, I expect it will last a long long time.
    Wrong Size: It was not the exact match. I had to rewire the battery in order to make it work. It was a toy for my Lil man. I am glad that I was able to to make it work.  But make sure you can iuse it.
    Wrong Size: two different ends on cables. doesn't make sense. had to change the end on one side to fit it to the battery.
    Wrong Size: I ordered this product for my car   while Amazon.com says this will fit my 1996 Nissan Sentra GXE auto transmission, when the part arrived, I knew it wouldn't fit. I won't order parts online again.
    No Comment: I would recommend this product.  It lasts long and works fine.  Did the job for me.  It was a good price.
    No Comment: Got the product and was as advertised. Have not yet had time to install. Does not look cheap and product was not damaged.
    No Comment: This Is a great product and its even stronger than de factory one my Jeep Grand Cherokee came with.... Recommend this product
    
    ### Review:
    {review}
    ### Response:
    """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=2,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\nRuntime: {runtime:.2f} seconds\n\n")

Review: You will have to remove the window which is very easy and be very careful.  drop the window down to allow access to both screws holding the glass.  Lift the window up a bit and then drop the front of the windows down into the door slowly and then raise the rear of the glass up and you will start to lift the glass at a angle upwards and out of the door frame.  You will then be able to access the three screws to remove the door handle and you will only have to remove the cable pin from the door once you have access to the back of the handle.The process to remove the door handles is very very easy and you will need a P2 and P3 screw driver and a 10mm socket with extension.
Predicted Label: Wrong Size
Review:  It does what it is supposed to! Sure it does not come with any instructions, why take a star away for that? Once I got the power connected to it the correct way, the unit kicks on and off solidly. There is some audible  #34;clicking #34; when the relay switches on and off, so

In [56]:
input_file = "fit.csv"  # Original file with actual labels
predictions_file = "results/fit_predictions_three_shot_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# # Check unique values in the 'label' column
unique_labels = df_predictions['PredictedLabel'].unique()
print(f"Unique labels: {unique_labels}")

# Count occurrences of each label
label_counts = df_predictions['PredictedLabel'].value_counts()

# # Display the counts
print("Counts of each category:")
print(label_counts)

# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")

Unique labels: ['Wrong Size' 'Correct Size' 'No Comment']
Counts of each category:
PredictedLabel
Correct Size    1310
Wrong Size       930
No Comment        15
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

Correct Size       0.41      0.81      0.54       659
  No Comment       0.87      0.01      0.02      1375
  Wrong Size       0.22      0.93      0.36       221

    accuracy                           0.33      2255
   macro avg       0.50      0.58      0.31      2255
weighted avg       0.67      0.33      0.21      2255

Accuracy: 33.44%
Precision: 66.95%
Recall: 33.44%
F1 Score: 20.53


## Fit - Few shot with 4 examples

In [5]:
# Define the file path for the input reviews
input_file = "fit.csv"
output_file = "results/fit_predictions_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
            You are an analyst who categorizes reviews based on their fit for automotive parts. Your task is to classify the following review into one of the categories: "Correct Size," "Wrong Size," or "No Comment."
            Respond only with the category name.

            ### Categories:
            Correct Size: The product fits as expected and performs its intended function without issues.
            Wrong Size: The product does not fit or requires modifications to work correctly.
            No Comment: The review does not mention size or fitting issues.

            ### Examples:
            Correct Size: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
            Correct Size: I have even used this to start my dodge 2500 which has a heavy duty battery for starting and it worked great. The light pulls out to expose a 12v car adapter socket.
            Correct Size: Perfect fit and the price is right. So far so good, I expect it will last a long long time.
            Correct Size: the belt  came in time and fit properly did not have any issues when installing it i would buy again
            Wrong Size: It was not the exact match. I had to rewire the battery in order to make it work. It was a toy for my Lil man. I am glad that I was able to to make it work.  But make sure you can iuse it.
            Wrong Size: two different ends on cables. doesn't make sense. had to change the end on one side to fit it to the battery.
            Wrong Size: I ordered this product for my car   while Amazon.com says this will fit my 1996 Nissan Sentra GXE auto transmission, when the part arrived, I knew it wouldn't fit. I won't order parts online again.
            Wrong Size: they could be more specific as to which model its used for.
            No Comment: I would recommend this product.  It lasts long and works fine.  Did the job for me.  It was a good price.
            No Comment: Got the product and was as advertised. Have not yet had time to install. Does not look cheap and product was not damaged.
            No Comment: This Is a great product and its even stronger than de factory one my Jeep Grand Cherokee came with.... Recommend this product
            No Comment: Good Quality Great Price, unit seem to be a bit more sturdy than the Trico or Anco blades I replaced.  Like the price and quality.

            ### Review:
            {review}
            ### Response:
            """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=2,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\nRuntime: {runtime:.2f} seconds\n\n")

Review: You will have to remove the window which is very easy and be very careful.  drop the window down to allow access to both screws holding the glass.  Lift the window up a bit and then drop the front of the windows down into the door slowly and then raise the rear of the glass up and you will start to lift the glass at a angle upwards and out of the door frame.  You will then be able to access the three screws to remove the door handle and you will only have to remove the cable pin from the door once you have access to the back of the handle.The process to remove the door handles is very very easy and you will need a P2 and P3 screw driver and a 10mm socket with extension.
Predicted Label: Wrong Size
Review:  It does what it is supposed to! Sure it does not come with any instructions, why take a star away for that? Once I got the power connected to it the correct way, the unit kicks on and off solidly. There is some audible  #34;clicking #34; when the relay switches on and off, so

Because of unequal category tokenization lengths, model predicts "Correct Size/" in place of "Correct Size/Just Right". This was done using max_new_tokens = 3 in the output parameter setting. Increasing this value made the model pad the response using the first few characteristics of the prompt itself making it difficult for accuracy comparison. It would help to standardize the input/output based on token length for ensuring consistency. To avoid this issue, I have limited the max new tokens and then I employ post processing to convert "Correct Size/" to "Correct Size/Just Right"

Post processing step can be seen below.

In [9]:
input_file = "fit.csv"  # Original file with actual labels
predictions_file = "results/fit_predictions_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# # # Check unique values in the 'label' column
# unique_labels = df_predictions['PredictedLabel'].unique()
# print(f"Unique labels: {unique_labels}")

# # Count occurrences of each label
# label_counts = df_predictions['PredictedLabel'].value_counts()

# # # Display the counts
# print("Counts of each category:")
# print(label_counts)

# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")


Classification Report:
              precision    recall  f1-score   support

Correct Size       0.42      0.73      0.54       659
  No Comment       0.83      0.09      0.17      1375
  Wrong Size       0.22      0.96      0.36       221

    accuracy                           0.36      2255
   macro avg       0.49      0.60      0.35      2255
weighted avg       0.65      0.36      0.29      2255

Accuracy: 36.50%
Precision: 64.88%
Recall: 36.50%
F1 Score: 29.39


## Predicting Price

- A Value/A Deal/Underpriced category was converted to Underpriced
- Nulls were removed

### Price Zero shot

In [18]:
# Define the file path for the input reviews
input_file = "price.csv"
output_file = "results/price_predictions_zero_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
            You are an analyst who categorizes reviews based on their fit for automotive parts. Your task is to classify the following review into one of the categories: "Underpriced" "Overpriced" or "No Comment."
            Respond only with the Underpriced/Overpriced/No Comment.

            ### Review:
            {review}
            ### Response:
            """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=3,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\nRuntime: {runtime:.2f} seconds\n\n")

Review: You will have to remove the window which is very easy and be very careful.  drop the window down to allow access to both screws holding the glass.  Lift the window up a bit and then drop the front of the windows down into the door slowly and then raise the rear of the glass up and you will start to lift the glass at a angle upwards and out of the door frame.  You will then be able to access the three screws to remove the door handle and you will only have to remove the cable pin from the door once you have access to the back of the handle.The process to remove the door handles is very very easy and you will need a P2 and P3 screw driver and a 10mm socket with extension.
Predicted Label: Underpriced
Review:   It does what it is supposed to! Sure it does not come with any instructions, why take a star away for that? Once I got the power connected to it the correct way, the unit kicks on and off solidly. There is some audible  #34;clicking #34; when the relay switches on and off, 

In [20]:
input_file = "price.csv"  # Original file with actual labels
predictions_file = "results/price_predictions_zero_shot_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# # Check unique values in the 'label' column
# unique_labels = df_predictions['PredictedLabel'].unique()
# print(f"Unique labels: {unique_labels}")

# # Count occurrences of each label
# label_counts = df_predictions['PredictedLabel'].value_counts()

# # Display the counts
# print("Counts of each category:")
# print(label_counts)

label_mapping = {
    'Underpriced': 'Underpriced',
    'No Comment Answer': 'No Comment',
    'Overpriced': 'Overpriced',
    'No Comment': 'No Comment',
    'No Comment.': 'No Comment',
    'No Comment Bookmark': 'No Comment',
    "Overpried": 'Overpriced'
}

# Replace incorrect labels with standardized ones
df_predictions['PredictedLabel'] = df_predictions['PredictedLabel'].replace(label_mapping)

# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")

Unique labels: ['Underpriced' 'No Comment Answer' 'Overpriced' 'No Comment' 'No Comment.'
 'No Comment Bookmark' 'Overpried']
Counts of each category:
PredictedLabel
Underpriced            1243
Overpriced              928
No Comment.              31
No Comment               26
No Comment Answer        25
No Comment Bookmark       1
Overpried                 1
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

  No Comment       0.98      0.04      0.08      1883
  Overpriced       0.04      0.87      0.08        46
 Underpriced       0.20      0.78      0.32       326

    accuracy                           0.17      2255
   macro avg       0.41      0.56      0.16      2255
weighted avg       0.85      0.17      0.12      2255

Accuracy: 16.63%
Precision: 84.53%
Recall: 16.63%
F1 Score: 11.73


In [None]:
# Define the file path for the input reviews
input_file = "price.csv"
output_file = "results/price_predictions_one_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
            You are an analyst who categorizes reviews based on their fit for automotive parts. Your task is to classify the following review into one of the categories: "Underpriced" "Overpriced" or "No Comment."
            Respond only with the Underpriced/Overpriced/No Comment.

            ### Examples:
            Underpriced: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
            Overpriced: I spent a few extra dollars to get a brand name rotor rather than one that was made in China.  I wasn't certain where Wagner rotors are manufactured, so was pleased when I saw the  #34;made in USA #34; stamp.  The rotor is machined well and installed without any issues.
            No Comment: Be warned:  it says this cap will fit a 98 Ford Ranger, but it will not!  I'll have to see if I can find a locking cap locally because I can't afford to keep sending these things back.
            
            ### Review:
            {review}
            ### Response:
            """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=3,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\nRuntime: {runtime:.2f} seconds\n\n")

### Price two shot

In [16]:
# Define the file path for the input reviews
input_file = "price.csv"
output_file = "results/price_predictions_two_shot_full_dataset.csv"

start_time = time.time()

# Prepare to write results to a new CSV file
with open(output_file, mode="w", newline="") as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["ReviewText", "PredictedLabel"])  # Write headers

    # Read and process each review from the input CSV file
    with open(input_file, mode="r") as in_csv:
        reader = csv.DictReader(in_csv)
        for row in reader:
            review = row["ReviewText"]

            prompt = f"""### Instruction:
            You are an analyst who categorizes reviews based on their fit for automotive parts. Your task is to classify the following review into one of the categories: "Underpriced" "Overpriced" or "No Comment."
            Respond only with the Underpriced/Overpriced/No Comment.

            ### Examples:
            Underpriced: I put this into a 1999 Mazda Miata MX-5.  It fit perfectly.  It came fully charged.  Delivered it saved me $50.  Great deal.  Love it.
            Underpriced: This seem to satify my needs and it was as cheap as other batteries. So why should any one pay more for the same battery. I would recommentd this battery to others.
            Overpriced: I spent a few extra dollars to get a brand name rotor rather than one that was made in China.  I wasn't certain where Wagner rotors are manufactured, so was pleased when I saw the  #34;made in USA #34; stamp.  The rotor is machined well and installed without any issues.
            Overpriced: Great great product.  I swear by it to clean the windshield and fiberglass on my boat.  Quite frankly there is not product like it.  Pricey but worth it!
            No Comment: Be warned:  it says this cap will fit a 98 Ford Ranger, but it will not!  I'll have to see if I can find a locking cap locally because I can't afford to keep sending these things back.
            No Comment: My low battery alarm started going off in the middle of the night, and then proceeded to do it every night until I replaced it. Very  Annoying!For some reason, I failed to select expedited shipping, but that wasn't a problem because the vendor shipped it the next day. Since I live in the same state, I got it the day after that. This alone warranted the 5 star rating for me as I couldn't get the battery warning to stop. The manual failed me.The battery arrived as an exact match of the original; exactly as pictured. The original battery lasted almost 13 years. Hopefully this will last as long too.After installation, my system did not immediately clear the Low Battery warning. It didn't continue to alarm, but did still show it on the consoles. This worried me at first. Sometime during the night, the system apparently did a battery check and noticed that it was fine.The battery was packaged well for shipping. Box just large enough. Enough, but not too much, bubble wrap with just a single piece of tape to cut to get at the new battery. No packaging wounds from this product/company.

            ### Review:
            {review}
            ### Response:
            """

            # Update the eos_token and synchronize it with the model
            tokenizer.eos_token = "</end>"
            model.config.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

            # Tokenize the input
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=2000
            ).to("cuda")  # Send input tensors to GPU

            # Generate the output
            outputs = model.generate(
                **inputs,
                max_new_tokens=3,  # Limit the response length
                eos_token_id=tokenizer.eos_token_id
            )

            # Decode the response and clean it
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # print(response)

            # Extract the category after "### Response:"
            if "### Response:" in response:
                category = response.split("### Response:")[-1].strip()
            else:
                category = "Invalid Response"  # Fallback if the format is incorrect

            print(f"Review: {review}")
            print(f"Predicted Label: {category}")

            # Write the review and predicted label to the output CSV
            writer.writerow([review, category])

print(f"Predictions for the reviews saved to {output_file}.")

# End timing
end_time = time.time()

# Print runtime
runtime = end_time - start_time
print(f"\n\nRuntime: {runtime:.2f} seconds\n\n")

Review: You will have to remove the window which is very easy and be very careful.  drop the window down to allow access to both screws holding the glass.  Lift the window up a bit and then drop the front of the windows down into the door slowly and then raise the rear of the glass up and you will start to lift the glass at a angle upwards and out of the door frame.  You will then be able to access the three screws to remove the door handle and you will only have to remove the cable pin from the door once you have access to the back of the handle.The process to remove the door handles is very very easy and you will need a P2 and P3 screw driver and a 10mm socket with extension.
Predicted Label: No Comment.
Review:   It does what it is supposed to! Sure it does not come with any instructions, why take a star away for that? Once I got the power connected to it the correct way, the unit kicks on and off solidly. There is some audible  #34;clicking #34; when the relay switches on and off, 

In [22]:
input_file = "price.csv"  # Original file with actual labels
predictions_file = "results/price_predictions_zero_shot_full_dataset.csv"  # File with model predictions

# Load input and prediction files as DataFrames
df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# # Check unique values in the 'label' column
# unique_labels = df_predictions['PredictedLabel'].unique()
# print(f"Unique labels: {unique_labels}")

# # Count occurrences of each label
# label_counts = df_predictions['PredictedLabel'].value_counts()

# # Display the counts
# print("Counts of each category:")
# print(label_counts)

label_mapping = {
    'Underpriced': 'Underpriced',
    'No Comment Answer': 'No Comment',
    'Overpriced': 'Overpriced',
    'No Comment:': 'No Comment',
    'No Comment.': 'No Comment',
    'No Comment Answer': 'No Comment',
    'Answer the question': 'No Comment',
    'Answer: No': 'No Comment',
    'Answer: Under': 'Underpriced',
    'No Comment Bookmark': 'No Comment',
    'Overpried': 'Overpriced'
}

# Replace incorrect labels with standardized ones
df_predictions['PredictedLabel'] = df_predictions['PredictedLabel'].replace(label_mapping)

# Evaluate predictions
metrics = evaluate_predictions(df_input, df_predictions)

# Print evaluation metrics
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}")


Classification Report:
              precision    recall  f1-score   support

  No Comment       0.98      0.04      0.08      1883
  Overpriced       0.04      0.87      0.08        46
 Underpriced       0.20      0.78      0.32       326

    accuracy                           0.17      2255
   macro avg       0.41      0.56      0.16      2255
weighted avg       0.85      0.17      0.12      2255

Accuracy: 16.63%
Precision: 84.53%
Recall: 16.63%
F1 Score: 11.73
