#Importing Necessary Packages

In [2]:
import numpy as np
import pandas as pd
import torch

from transformers import logging
logging.set_verbosity(logging.WARNING)

#Loading the Data

In [4]:
import pandas as pd
import torch

# Load the car reviews dataset
file_path = "Car_Reviews.csv"
df = pd.read_csv(file_path)


reviews = df['Review'].tolist()
real_labels = df['Recommend'].apply(lambda x: "positive" if x == "Yes" else "negative").tolist()


print(reviews[:5])
print(real_labels[:5])


['negative', 'negative', 'negative', 'negative', 'negative']


#Task 1: sentiment classification

In [9]:

from transformers import pipeline

# Load a sentiment analysis LLM into a pipeline
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

predicted_labels = classifier(reviews, batch_size=16, truncation=True)

# Load accuracy and F1 score metrics
import evaluate
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

# Convert labels to integers
references = [1 if label == "positive" else 0 for label in real_labels]
predictions = [1 if prediction['label'] == "POSITIVE" else 0 for prediction in predicted_labels]

# Compute metrics
accuracy_result = accuracy.compute(references=references, predictions=predictions)['accuracy']
f1_result = f1.compute(references=references, predictions=predictions)['f1']

print(f"Accuracy: {accuracy_result:.4f}")
print(f"F1 result: {f1_result:.4f}")

# Detailed Classification report
from sklearn.metrics import classification_report
print("\nClassification Report:\n", classification_report(references, predictions, target_names=["negative", "positive"]))

Device set to use cuda:0


Accuracy: 0.8711
F1 result: 0.8573

Classification Report:
               precision    recall  f1-score   support

    negative       0.81      0.97      0.88      5339
    positive       0.96      0.77      0.86      5339

    accuracy                           0.87     10678
   macro avg       0.89      0.87      0.87     10678
weighted avg       0.89      0.87      0.87     10678



# Task 2: Translation

In [19]:
from transformers import pipeline
import evaluate

# Load translation pipeline (use CPU if GPU gives errors)
translator = pipeline(
    "translation_en_to_es",
    model="Helsinki-NLP/opus-mt-en-es",
    device=-1   # -1 = CPU, 0 = first GPU
)

# Translate first 10 reviews
sample_reviews = reviews[:10]
translated_reviews = [translator(review, truncation=True, max_length=256)[0]['translation_text']
                      for review in sample_reviews]


# Load reference translations (first 10 lines)
with open("reference_translations.txt", 'r') as file:
    lines = file.readlines()

references = [[line.strip()] for line in lines[:10]]

# Compute BLEU score
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=translated_reviews, references=references)

print(f"\nBLEU score (first 10 reviews): {bleu_score['bleu']:.4f}")


Device set to use cpu
Your input_length: 512 is bigger than 0.9 * max_length: 256. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)



BLEU score (first 10 reviews): 0.8647


#Task 3: Generative QA

In [52]:
from transformers import pipeline
import evaluate

# Load generative QA pipeline
qa_gen = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)

context = reviews[1]
question = "What did the customer like or dislike about the car?"

# Format input
input_text = f"question: {question} context: {context}"
result = qa_gen(input_text, max_new_tokens=100, do_sample=False)

predicted_answer = result[0]['generated_text']
print("Predicted Answer:", predicted_answer)

# ---- Reference Answer (gold label, you need to provide this) ----
reference_answer = "the Azera limited handling is unstable,and bouncy. I cannot do 120 mph comfortably, as it does not handle the curves very well. transmission has difficulty shifting at times, ABS does not work well in the snow, I slid a lot with new tires."

# ---- BLEU ----
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=[predicted_answer], references=[[reference_answer]])
print("BLEU Score:", bleu_score["bleu"])

# ---- ROUGE ----
rouge = evaluate.load("rouge")
rouge_score = rouge.compute(predictions=[predicted_answer], references=[reference_answer])
print("ROUGE Score:", rouge_score)


Device set to use cpu


Predicted Answer: the Azera limited handling is unstable,and bouncy. I cannot do 120 mph comfortably, as it does not handle the curves very well. transmission has difficulty shifting at times, ABS does not work well in the snow, I slid a lot with new tires. I get 15 mpg due to my driving habits, and any one who knows how to drive, and likes to drive, will not like this car
BLEU Score: 0.6304763103210652
ROUGE Score: {'rouge1': np.float64(0.7719298245614035), 'rouge2': np.float64(0.7678571428571428), 'rougeL': np.float64(0.7719298245614035), 'rougeLsum': np.float64(0.7719298245614035)}


#Task 4: Extractive QA

In [41]:
from transformers import pipeline
import evaluate

# Load extractive QA pipeline
qa_extract = pipeline(
    "question-answering",
    model="deepset/minilm-uncased-squad2",
    tokenizer="deepset/minilm-uncased-squad2",
    device=-1   # CPU
)

context = reviews[1]
question = "What did the customer like or dislike about the car?"

# Run QA
result = qa_extract(question=question, context=context, handle_impossible_answer=True)
print("Answer:", result['answer'], "(score:", f"{result['score']:.4f})")

# Load metric
metric = evaluate.load("squad")

# Predictions (no `no_answer_probability`)
predictions = [{"id": "0", "prediction_text": result['answer']}]

# References
references = [{"id": "0", "answers": {"text": ["handling on the Acura"], "answer_start": [0]}}]

# Compute EM & F1
results = metric.compute(predictions=predictions, references=references)
print(results)


Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Answer: will not like this car (score: 0.1854)
{'exact_match': 0.0, 'f1': 0.0}


#Task5: Summarization

In [51]:
from transformers import pipeline
import evaluate

# Load summarization pipeline
model_name = "cnicu/t5-small-booksum"
summarizer = pipeline("summarization", model=model_name,device=-1)

# Pick review
text_to_summarize = reviews[-1]
print(f"Original text:\n{text_to_summarize}\n")

# Generate summary
outputs = summarizer(text_to_summarize, max_length=53)
summarized_text = outputs[0]['summary_text']
print(f"Model Summary:\n{summarized_text}\n")

# ---- Reference summary (gold summary you need to provide) ----
reference_summary = "I love my car. It is fun to drive, has great pick up, handles well. I have no problems on busy freeways. It's very comfortable to take on long trips."

# ---- BLEU ----
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=[summarized_text], references=[[reference_summary]])
print("BLEU Score:", bleu_score["bleu"])

# ---- ROUGE ----
rouge = evaluate.load("rouge")
rouge_score = rouge.compute(predictions=[summarized_text], references=[reference_summary])
print("ROUGE Score:", rouge_score)


Device set to use cpu
Your max_length is set to 53, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Both `max_new_tokens` (=256) and `max_length`(=53) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Original text:
 I love my car. It is fun to drive, has great pick up, handles well. I have no problems on busy freeways. It is very comfortable to take on long trips, as long as there are not more than 2 people.

Model Summary:
I love my car. It is fun to drive, has great pick up, handles well. I have no problems on busy freeways. It's very comfortable to take on long trips, as long as there are not more than 2 people.

BLEU Score: 0.7412780049783892
ROUGE Score: {'rouge1': np.float64(0.8611111111111112), 'rouge2': np.float64(0.8571428571428571), 'rougeL': np.float64(0.8611111111111112), 'rougeLsum': np.float64(0.8611111111111112)}
