# Hugging Face transformers (analyzing car reviews)

## Installing required packages

In [None]:
from transformers import logging
logging.set_verbosity(logging.WARNING)

In [15]:
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModel
import torch
# Load the tokenizer and model
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [16]:
file_path = "data/car_reviews.csv"
df = pd.read_csv(file_path, delimiter=";")
reviews = df['Review'].to_list()

In [17]:
# Specify the model name
model_name = "distilbert-base-uncased-finetuned-sst-2-english"



tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


# Now you can use the tokenizer and model

inputs = tokenizer(reviews, 
                   return_tensors="pt",
                  padding = "max_length",
                  truncation = True,
                  max_length = 128)
with torch.no_grad():
    logits = model(**inputs).logits
    
id2binary = {
    "NEGATIVE": 0,
    "POSITIVE": 1
}

predicted_class_ids = logits.argmax(dim=-1).tolist()

# Map predicted class ids to labels
predicted_labels = [model.config.id2label[class_id] for class_id in predicted_class_ids]
predictions = [id2binary[label] for label in predicted_labels]
true_labels = [id2binary[label] for label in df['Class'].to_list()]

In [18]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Add the predictions and references to the metric
accuracy_metric.add_batch(predictions=predictions, references=true_labels)
f1_metric.add_batch(predictions=predictions, references=true_labels)

# Compute the final accuracy and F1 scores
accuracy_result = accuracy_metric.compute()
f1_result = f1_metric.compute(average='weighted')  # Use 'weighted', 'micro', 'macro' as needed

print(f"Accuracy: {accuracy_result['accuracy']:.4f}")
print(f"F1 Score: {f1_result['f1']:.4f}")

Accuracy: 0.8000
F1 Score: 0.7810


Second part

In [19]:
#Get first two sentences of each review

import re, nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from transformers import MarianMTModel, MarianTokenizer



def get_f2s(text):
    # Use regex to split the text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return " ".join(sentences[:2])

first_sentences = [get_f2s(review) for review in reviews][0]
print(first_sentences)

I am very satisfied with my 2014 Nissan NV SL. I use this van for my business deliveries and personal use.


[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:


# Load the tokenizer and model
model_name1 = 'Helsinki-NLP/opus-mt-en-es'
tokenizer1 = MarianTokenizer.from_pretrained(model_name1)
model1 = MarianMTModel.from_pretrained(model_name1)

In [21]:
def translate(text, tokenizer, model):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    # Perform the translation
    with torch.no_grad():
        translated = model.generate(**inputs)
    # Decode the translated text
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

def process_text(string):
    translation = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', string)
    return [word_tokenize(sent) for sent in translation]
# Example usage
translated_review = translate(first_sentences, tokenizer1, model1)
tokenized_translation = process_text(translated_review)
print(tokenized_translation)





[['Estoy', 'muy', 'satisfecho', 'con', 'mi', 'Nissan', 'NV', 'SL', '2014', '.'], ['Uso', 'esta', 'camioneta', 'para', 'mis', 'entregas', 'de', 'negocios', 'y', 'uso', 'personal', '.']]


In [22]:

with open("data/reference_translations.txt", 'r') as file:
    lines = file.readlines()
references = [line.strip() for line in lines]
tokenized_reference = process_text(references[0])

bleu_score = corpus_bleu(tokenized_translation, tokenized_reference)
print(bleu_score)

1.1200407237786664e-231


3rd part 

In [23]:
from transformers import AutoModelForQuestionAnswering,  AutoTokenizer, pipeline
second_review = reviews[1]
model_name2 = "deepset/minilm-uncased-squad2"
question = "What did he like about the brand?"
# a) Get predictions
nlp = pipeline('question-answering', model=model_name2, tokenizer=model_name2)
QA_input = {
    'question': question,
    'context': second_review
}
res = nlp(QA_input)

# b) Load model & tokenizer
model2 = AutoModelForQuestionAnswering.from_pretrained(model_name2)
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)

In [24]:
inputs = tokenizer2(question, second_review, return_tensors='pt')

# Get the model outputs
outputs = model2(**inputs)

# Decode the answer from model outputs
start_index = outputs.start_logits.argmax()
end_index = outputs.end_logits.argmax()

# Convert token indices to tokens
answer_tokens = tokenizer2.convert_ids_to_tokens(inputs.input_ids[0][start_index:end_index + 1])
answer = tokenizer2.convert_tokens_to_string(answer_tokens)

# Print the result
print(f"Answer: {answer}")

Answer: ride quality, reliability


4th part

In [25]:
from transformers import pipeline
last_review = reviews[4]
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
summarized_text = summarizer(last_review, max_length=55, min_length=50, do_sample=False)
print(summarized_text)

[{'summary_text': 'the Nissan Rogue provides me with the desired SUV experience without burdening me with an exorbitant payment . I have hauled 12 bags of mulch in the back with the seats down and could have held more . To address this concern, I am'}]


In [26]:
import pandas as pd
import torch

# Load the car reviews dataset
file_path = "data/car_reviews.csv"
df = pd.read_csv(file_path, delimiter=";")

# Put the car reviews and their associated sentiment labels in two lists
reviews = df['Review'].tolist()
real_labels = df['Class'].tolist()


# Instruction 1: sentiment classification

# Load a sentiment analysis LLM into a pipeline
from transformers import pipeline
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Perform inference on the car reviews and display prediction results
predicted_labels = classifier(reviews)
for review, prediction, label in zip(reviews, predicted_labels, real_labels):
    print(f"Review: {review}\nActual Sentiment: {label}\nPredicted Sentiment: {prediction['label']} (Confidence: {prediction['score']:.4f})\n")

# Load accuracy and F1 score metrics    
import evaluate
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

# Map categorical sentiment labels into integer labels
references = [1 if label == "POSITIVE" else 0 for label in real_labels]
predictions = [1 if label['label'] == "POSITIVE" else 0 for label in predicted_labels]

# Calculate accuracy and F1 score
accuracy_result_dict = accuracy.compute(references=references, predictions=predictions)
accuracy_result = accuracy_result_dict['accuracy']
f1_result_dict = f1.compute(references=references, predictions=predictions)
f1_result = f1_result_dict['f1']
print(f"Accuracy: {accuracy_result}")
print(f"F1 result: {f1_result}")


# Instruction 2: Translation

# Load translation LLM into a pipeline and translate car review
first_review = reviews[0]
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")
translated_review = translator(first_review, max_length=27)[0]['translation_text']
print(f"Model translation:\n{translated_review}")

# Load reference translations from file
with open("data/reference_translations.txt", 'r') as file:
    lines = file.readlines()
references = [line.strip() for line in lines]
print(f"Spanish translation references:\n{references}")

# Load and calculate BLEU score metric
bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=[translated_review], references=[references])
print(bleu_score['bleu'])


# Instruction 3: extractive QA

# Import auto classes (optional: can be solved via pipelines too)
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering

# Instantiate model and tokenizer
model_ckp = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_ckp)
model = AutoModelForQuestionAnswering.from_pretrained(model_ckp)

# Define context and question, and tokenize them
context = reviews[1]
print(f"Context:\n{context}")
question = "What did he like about the brand?"
inputs = tokenizer(question, context, return_tensors="pt")

# Perform inference and extract answer from raw outputs
with torch.no_grad():
  outputs = model(**inputs)
start_idx = torch.argmax(outputs.start_logits)
end_idx = torch.argmax(outputs.end_logits) + 1
answer_span = inputs["input_ids"][0][start_idx:end_idx]

# Decode and show answer
answer = tokenizer.decode(answer_span)
print("Answer: ", answer)


# Instruction 4

# Get original text to summarize upon car review
text_to_summarize = reviews[-1]
print(f"Original text:\n{text_to_summarize}")

# Load summarization pipeline and perform inference
model_name = "cnicu/t5-small-booksum"
summarizer = pipeline("summarization", model=model_name)
outputs = summarizer(text_to_summarize, max_length=53)
summarized_text = outputs[0]['summary_text']
print(f"Summarized text:\n{summarized_text}")

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Review: I am very satisfied with my 2014 Nissan NV SL. I use this van for my business deliveries and personal use. Camping, road trips, etc. We dont have any children so I store most of the seats in my warehouse. I wanted the passenger van for the rear air conditioning. We drove our van from Florida to California for a Cross Country trip in 2014. We averaged about 18 mpg. We drove thru a lot of rain and It was a very comfortable and stable vehicle. The V8 Nissan Titan engine is a 500k mile engine. It has been tested many times by delivery and trucking companies. This is why Nissan gives you a 5 year or 100k mile bumper to bumper warranty. Many people are scared about driving this van because of its size. But with front and rear sonar sensors, large mirrors and the back up camera. It is easy to drive. The front and rear sensors also monitor the front and rear sides of the bumpers making it easier to park close to objects. Our Nissan NV is a Tow Monster. It pulls our 5000 pound travel tr

Your input_length: 365 is bigger than 0.9 * max_length: 27. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Model translation:
Estoy muy satisfecho con mi 2014 Nissan NV SL. Uso esta furgoneta para mis entregas de negocios y uso personal.
Spanish translation references:
['Estoy muy satisfecho con mi Nissan NV SL 2014. Utilizo esta camioneta para mis entregas comerciales y uso personal.', 'Estoy muy satisfecho con mi Nissan NV SL 2014. Uso esta furgoneta para mis entregas comerciales y uso personal.']
0.6022774485691839
Context:
The car is fine. It's a bit loud and not very powerful. On one hand, compared to its peers, the interior is well-built. The transmission failed a few years ago, and the dealer replaced it under warranty with no issues. Now, about 60k miles later, the transmission is failing again. It sounds like a truck, and the issues are well-documented. The dealer tells me it is normal, refusing to do anything to resolve the issue. After owning the car for 4 years, there are many other vehicles I would purchase over this one. Initially, I really liked what the brand is about: ride 

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Summarized text:
the Nissan Rogue provides me with the desired SUV experience without burdening me with an exorbitant payment; the financial arrangement is quite reasonable. I have hauled 12 bags of mulch in the back with the seats down and could have held more.
