In [None]:
import pandas as pd

# Load the review dataset into a DataFrame and preview its first rows to see
# its structure.
# NOTE(review): '/content/...' looks like a Colab-local path — confirm or
# adjust when running in another environment.
file_path = '/content/All_Beauty_5.csv'
data = pd.read_csv(file_path)
data.head()


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,Shelly F,As advertised. Reasonably priced,Five Stars,1472688000
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000


In [None]:
import mlflow
from mlflow import log_metric, log_param, log_artifacts

# Point MLflow at the experiment this notebook records its runs under.
mlflow.set_experiment("BERT Fine-Tuning Experiment")

# Log the learning rate that fine-tuning actually uses: the optimizer in the
# training cell is created with lr=2e-5, so logging 0.01 here would store a
# hyperparameter value that does not describe the run.
log_param("learning_rate", 2e-5)


In [None]:
from torch.utils.tensorboard import SummaryWriter

# Initialize the TensorBoard writer (constructed with its default arguments).
writer = SummaryWriter()

# Example of logging a metric. These calls are left commented out on purpose;
# they belong inside the training loop, where `loss`, `val_loss` and `epoch`
# exist:
# writer.add_scalar('Training Loss', loss, epoch)
# writer.add_scalar('Validation Loss', val_loss, epoch)
# NOTE(review): `writer` is not used by any later cell visible in this notebook.


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(5269, 9)

In [None]:

#data = data.iloc[:1000]

In [None]:
# Re-check the shape; it only differs from the earlier value if the
# row-subsetting line in the previous cell is un-commented.
data.shape

(5269, 9)

In [None]:
# Count the reviews whose text is missing.
missing_reviews = data['reviewText'].isnull().sum()

# Drop rows where 'reviewText' is missing. The explicit .copy() makes
# data_clean an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
data_clean = data.dropna(subset=['reviewText']).copy()

# Display the number of missing values and the shape of the cleaned data
missing_reviews, data_clean.shape


(5, (5264, 9))

In [None]:
import spacy
from spacy.matcher import Matcher
from collections import Counter

# Load spaCy's small English pipeline (tokenizer, tagger, parser, NER).
nlp = spacy.load("en_core_web_sm")

# The noun-phrase matcher depends only on the pipeline vocabulary, so it is
# built once here instead of being rebuilt for every review.
# Pattern: one or more NOUN tokens, optionally followed by ADJ tokens.
_NOUN_PHRASE_PATTERN = [{'POS': 'NOUN', 'OP': '+'}, {'POS': 'ADJ', 'OP': '*'}]
_noun_phrase_matcher = Matcher(nlp.vocab)
_noun_phrase_matcher.add("NOUN_PHRASE", [_NOUN_PHRASE_PATTERN])


def extract_aspects(text):
    """Return the candidate aspect phrases found in ``text``.

    The text is lower-cased, processed with the spaCy pipeline ``nlp``, and
    every span matched by the noun-phrase pattern is returned as a string.

    Args:
        text: Review text (a ``str``).

    Returns:
        A list of matched phrases in match order. Every match is kept,
        including overlapping ones, so a two-noun phrase can appear alongside
        its individual nouns.
    """
    doc = nlp(text.lower())
    return [doc[start:end].text for _match_id, start, end in _noun_phrase_matcher(doc)]

# Run aspect extraction over every cleaned review text.
aspect_lists = data_clean['reviewText'].apply(extract_aspects)

# Tally how often each extracted phrase occurs and show the ten most frequent.
aspect_counts = Counter(phrase for phrases in aspect_lists for phrase in phrases)
aspect_counts.most_common(10)




[('product', 1569),
 ('hair', 1539),
 ('skin', 893),
 ('shampoo', 760),
 ('body', 727),
 ('scent', 673),
 ('soap', 532),
 ('time', 503),
 ('products', 453),
 ('fragrance', 446)]

In [None]:
# Keep the 20 most frequent phrases as the aspect vocabulary.
aspects = [phrase for phrase, _count in aspect_counts.most_common(20)]

aspects

['product',
 'hair',
 'skin',
 'shampoo',
 'body',
 'scent',
 'soap',
 'time',
 'products',
 'fragrance',
 'smell',
 'wash',
 'price',
 'shower',
 'years',
 'body wash',
 'conditioner',
 'lotion',
 'cream',
 'bottle']

In [None]:
# Inspect the per-review lists of phrases returned by extract_aspects.
aspect_lists

0                                                      []
1       [oder, feel, face, brands, reviews, people, od...
2                                  [grandmother, perfume]
3       [aqua, velva, aqua velva, man, velva man, aqua...
4        [shampoo, one, perfume, hair, fullness, shampoo]
                              ...                        
5264    [undereye, darkness, undereye darkness, time, ...
5265                                  [eye, gel, eye gel]
5266              [eye, morning, night, touch, dispenser]
5267                                         [eye, stuff]
5268                                         [eye, cream]
Name: reviewText, Length: 5264, dtype: object

In [None]:
# Label the aspects mentioned in a review with a rating-derived sentiment.
def label_aspects(row, aspect_terms=None):
    """Map each aspect mentioned in a review to a sentiment label.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            'reviewText' (str) and 'overall' (numeric rating).
        aspect_terms: Iterable of aspect strings to look for. When omitted,
            the notebook-level ``aspects`` list is used.

    Returns:
        A dict with one entry per aspect found in the lower-cased review text.
        The value is 'positive' when the rating is above 3, 'neutral' when it
        equals 3, and 'negative' otherwise. Every aspect of one review gets
        the same label because the label comes from the overall rating.
    """
    if aspect_terms is None:
        aspect_terms = aspects

    text = row['reviewText'].lower()

    # The sentiment depends only on the row, so work it out once rather than
    # once per matching aspect.
    rating = row['overall']
    if rating > 3:
        sentiment = 'positive'
    elif rating == 3:
        sentiment = 'neutral'
    else:
        sentiment = 'negative'

    # NOTE(review): this is a plain substring test, so an aspect such as
    # 'time' also matches inside 'sometimes'.
    return {aspect: sentiment for aspect in aspect_terms if aspect in text}

# Attach the aspect -> sentiment mapping of every review as an 'aspects'
# column. assign() returns a new frame instead of writing into the existing
# one, which avoids the SettingWithCopyWarning raised by direct column
# assignment on a frame derived from another frame.
data_clean = data_clean.assign(aspects=data_clean.apply(label_aspects, axis=1))
data_clean.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['aspects'] = data_clean.apply(label_aspects, axis=1)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,aspects
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,{'price': 'positive'}
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,{'skin': 'positive'}
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,{'smell': 'negative'}
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,"{'time': 'positive', 'years': 'positive', 'lot..."
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,"{'hair': 'positive', 'shampoo': 'positive', 's..."


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Flatten to one record per (review, aspect) pair.
rows = [
    {'text': review_text, 'aspect': aspect_name, 'sentiment': aspect_sentiment}
    for review_text, aspect_map in zip(data_clean['reviewText'], data_clean['aspects'])
    for aspect_name, aspect_sentiment in aspect_map.items()
]

# Build the training frame from the flattened records.
training_data = pd.DataFrame(rows)

# Preview the frame together with the class balance of the sentiment labels.
training_data.head(), training_data['sentiment'].value_counts()


(                                                text aspect sentiment
 0                   As advertised. Reasonably priced  price  positive
 1  Like the oder and the feel when I put it on my...   skin  positive
 2  I bought this to smell nice after I shave.  Wh...  smell  negative
 3  HEY!! I am an Aqua Velva Man and absolutely lo...   time  positive
 4  HEY!! I am an Aqua Velva Man and absolutely lo...  years  positive,
 sentiment
 positive    10133
 negative      378
 neutral       307
 Name: count, dtype: int64)

In [None]:
# One subset of the training frame per sentiment class.
pos = training_data.loc[training_data['sentiment'].eq('positive')]
neg = training_data.loc[training_data['sentiment'].eq('negative')]
neu = training_data.loc[training_data['sentiment'].eq('neutral')]

In [None]:
# Balance the classes by keeping the first 300 rows of each sentiment subset.
training_data = pd.concat([subset.head(300) for subset in (pos, neg, neu)])
training_data.head()

Unnamed: 0,text,aspect,sentiment
0,As advertised. Reasonably priced,price,positive
1,Like the oder and the feel when I put it on my...,skin,positive
3,HEY!! I am an Aqua Velva Man and absolutely lo...,time,positive
4,HEY!! I am an Aqua Velva Man and absolutely lo...,years,positive
5,HEY!! I am an Aqua Velva Man and absolutely lo...,lotion,positive


In [None]:
# Shuffle the rows. The fixed random_state makes the shuffled order the same
# on every run, so results can be compared across kernel restarts.
training_data = training_data.sample(frac=1, random_state=42)

In [None]:
# Display the shuffled, class-balanced training frame.
training_data

Unnamed: 0,text,aspect,sentiment
120,I have an Airbnb. This is perfect for guests....,product,positive
4347,"These are typical inexpensive spa accessories,...",body,negative
216,Great product and good service.,product,positive
9560,I wore braces for several years as an adult an...,cream,neutral
10010,"The soap itself is fine, the scent itself remi...",soap,neutral
...,...,...,...
9437,This is a product you need to use judiciously....,product,neutral
8182,I just recently started shaving again with an ...,skin,neutral
10299,This body lotion is lightweight and absorbs in...,skin,neutral
8228,"I gave this ""tool"" an average rating. It does...",product,neutral


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

class AspectDataset(Dataset):
    """Torch dataset of (review, aspect) pairs with integer sentiment labels.

    Each item is tokenized as a sentence pair: the review is the first
    sequence and the aspect the second. The tokenizer inserts the separator
    token between them itself.

    Args:
        texts: Sequence of review texts.
        aspects: Sequence of aspect strings, aligned with ``texts``.
        sentiments: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Fixed length every encoded example is padded/truncated to.
    """

    def __init__(self, texts, aspects, sentiments, tokenizer, max_len=128):
        self.texts = texts
        self.aspects = aspects
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (review, aspect) examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode example ``item``.

        Returns:
            A dict with 'review_text' (the review and aspect joined by
            " [SEP] "), 'input_ids' and 'attention_mask' (1-D tensors) and
            'labels' (scalar long tensor holding the class id).
        """
        review = str(self.texts[item])
        aspect = str(self.aspects[item])
        sentiment = int(self.sentiments[item])

        # Pass the aspect as the second sequence and truncate only the first
        # one. When the aspect was appended to the review string, reviews
        # longer than max_len had the aspect cut off, so different aspects of
        # the same long review produced identical inputs.
        encoding = self.tokenizer.encode_plus(
            review,
            aspect,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation='only_first'
        )

        return {
            'review_text': review + " [SEP] " + aspect,
            # return_tensors='pt' yields a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(sentiment, dtype=torch.long)
        }


In [None]:

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prepare the dataset and data loader.
# Labels are converted to integer class ids with map(): it does the str -> int
# lookup directly, whereas replace() on a string column relies on implicit
# downcasting that recent pandas versions warn about.
dataset = AspectDataset(
    texts=training_data['text'].tolist(),
    aspects=training_data['aspect'].tolist(),
    sentiments=training_data['sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0}).tolist(),
    tokenizer=tokenizer
)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [None]:
# Load pretrained BERT with a sequence-classification head sized for the
# three sentiment classes.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
# Switch to training mode. As the last expression of the cell, the returned
# model is also displayed.
model.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Training loop
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# keep the values the transformers implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_batches = len(data_loader)
print(f"Starting training for {total_batches} batches per epoch.")

# Make sure dropout is active even if the model was put in eval mode earlier
# (for example when this cell is re-run after a prediction cell).
model.train()

for epoch in range(4):  # For each epoch
    for i, batch in enumerate(data_loader):
        optimizer.zero_grad()
        # Keep the batch on the same device as the model so the loop also
        # works after the model has been moved to a GPU.
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)
        # Passing labels makes the model return the classification loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        if i % 10 == 0:  # Print loss every 10 batches
            print(f"Epoch {epoch}, Batch {i}/{total_batches}, Loss: {loss.item()}")
    print(f"Completed Epoch {epoch}")




Starting training for 15 batches per epoch.
Epoch 0, Batch 0/15, Loss: 1.1459298133850098
Epoch 0, Batch 10/15, Loss: 0.9748017191886902
Completed Epoch 0
Epoch 1, Batch 0/15, Loss: 0.841123104095459
Epoch 1, Batch 10/15, Loss: 0.5757907629013062
Completed Epoch 1
Epoch 2, Batch 0/15, Loss: 0.4954719841480255
Epoch 2, Batch 10/15, Loss: 0.33664944767951965
Completed Epoch 2
Epoch 3, Batch 0/15, Loss: 0.23586919903755188
Epoch 3, Batch 10/15, Loss: 0.20515874028205872
Completed Epoch 3


In [None]:
# prompt: how do i name models and save them with timestamp

import datetime

# Get the current timestamp, formatted as YYYYMMDD_HHMMSS.
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

# Combine the timestamp with the model name. Both names resolve to the same
# directory, so the model and tokenizer files are written side by side.
model_name = f"./bert_finetuned_aspect_sentiment_{timestamp}"
tokenizer_name = f"./bert_finetuned_aspect_sentiment_{timestamp}"
# Save the fine-tuned model and its tokenizer. The return value of the last
# call is displayed as the cell output.
model.save_pretrained(model_name)
tokenizer.save_pretrained(tokenizer_name)


('./bert_finetuned_aspect_sentiment_20240419_031128/tokenizer_config.json',
 './bert_finetuned_aspect_sentiment_20240419_031128/special_tokens_map.json',
 './bert_finetuned_aspect_sentiment_20240419_031128/vocab.txt',
 './bert_finetuned_aspect_sentiment_20240419_031128/added_tokens.json')

In [None]:
import os
# List the contents of the model directory and of the tokenizer directory.
for saved_dir in (model_name, tokenizer_name):
    print(os.listdir(saved_dir))


['model.safetensors', 'tokenizer_config.json', 'config.json', 'special_tokens_map.json', 'vocab.txt']
['model.safetensors', 'tokenizer_config.json', 'config.json', 'special_tokens_map.json', 'vocab.txt']


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the tokenizer and model from the directory they were saved to.
# This rebinds `tokenizer` and `model` to the reloaded objects.
model_path = model_name
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)


In [None]:
import os

# Confirm the saved-model directory exists before listing what it contains.
model_path = model_name

if not os.path.exists(model_path):
    print("Model directory not found. Please check the path.")
else:
    print("Files in model directory:", os.listdir(model_path))


Files in model directory: ['model.safetensors', 'tokenizer_config.json', 'config.json', 'special_tokens_map.json', 'vocab.txt']


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Function to perform prediction
def predict_aspect_sentiment(model, tokenizer, text, aspect, max_len=128):
    """Predict the sentiment expressed in ``text`` towards ``aspect``.

    Args:
        model: Sequence-classification model whose three logits are ordered
            negative, neutral, positive.
        tokenizer: Tokenizer exposing ``encode_plus``.
        text: Review text.
        aspect: Aspect to score within the review.
        max_len: Length the encoded input is padded/truncated to.

    Returns:
        A tuple ``(label, probabilities)``: ``label`` is 'negative', 'neutral'
        or 'positive', and ``probabilities`` is the list of softmax
        probabilities in that same order.
    """
    # Encode review and aspect as a sentence pair and truncate only the
    # review. Appending the aspect to the review string let right-truncation
    # drop the aspect for long reviews.
    encoded_review = tokenizer.encode_plus(
        text,
        aspect,
        max_length=max_len,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='only_first'
    )

    # Move tensors to the same device as the model
    input_ids = encoded_review['input_ids'].to(model.device)
    attention_mask = encoded_review['attention_mask'].to(model.device)

    # Run inference in eval mode so dropout is disabled, then restore the
    # previous mode so a model that is still being trained is left unchanged.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
    finally:
        if was_training:
            model.train()

    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Index order must match the integer label ids used for training.
    sentiment_labels = ['negative', 'neutral', 'positive']
    prediction = sentiment_labels[probs.argmax().item()]

    return prediction, probs[0].tolist()

# Load model and tokenizer from the directory they were saved to
model_path = model_name
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Example text and aspect
# NOTE(review): "customer service" is not among the 20 aspects listed earlier
# in the notebook, so this example probes an aspect outside that vocabulary.
text = "The customer service is bad."
aspect = "customer service"

# Predict the sentiment and print the label with the per-class probabilities
# (ordered negative, neutral, positive).
sentiment, probabilities = predict_aspect_sentiment(model, tokenizer, text, aspect)
print(f"Sentiment: {sentiment}, Probabilities: {probabilities}")


Sentiment: negative, Probabilities: [0.5748797655105591, 0.09854473173618317, 0.3265754282474518]
