##### We will apply a BERT model to Amazon reviews

##### Import libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np

In [None]:
# Path to the Excel workbook that holds the validation reviews
file_path = 'Amazon Reviews Validation Data.xlsx'

# Read the Excel file into a Pandas DataFrame.
# Each handler prints a readable message and then re-raises: swallowing the
# error would leave `Amazon_Reviews_validation` undefined, and the failure
# would only show up later as a NameError when the frame is first used.
try:
    Amazon_Reviews_validation = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"File '{file_path}' not found.")
    raise
except pd.errors.ParserError:
    print(f"Error occurred while parsing the Excel file '{file_path}'. Please check the file format and structure.")
    raise
except Exception as e:
    print(f"An error occurred: {str(e)}")
    raise

In [None]:
# Load the synthetic training set from CSV.
Amazon_Reviews_Training = pd.read_csv('Amazon_Synthetic_Training_Data.csv')

# Clean the validation set: keep one row per distinct review text, then
# discard every row that still contains a missing value.
Amazon_Reviews_validation = (
    Amazon_Reviews_validation
    .drop_duplicates(subset=['Reviews'])
    .dropna()
)


In [None]:
# Working names for the two frames. No copy is made: each new name refers to
# the same DataFrame object as the name it is assigned from.
amazon_training_data, amazon_validation_data = (
    Amazon_Reviews_Training,
    Amazon_Reviews_validation,
)

In [None]:
# One rename mapping shared by both frames, so the review text is exposed as
# 'Comment_Text' and the aspect as 'Label' in each of them.
_column_renames = {'Reviews': 'Comment_Text', 'Aspect': 'Label'}
amazon_training_data = amazon_training_data.rename(columns=_column_renames)
amazon_validation_data = amazon_validation_data.rename(columns=_column_renames)

In [None]:
# Aspect categories in a fixed order; a label's position in this list is the
# integer class id assigned to it below.
original_labels = [
    'Adaptability',
    'Durability',
    'Ease of Use',
    'Ergonomics',
    'Interference',
    'Performance',
    'Use Efficiency',
    'Aesthetics',
    'Ease of Reprocessing',
    'Ease of Storage',
    'Price',
    'Safety',
]

# Map each label to its numerical class id (0 to 11 for the 12 labels above)
labeling_dict = dict(zip(original_labels, range(len(original_labels))))

In [None]:
def _encode_labels(frame, mapping):
    """Return the 'Label' column of ``frame`` encoded as integer class ids.

    Args:
        frame: DataFrame with a 'Label' column holding label names.
        mapping: Dict from label name to integer class id.

    Returns:
        An int64 Series aligned with ``frame``.

    Raises:
        ValueError: If any label is missing or not a key of ``mapping``.
            ``Series.map`` would otherwise turn such rows into NaN, which
            silently converts the whole column to float.
    """
    encoded = frame['Label'].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'Label'].astype(str).unique())
        raise ValueError(
            f"{int(unmapped.sum())} row(s) have a label that is missing or not in the label mapping: {unknown}"
        )
    return encoded.astype('int64')


amazon_training_data['Label'] = _encode_labels(amazon_training_data, labeling_dict)
amazon_validation_data['Label'] = _encode_labels(amazon_validation_data, labeling_dict)

In [None]:
# Load the pretrained tokenizer and a BERT sequence-classification model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The number of output classes is derived from the label mapping rather than
# hard-coded, so it cannot drift out of sync with the label list. The label
# names are also stored in the model config, which is saved with the model.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labeling_dict),
    id2label={idx: label for label, idx in labeling_dict.items()},
    label2id=dict(labeling_dict),
)

In [None]:
# Pull the review texts and the encoded labels out of each frame as plain
# Python lists (texts feed the tokenizer, labels feed the torch dataset).
X_train, y_train = (
    amazon_training_data[column].to_list()
    for column in ("Comment_Text", "Label")
)
X_val, y_val = (
    amazon_validation_data[column].to_list()
    for column in ("Comment_Text", "Label")
)

In [None]:
# Tokenizer settings shared by both splits: pad, truncate, and cap the
# sequence length at 512.
_tokenizer_options = dict(padding=True, truncation=True, max_length=512)

X_train_tokenized = tokenizer(X_train, **_tokenizer_options)
X_val_tokenized = tokenizer(X_val, **_tokenizer_options)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It
            must contain an "input_ids" entry, which defines the length.
        labels: Optional per-example labels, indexable by position. When
            None (or empty), items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and the label, if any) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None explicitly instead of relying on truthiness:
        # `if self.labels:` raises for multi-element numpy arrays / tensors.
        # An empty container is still treated as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" entry."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair each split's token encodings with its labels in a torch dataset.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
def compute_metrics(p):
    """Compute the accuracy of a batch of predictions.

    Args:
        p: A pair ``(scores, labels)``. ``scores`` holds one row of per-class
            scores per example; ``labels`` holds the true class ids.

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, references = p
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(y_true=references, y_pred=predicted_classes)}

In [None]:
# Training configuration: results are written under "output", training runs
# for 10 epochs with 50 examples per device per step.
_training_options = {
    "output_dir": "output",
    "num_train_epochs": 10,
    "per_device_train_batch_size": 50,
}
args = TrainingArguments(**_training_options)

# Define Trainer: ties the model to the training/validation datasets and the
# metric function used during evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Run training with the configured Trainer. The call is left as a bare
# expression so the notebook displays whatever it returns.
trainer.train()

In [None]:
# Evaluate on the Trainer's evaluation dataset and keep the returned metrics
# (read below via the 'eval_loss' and 'eval_accuracy' keys).
Model_validation_metrics = trainer.evaluate()

In [None]:
# Pull the two headline numbers out of the evaluation metrics.
Loss, Accuracy = (
    Model_validation_metrics[metric_name]
    for metric_name in ('eval_loss', 'eval_accuracy')
)

In [None]:
# Report the validation loss and accuracy on a single line.
_validation_summary = f'Validation_Loss is : {Loss}, and Validation_Accuracy is : {Accuracy}'
print(_validation_summary)

In [None]:
# Persist the fine-tuned model to the 'CustomModel' directory.
trainer.save_model('CustomModel')
# Persist the tokenizer to the 'Tokenizer' directory. Kept as the last bare
# expression so the notebook displays the call's return value.
tokenizer.save_pretrained('Tokenizer')

In [None]:
# Reload the fine-tuned classifier and its tokenizer from the 'CustomModel'
# and 'Tokenizer' directories, replacing the in-memory objects.
model = BertForSequenceClassification.from_pretrained("CustomModel")
tokenizer = BertTokenizer.from_pretrained('Tokenizer')

In [None]:
# Classify a single piece of text. To score several texts, loop over them
# (or pass a list of strings to the tokenizer, as done for the training data).
encoded_input = tokenizer("Input for single value for multiple we will run for loop", return_tensors='pt', padding=True, truncation=True)

# Inference only: make sure dropout is disabled and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    output = model(**encoded_input)

logits = output.logits.detach().cpu().numpy()
# Index of the highest-scoring class for each input row
y_pred = np.argmax(logits, axis=-1)
given_value = y_pred[0]

# Invert the label mapping once and look the class id up directly, instead of
# scanning the dict with a bare next() that raises StopIteration on a miss.
id_to_label = {class_id: label for label, class_id in labeling_dict.items()}
result_key = id_to_label[int(given_value)]
# Print the result
print(result_key)