In [2]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



In [3]:
df = pd.read_csv('train_data.csv')
df = df.dropna(subset=['reviewText', 'overall'])
df['overall'] = df['overall'].astype(int)

  df = pd.read_csv('train_data.csv')


In [4]:
df

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime
0,2,,False,2016-11-11,A2OSUEZJIN7BI,0511189877,,Chris,I have an older URC-WR7 remote and thought thi...,Cannot Learn,1478822400
1,5,,True,2016-06-06,A2NETQRG6JHIG7,0511189877,,Qrysta White,First time I've EVER had a remote that needed ...,zero programming needed! Miracle!?,1465171200
2,4,,True,2016-03-10,A12JHGROAX49G7,0511189877,,Linwood,Got them and only 2 of them worked. company ca...,Works Good and programs easy.,1457568000
3,5,,True,2016-01-14,A1KV65E2TMMG6F,0511189877,,Dane Williams,I got tired of the remote being on the wrong s...,Same as TWC remote,1452729600
4,5,,True,2016-10-20,A280POPEWI0NSA,0594459451,,Kristina H.,After purchasing cheap cords from another webs...,Good Quality Cord,1476921600
...,...,...,...,...,...,...,...,...,...,...,...
838939,5,,True,2017-07-25,A1OOVLE2KZ6KGA,B01HJCN1EI,,Puddzee,These are my favorite charging cords for a few...,Worth the price.,1500940800
838940,1,,True,2017-04-04,A77K1B31UAQ29,B01HJCN1EI,,addictedtoreading,"Update....after 2 months of gentle use, cable ...",UPDATE...BREAKS AND SLOW CHARGING,1491264000
838941,3,,True,2017-07-08,A2SVXUVUAWUDK2,B01HJH42KU,,Andrew,These are okay. The connection becomes very if...,Hope this makes sense. You'd understand if you...,1499472000
838942,5,2.0,True,2016-12-01,A2HUZO7MQAY5I2,B01HJH40WU,,michael clontz,"Ok here is an odd thing that happened to me, I...",Not the correct product as linked in the sale.,1480550400


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-z0-9 ]', '', text)
    return text

df['cleaned_reviewText'] = df['reviewText'].apply(clean_text)

In [6]:
df['cleaned_reviewText']

0         i have an older urcwr7 remote and thought this...
1         first time ive ever had a remote that needed n...
2         got them and only 2 of them worked company cal...
3         i got tired of the remote being on the wrong s...
4         after purchasing cheap cords from another webs...
                                ...                        
838939    these are my favorite charging cords for a few...
838940    updateafter 2 months of gentle use cable twist...
838941    these are okay the connection becomes very iff...
838942    ok here is an odd thing that happened to me i ...
838943    i have it plugged into a usb extension on my g...
Name: cleaned_reviewText, Length: 838944, dtype: object

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [8]:
def tokenize_data(texts, labels=None, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )
    data = {
        "input_ids": encodings["input_ids"].tolist(),
        "attention_mask": encodings["attention_mask"].tolist()
    }
    if labels is not None:
        data["labels"] = (torch.tensor(labels) - 1).tolist()  
    return data

In [9]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df['cleaned_reviewText'].tolist(), df['overall'].tolist(), test_size=0.2, random_state=42)

train_data = tokenize_data(train_texts, train_labels)
val_data = tokenize_data(val_texts, val_labels)

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

In [10]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"f1": f1_score(labels, preds, average='micro')}

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)



In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.5596,0.645787,0.737903
2,0.6876,0.618443,0.750866
3,0.4316,0.628693,0.759883


TrainOutput(global_step=251685, training_loss=0.6301997521817998, metrics={'train_runtime': 135534.4671, 'train_samples_per_second': 14.856, 'train_steps_per_second': 1.857, 'total_flos': 2.6488958535813888e+17, 'train_loss': 0.6301997521817998, 'epoch': 3.0})

In [14]:
test_df = pd.read_csv('test_data.csv')
test_df['cleaned_reviewText'] = test_df['reviewText'].apply(clean_text)
test_data = tokenize_data(test_df['cleaned_reviewText'].tolist())

test_dataset = Dataset.from_dict(test_data)
predictions = trainer.predict(test_dataset).predictions.argmax(-1) + 1

In [15]:
submission = pd.DataFrame({'predicted': predictions})
submission.to_csv('q2_submission.csv', index=False)