In [None]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn import preprocessing

import re
import numpy as np
import transformers as transform
import torch
from torch.nn.functional import softmax, normalize

from torch.utils.data import DataLoader, TensorDataset, Dataset

In [None]:
# Mount Google Drive inside the Colab VM; the data loaders below read the
# review corpus from underneath this mount point.
from google.colab import drive

mount_point = '/MyDrive'
drive.mount(mount_point)

Drive already mounted at /MyDrive; to attempt to forcibly remount, call drive.mount("/MyDrive", force_remount=True).


In [None]:
class Bert_Regression:
    """Bundle DistilBERT loading with loaders for the per-reviewer corpus.

    Despite the name, the model that is built is a
    ``DistilBertForSequenceClassification`` head (a classifier over
    ``num_labels`` classes), not a regressor.
    """

    model_class = transform.DistilBertForSequenceClassification

    tokenizer_class = transform.DistilBertTokenizerFast

    pretrained_weights = 'distilbert-base-uncased'
    # Placeholders until the ``initilize_*`` methods run (the value 0 is kept
    # for backward compatibility with any code that inspects it).
    tokenizer = 0
    model = 0

    def initilize_tokenizer(self):
        """Load the pretrained tokenizer, store it on the instance, return it."""
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        return self.tokenizer

    def initilize_model(self, num_labels=10):
        """Load the pretrained classifier, store it on the instance, return it.

        Args:
            num_labels: Size of the classification head. Defaults to 10, the
                value that used to be hard-coded.
                NOTE(review): labels elsewhere in this notebook are built as
                ``ratings * 10``; if a rating of 1.0 can occur, class index 10
                needs ``num_labels=11`` -- confirm against the data.
        """
        self.model = self.model_class.from_pretrained(
            self.pretrained_weights,
            num_labels=num_labels,
            max_position_embeddings=512,
        )
        return self.model

    def clear_model_and_tokenizer(self):
        """Drop the references to the tokenizer and the model."""
        self.tokenizer = None
        self.model = None

    @staticmethod
    def _clean_text(text):
        """Replace non-word characters with spaces, strip digits, lowercase."""
        text = re.sub(r'[^\w]', ' ', text)
        text = re.sub(r'\d+', '', text)
        return text.lower()

    @classmethod
    def _read_review(cls, folder, review_id):
        """Read ``<folder>/<review_id>.txt`` (cp1252) and return the cleaned text."""
        path = folder + '/' + str(review_id) + '.txt'
        # The context manager closes the handle even if read() raises.
        with open(path, 'r', encoding='cp1252') as handle:
            return cls._clean_text(handle.read())

    def get_reviewer_data(self, reviewer_names):
        """Load ids, cleaned review texts and ratings for the given reviewers.

        Args:
            reviewer_names: Iterable of reviewer directory names, e.g.
                ``['Dennis+Schwartz']``.

        Returns:
            A DataFrame with columns ``ids``, ``full_review`` and ``ratings``.
            The id/text frame and the ratings frame are each re-indexed
            from 0 and then joined side by side, so row *i* of the id file is
            paired with row *i* of the rating file.

        Raises:
            OSError: if an id, rating or review file cannot be opened.
        """
        reviewer_file_ids = ['../MyDrive/MyDrive/scaledata/'+reviewer_name+'/id.'+reviewer_name for reviewer_name in reviewer_names]
        reviewer_file_ratings = ['../MyDrive/MyDrive/scaledata/'+reviewer_name+'/rating.'+reviewer_name for reviewer_name in reviewer_names]
        reviews_folder = ['../MyDrive/MyDrive/scale_whole_review/'+reviewer_name+'/txt.parag' for reviewer_name in reviewer_names]

        # Log the id files once (previously the full list was printed once per
        # reviewer).
        if reviewer_file_ids:
            print(reviewer_file_ids)

        id_frames = []
        for id_file, folder in zip(reviewer_file_ids, reviews_folder):
            id_rating = pd.read_csv(id_file, names=['ids'])
            # One pass over the ids; avoids a boolean-mask assignment per row.
            id_rating['full_review'] = [
                self._read_review(folder, review_id)
                for review_id in id_rating['ids']
            ]
            id_frames.append(id_rating)

        rating_frames = [
            pd.read_csv(rating_file, names=['ratings'])
            for rating_file in reviewer_file_ratings
        ]

        # Concatenate once instead of growing from an empty seed frame, so the
        # columns keep the dtypes pandas inferred from the files.
        if id_frames:
            reviewer_ids = pd.concat(id_frames, axis=0, ignore_index=True)
        else:
            reviewer_ids = pd.DataFrame(columns=['ids', 'full_review'])

        if rating_frames:
            reviewer_ratings = pd.concat(rating_frames, axis=0, ignore_index=True)
        else:
            reviewer_ratings = pd.DataFrame(columns=['ratings'])

        reviewer_data = pd.concat([reviewer_ids, reviewer_ratings], axis=1)

        return reviewer_data

    def align_data(self, tokenized):
        """Stack variable-length token sequences into one rectangular frame.

        Args:
            tokenized: Sequence indexable by position ``0..len-1`` whose items
                are token-id sequences.

        Returns:
            A DataFrame with one row per sequence and columns
            ``0..max_length-1``; shorter rows are padded with NaN. An empty
            input yields an empty DataFrame.
        """
        rows = [list(tokenized[i]) for i in range(len(tokenized))]
        if not rows:
            return pd.DataFrame()

        max_length = max(len(row) for row in rows)

        # Build the frame in one shot rather than concatenating row by row.
        features = pd.DataFrame(rows, columns=range(max_length))

        return features


In [None]:
class TokenData(Dataset):
    """Torch ``Dataset`` pairing tokenizer output with class labels.

    Args:
        train: Selects which notebook-level defaults are used when ``data`` or
            ``tokens`` is omitted: ``reviewers_data`` / ``train_tokens`` when
            true, ``reviewers_test_data`` / ``test_tokens`` otherwise.
        data: Optional frame with ``full_review`` and ``ratings`` columns.
        tokens: Optional mapping from field name (e.g. ``input_ids``) to a
            per-sample sequence.

    Labels are ``ratings * 10`` and are kept as floats here; they are cast to
    integers by the consumer.
    """

    def __init__(self, train = False, data = None, tokens = None):
        if data is None or tokens is None:
            # Fall back to the notebook globals only when the caller did not
            # pass the inputs explicitly.
            if train:
                data = reviewers_data if data is None else data
                tokens = train_tokens if tokens is None else tokens
            else:
                data = reviewers_test_data if data is None else data
                tokens = test_tokens if tokens is None else tokens

        self.text_data = data['full_review']
        self.tokens = tokens
        self.labels = list(data['ratings'] * 10)

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx`` plus its ``labels``."""
        sample = {}
        for k, v in self.tokens.items():
            if k == 'labels':
                # Always taken from self.labels below, so skip the redundant
                # conversion of any labels stored alongside the tokens.
                continue
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Training dataset
# Reviewer directories seen so far: James+Berardinelli, Dennis+Schwartz, Scott+Renshaw
x = Bert_Regression()
reviewers_data = x.get_reviewer_data(['Dennis+Schwartz'])
tokenizer = x.initilize_tokenizer()

# Encode every review, padding up to 3210 token ids; truncation is disabled, so
# a longer review would keep its full length.
train_reviews = list(reviewers_data['full_review'])
train_tokens = tokenizer(
    train_reviews,
    max_length=3210,
    padding='max_length',
    truncation=False,
)
# Class labels: the rating scaled by 10, stored next to the encodings.
train_tokens['labels'] = reviewers_data['ratings'] * 10

['../MyDrive/MyDrive/scaledata/Dennis+Schwartz/id.Dennis+Schwartz']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Test dataset (a different reviewer than the one used for training)
# Reviewer directories seen so far: James+Berardinelli, Dennis+Schwartz, Scott+Renshaw
reviewers_test_data = x.get_reviewer_data(['Scott+Renshaw'])

# Same encoding settings as the training cell: pad to 3210 ids, no truncation.
test_reviews = list(reviewers_test_data['full_review'])
test_tokens = tokenizer(
    test_reviews,
    max_length=3210,
    padding='max_length',
    truncation=False,
)
# Class labels: the rating scaled by 10, stored next to the encodings.
test_tokens['labels'] = reviewers_test_data['ratings'] * 10

['../MyDrive/MyDrive/scaledata/Scott+Renshaw/id.Scott+Renshaw']


In [None]:
# Inspect which fields the encoded training data carries.
train_tokens.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
# Training loader: batches of `batch_size` reviews, reshuffled on every pass.
batch_size = 1
train_data = TokenData(train=True)
train_dataset = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

In [None]:
# Test loader. Evaluation must not shuffle: the order does not affect the
# metrics, but a fixed order keeps the per-batch log and the "last loss"
# reported by the evaluation loop reproducible between runs.
batch_size = 1
test_data = TokenData(train = False)
test_dataset = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
# Load the pretrained DistilBERT sequence classifier through the helper `x`.
bert_model = x.initilize_model()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimiser. Every parameter of the pretrained encoder is updated, so use a
# fine-tuning-scale learning rate; the previous 1e-3 is orders of magnitude
# above the 2e-5..5e-5 range recommended for BERT-style models and tends to
# wipe out the pretrained weights.
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=2e-5)
# Loss: cross-entropy over the class logits (targets are integer class ids).
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Display the configuration the model was built with.
bert_model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.41.2",
  "vocab_size": 30522
}

In [None]:
# Keep the model on the CPU; the cell output is the module summary.
bert_model.to("cpu")

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Number of attention masks in the encoded training data (one per review).
len(train_tokens['attention_mask'])

1027

In [None]:
epochs = 3


def chunked_logits(model, input_ids, attention_mask, chunk_size=None):
    """Average the classifier logits over fixed-size windows of a long input.

    The inputs are longer than the model's position-embedding table, so each
    batch is cut into consecutive windows of ``chunk_size`` tokens, every
    window is classified on its own, and the logits are averaged per sample.

    Windows that contain only padding for a sample (attention mask all zero)
    do not contribute to that sample's average; windows that are padding for
    the whole batch are skipped without calling the model.

    Args:
        model: Sequence classifier returning an object with ``.logits``.
        input_ids: Integer tensor of shape (batch, seq_len).
        attention_mask: Tensor of shape (batch, seq_len), 1 for real tokens.
        chunk_size: Window length; defaults to
            ``model.config.max_position_embeddings``.

    Returns:
        Tensor of shape (batch, num_labels) with the averaged logits.

    Raises:
        ValueError: if the batch contains no real token at all.
    """
    if chunk_size is None:
        chunk_size = model.config.max_position_embeddings

    summed = None
    counts = torch.zeros(input_ids.shape[0], 1, device=input_ids.device)

    # Cover the whole sequence, including a shorter final window.
    for start in range(0, input_ids.shape[1], chunk_size):
        ids = input_ids[:, start:start + chunk_size]
        mask = attention_mask[:, start:start + chunk_size]

        has_tokens = mask.sum(dim=1, keepdim=True) > 0
        if not has_tokens.any():
            continue

        # Zero out rows for which this window is pure padding.
        window_logits = model(input_ids=ids, attention_mask=mask).logits * has_tokens
        summed = window_logits if summed is None else summed + window_logits
        counts = counts + has_tokens

    if summed is None:
        raise ValueError("batch contains no real (non-padding) tokens")

    return summed / counts.clamp(min=1)


# Run on whatever device the model currently lives on.
device = next(bert_model.parameters()).device

train_last_loss = 0
test_last_loss = 0

for epoch in range(epochs):
    print("Epoch: ",(epoch + 1))

    # ---- training pass ----
    bert_model.train()
    train_loss_sum = 0.0
    train_batches = 0

    for i, values in enumerate(train_dataset):

        optimizer.zero_grad()

        logits = chunked_logits(
            bert_model,
            values['input_ids'].to(device),
            values['attention_mask'].to(device),
        )
        # Labels arrive as floats (rating * 10); the cast truncates them to
        # class ids, which must be smaller than the model's num_labels.
        labels = values['labels'].to(device).type(torch.long)
        loss = loss_fn(logits, labels)

        loss.backward()

        optimizer.step()

        # CrossEntropyLoss already averages over the batch, so the value is
        # used as-is rather than divided by batch_size again.
        train_last_loss = loss.item()
        train_loss_sum += train_last_loss
        train_batches += 1

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))

    # Report the mean over the epoch, not just the final batch.
    print(f"\nTraining epoch {epoch + 1} loss: ",train_loss_sum / max(train_batches, 1))

    # ---- evaluation pass ----
    bert_model.eval()
    test_loss_sum = 0.0
    test_batches = 0
    correct = 0
    seen = 0

    # We don't need gradients for testing
    with torch.no_grad():
        for i, values in enumerate(test_dataset):
            logits = chunked_logits(
                bert_model,
                values['input_ids'].to(device),
                values['attention_mask'].to(device),
            )
            labels = values['labels'].to(device).type(torch.long)

            test_last_loss = loss_fn(logits, labels).item()
            test_loss_sum += test_last_loss
            test_batches += 1
            print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

            predictions = logits.argmax(1)
            correct += (predictions == labels).sum().item()
            seen += labels.shape[0]
            print(predictions, values['labels'])

    print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)
    if seen:
        print(f"Testing epoch {epoch + 1} mean loss: ",test_loss_sum / test_batches)
        print("Testing accuracy: ",correct / seen)