In [16]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn import preprocessing

import re
import numpy as np
import transformers as transform
import torch
from torch.nn.functional import softmax, normalize

from torch.utils.data import DataLoader, TensorDataset, Dataset

In [2]:
# Mount Google Drive at /MyDrive inside the Colab runtime; the data paths used
# later in this notebook are resolved relative to that mount point.
from google.colab import drive
drive.mount('/MyDrive')

Drive already mounted at /MyDrive; to attempt to forcibly remount, call drive.mount("/MyDrive", force_remount=True).


In [3]:
class Bert_Regression:
    """Bundle DistilBERT tokenizer/model construction with review loading.

    The class keeps the Hugging Face classes and checkpoint name as class
    attributes, builds the tokenizer/model on demand, and reads the reviewer
    id, rating and full-review text files into a single DataFrame.
    """

    # Hugging Face classes used to build the model and the tokenizer.
    model_class = transform.DistilBertForSequenceClassification

    tokenizer_class = transform.DistilBertTokenizerFast

    # Checkpoint name handed to ``from_pretrained`` for both objects.
    pretrained_weights = 'distilbert-base-uncased'
    # Filled in by the ``initilize_*`` methods. ``None`` (instead of the old
    # ``0`` placeholder) matches what ``clear_model_and_tokenizer`` stores.
    tokenizer = None
    model = None

    def initilize_tokenizer(self):
        """Load the tokenizer for ``pretrained_weights``, store and return it."""
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)

        return self.tokenizer

    def initilize_model(self):
        """Load the sequence-classification model (10 output labels), store and return it."""
        self.model = self.model_class.from_pretrained(
            self.pretrained_weights, num_labels=10, max_position_embeddings=512
        )

        return self.model

    def clear_model_and_tokenizer(self):
        """Drop the references to the loaded tokenizer and model."""
        self.tokenizer = None
        self.model = None

    @staticmethod
    def _read_review(path):
        """Read one review file and return its normalised, lower-cased text."""
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(path, 'r', encoding='cp1252') as review_file:
            text = review_file.read()
        # Replace every non-word character with a space, then strip digits.
        text = re.sub(r'[^\w]', ' ', text)
        text = re.sub(r'\d+', '', text)
        return text.lower()

    def get_reviewer_data(self, reviewer_names, data_root='../MyDrive/MyDrive'):
        """Load ids, full review texts and ratings for the given reviewers.

        Args:
            reviewer_names: iterable of reviewer directory names
                (e.g. ``'Scott+Renshaw'``).
            data_root: directory that contains the ``scaledata`` and
                ``scale_whole_review`` folders. The default reproduces the
                previously hard-coded location.

        Returns:
            DataFrame with columns ``ids``, ``full_review`` and ``ratings``;
            rows of all reviewers are stacked in the order given.
        """
        id_frames = []
        rating_frames = []

        for reviewer_name in reviewer_names:
            ids_path = data_root + '/scaledata/' + reviewer_name + '/id.' + reviewer_name
            ratings_path = data_root + '/scaledata/' + reviewer_name + '/rating.' + reviewer_name
            reviews_dir = data_root + '/scale_whole_review/' + reviewer_name + '/txt.parag'

            # Progress output: only the file currently being loaded.
            print(ids_path)

            id_frame = pd.read_csv(ids_path, names=['ids'])
            # Build the text column in one pass instead of a boolean-mask
            # assignment per id (which rescans the whole frame every time).
            id_frame['full_review'] = [
                self._read_review(reviews_dir + '/' + str(review_id) + '.txt')
                for review_id in id_frame['ids']
            ]
            id_frames.append(id_frame)
            rating_frames.append(pd.read_csv(ratings_path, names=['ratings']))

        if not id_frames:
            # No reviewers requested: return an empty frame with the full schema.
            return pd.DataFrame(columns=['ids', 'full_review', 'ratings'])

        reviewer_ids = pd.concat(id_frames, axis=0).reset_index(drop=True)
        reviewer_ratings = pd.concat(rating_frames, axis=0).reset_index(drop=True)

        # NOTE(review): ids and ratings are joined purely by row position; a
        # reviewer whose id and rating files differ in length would shift the
        # alignment for every following row.
        return pd.concat([reviewer_ids, reviewer_ratings], axis=1)

    def align_data(self, tokenized):
        """Stack variable-length token sequences into one DataFrame.

        Each sequence becomes one row; the frame has as many columns as the
        longest sequence and shorter rows are left with missing values.
        An empty ``tokenized`` yields an empty DataFrame.
        """
        max_length = max((len(tokens) for tokens in tokenized), default=0)

        # Collect every row first and concatenate once; concatenating inside
        # the loop copies the growing frame on every iteration.
        frames = [pd.DataFrame(columns=range(max_length))]
        frames.extend(pd.DataFrame(tokens).transpose() for tokens in tokenized)

        return pd.concat(frames).reset_index(drop=True)


In [4]:
class ExtendedBertClassification(torch.nn.Module):
    """Regression head on top of a chunked sequence-classification model.

    Reviews longer than the model's position limit are split into windows of
    ``chunk_size`` tokens. Each window is scored by the wrapped model, the
    10-dimensional logits are averaged over the windows, and a linear layer
    maps the average to a single value.
    """

    def __init__(self, BertModel, chunk_size=512):
        """
        Args:
            BertModel: object exposing ``initilize_model()`` which returns the
                wrapped sequence-classification model.
            chunk_size: maximum number of tokens passed to the wrapped model
                per call (defaults to the previously hard-coded 512).
        """
        super(ExtendedBertClassification, self).__init__()
        self.bert_model = BertModel.initilize_model()
        self.chunk_size = chunk_size
        self.linear = torch.nn.Linear(10, 1)

    def calculate(self, input_ids = None, attention_mask = None, max_review_lenght = None):
        """Return the logits of row 0 averaged over its token windows.

        Args:
            input_ids: token ids, indexable as ``[batch][position]``; only
                row 0 is used.
            attention_mask: matching attention mask, or ``None``.
            max_review_lenght: kept for backward compatibility. Windows are
                derived from the actual sequence length, so a wrong or
                missing value can no longer produce an over-long or empty
                window.

        Returns:
            Tensor of shape ``(1, num_labels)``.

        Raises:
            ValueError: if the sequence contains no tokens.
        """
        # Slice the existing tensors directly; re-wrapping a tensor with
        # torch.tensor() only triggers a copy-construct warning.
        ids_row = torch.as_tensor(input_ids)[0]
        mask_row = None if attention_mask is None else torch.as_tensor(attention_mask)[0]

        seq_len = ids_row.shape[0]
        if seq_len == 0:
            raise ValueError('input_ids must contain at least one token')

        logits_sum = 0
        n_chunks = 0
        for start in range(0, seq_len, self.chunk_size):
            stop = start + self.chunk_size
            chunk_mask = None if mask_row is None else mask_row[start:stop]

            # Windows that consist solely of padding carry no information;
            # skip them so they do not dilute the average. The first window
            # is always evaluated.
            if start > 0 and chunk_mask is not None and not bool(chunk_mask.any()):
                continue

            outputs = self.bert_model(
                input_ids=ids_row[start:stop].unsqueeze(dim=0),
                attention_mask=None if chunk_mask is None else chunk_mask.unsqueeze(dim=0),
            )
            logits_sum = logits_sum + outputs.logits
            n_chunks += 1

        # Average over the windows actually evaluated (was a fixed "/ 6").
        return logits_sum / n_chunks

    def forward(self, input_ids, attention_mask, max_review_lenght = None):
        """Map the averaged window logits of row 0 to one regression value (shape ``(1,)``)."""
        output = self.calculate(input_ids, attention_mask, max_review_lenght)
        output = self.linear(output[0])
        return output



In [5]:
class TokenData(Dataset):
    """Dataset yielding one tokenised review plus its scaled rating.

    Args:
        train: selects the notebook-level training (``True``) or test
            (``False``) objects when ``data``/``tokens`` are not given.
        data: optional DataFrame with ``full_review`` and ``ratings`` columns.
            Defaults to the global ``reviewers_data`` / ``reviewers_test_data``.
        tokens: optional mapping of field name to per-sample sequences (the
            tokenizer output). Defaults to the global ``train_tokens`` /
            ``test_tokens``.
    """

    def __init__(self, train = False, data = None, tokens = None):
        # Fall back to the notebook globals only when nothing is passed in, so
        # the class can also be used (and tested) without hidden state.
        if data is None:
            data = reviewers_data if train else reviewers_test_data
        if tokens is None:
            tokens = train_tokens if train else test_tokens

        self.text_data = data['full_review']
        self.tokens = tokens
        # Ratings are multiplied by 10 to form the regression target.
        self.labels = list(data['ratings'] * 10)

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx`` (token fields plus ``labels``)."""
        sample = {}
        for k, v in self.tokens.items():
            if k == 'labels':
                # The scaled label is written below; converting the raw entry
                # here would only be thrown away.
                continue
            sample[k] = torch.tensor(v[idx])
        # Explicit float32 so the target dtype matches the model output.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

In [6]:
# Shared helper instance; later cells use it to load the reviews and to build
# the tokenizer and the model.
x = Bert_Regression()

In [7]:
# Training dataset
# Available reviewers: James+Berardinelli Dennis+Schwartz Scott+Renshaw
reviewers_data = x.get_reviewer_data(['Scott+Renshaw'])
tokenizer = x.initilize_tokenizer()

# Pad to the longest *tokenised* review. A whitespace word count underestimates
# the token count (word pieces, special tokens), and with truncation disabled
# any review longer than that count would be left unpadded and over-long.
train_tokens = tokenizer(list(reviewers_data['full_review']), padding='longest', truncation=False)

# Length, in tokens, shared by every padded training review.
max_train_review_lenght = len(train_tokens['input_ids'][0])

train_tokens['labels'] = reviewers_data['ratings']

['../MyDrive/MyDrive/scaledata/Scott+Renshaw/id.Scott+Renshaw']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
#Test dataset
reviewers_test_data = x.get_reviewer_data(['James+Berardinelli'])
#James+Berardinelli Dennis+Schwartz Scott+Renshaw

max_test_review_lenght = max(len(i.split()) for i in reviewers_test_data['full_review'])

test_tokens = tokenizer(list(reviewers_test_data['full_review']), padding='max_length', truncation=False, max_length=max_test_review_lenght)
test_tokens['labels'] = reviewers_test_data['ratings']

['../MyDrive/MyDrive/scaledata/James+Berardinelli/id.James+Berardinelli']


In [9]:
# Number of 512-token chunks used for the longest training review:
# (max_train_review_lenght // 512) + 1

In [10]:
# Inspect which fields the tokenised training data carries.
train_tokens.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [11]:
#Train data
# One review per batch: ExtendedBertClassification.calculate only reads row 0
# of the tensors it receives.
batch_size = 1
train_data = TokenData(train = True)
# Note: despite its name, train_dataset is a DataLoader (shuffled each epoch).
train_dataset = DataLoader(train_data, shuffle=True, batch_size=batch_size)

In [12]:
#Test data
# One review per batch: ExtendedBertClassification.calculate only reads row 0
# of the tensors it receives.
batch_size = 1
test_data = TokenData(train = False)
# Evaluation order is kept fixed (no shuffling) so that a capped evaluation
# pass scores the same reviews every epoch and results are comparable.
# Note: despite its name, test_dataset is a DataLoader.
test_dataset = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [13]:
# Build the chunked regression model; the constructor calls x.initilize_model()
# to load the DistilBERT weights.
extended_model = ExtendedBertClassification(x)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# optim func
# AdamW over every parameter of the wrapper (DistilBERT weights and the linear head).
# NOTE(review): lr=0.01 is far above the 1e-5..5e-5 range commonly used when
# fine-tuning BERT-style encoders — confirm this value is intentional.
optimizer = torch.optim.AdamW(extended_model.parameters(), lr=0.01)
# loss func
# Mean squared error between the single predicted value and the scaled rating.
loss_fn = torch.nn.MSELoss()

In [None]:
epochs = 3

# Debug caps on the number of batches processed PER EPOCH; set to None to run
# over the full loaders. (Previously the caps were compared against counters
# that were never reset, so every epoch after the first was skipped entirely.)
max_train_batches = 6
max_test_batches = 21

# Running totals of processed batches across all epochs.
iter_number = 0
iter_number2 = 0

for epoch in range(epochs):
    # ---- training pass ----
    extended_model.train()
    for i, values in enumerate(train_dataset):
        if max_train_batches is not None and i >= max_train_batches:
            break

        optimizer.zero_grad()

        outputs = extended_model(values['input_ids'], values['attention_mask'], max_train_review_lenght)

        # The loader already yields a tensor; just match the output dtype
        # instead of re-wrapping it with torch.tensor().
        actual_y = values['labels'].to(dtype=outputs.dtype)

        loss = loss_fn(outputs, actual_y)
        loss.backward()
        optimizer.step()

        # MSELoss is mean-reduced, so the value is already a per-sample average.
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
        iter_number += 1

    # ---- evaluation pass ----
    extended_model.eval()
    test_losses = []
    for i, values in enumerate(test_dataset):
        if max_test_batches is not None and i >= max_test_batches:
            break

        with torch.no_grad():
            outputs = extended_model(values['input_ids'], values['attention_mask'], max_test_review_lenght)
            actual_y = values['labels'].to(dtype=outputs.dtype)
            loss = loss_fn(outputs, actual_y)

        test_batch_loss = loss.item()
        test_last_loss = test_batch_loss
        test_losses.append(test_last_loss)

        print('Test batch {} last loss: {}'.format(i + 1, test_last_loss))

        iter_number2 += 1

    if test_losses:
        print('Epoch {} mean test loss: {}'.format(epoch + 1, sum(test_losses) / len(test_losses)))
