In [16]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn import preprocessing

import re
import numpy as np
import transformers as transform
import torch
from torch.nn.functional import softmax, normalize

from torch.utils.data import DataLoader, TensorDataset, Dataset

In [17]:
from google.colab import drive
drive.mount('/MyDrive')

Drive already mounted at /MyDrive; to attempt to forcibly remount, call drive.mount("/MyDrive", force_remount=True).


In [18]:
class Bert_Regression:

    model_class = transform.DistilBertForSequenceClassification

    tokenizer_class = transform.DistilBertTokenizerFast

    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = 0
    model = 0

    def initilize_tokenizer(self):
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)

        return self.tokenizer

    def initilize_model(self):
        self.model = self.model_class.from_pretrained(self.pretrained_weights, num_labels=10, max_position_embeddings=512)


        return self.model

    def clear_model_and_tokenizer(self):
        self.tokenizer = None
        self.model = None

    def get_reviewer_data(self, reviewer_names):

        reviewer_ids = pd.DataFrame(columns=['ids'])
        reviewer_ratings = pd.DataFrame(columns=['ratings'])

        reviewer_file_ids = ['../MyDrive/MyDrive/scaledata/'+reviewer_name+'/id.'+reviewer_name for reviewer_name in reviewer_names]
        reviewer_file_ratings = ['../MyDrive/MyDrive/scaledata/'+reviewer_name+'/rating.'+reviewer_name for reviewer_name in reviewer_names]
        reviews_folder = ['../MyDrive/MyDrive/scale_whole_review/'+reviewer_name+'/txt.parag' for reviewer_name in reviewer_names]

        count = 0

        for reviewer_id in reviewer_file_ids:
            print(reviewer_file_ids)
            id_rating = pd.read_csv(reviewer_id, names = ['ids'])
            id_rating['full_review'] = ''

            for id in id_rating['ids']:
                path = reviews_folder[count]+'/'+str(id)+'.txt'
                f = open(path, 'r', encoding='cp1252')
                text = f.read()
                text = re.sub(r'[^\w]', ' ', text)
                text = re.sub("\d+", "", text)
                text = text.lower()
                id_rating.loc[id_rating['ids'] == id, 'full_review'] = text
                # id_rating.loc[id == dennis_fulls, 'full_review'] = text
                # print(id_rating)
                f.close()
                # print(path)

            reviewer_ids = pd.concat([reviewer_ids, id_rating], axis=0)
            count = count + 1

        for reviewer_rating in reviewer_file_ratings:
            # print(reviewer_id)
            id_rating = pd.read_csv(reviewer_rating, names = ['ratings'])#.round()

            reviewer_ratings = pd.concat([reviewer_ratings, id_rating], axis=0)
            # print(reviewer_ratings)


        reviewer_ids.reset_index(drop=True, inplace=True)
        reviewer_ratings.reset_index(drop=True, inplace=True)

        reviewer_data = pd.concat([reviewer_ids, reviewer_ratings], axis=1)


        return reviewer_data

    def align_data(self, tokenized):
        max_length = max([len(i) for i in tokenized])

        features = pd.DataFrame(columns=range(max_length))

        for i in range(len(tokenized)):
            # features = features.append(pd.DataFrame(dennis_full['tokenized_sentences'][i]).transpose())
            features = pd.concat([features, pd.DataFrame(tokenized[i]).transpose()])
            # features = features.shift(-1)
            # features.iloc[-1, :] = features.append(pd.DataFrame(dennis_full['tokenized_sentences'][i]).transpose())
        features.reset_index(drop=True, inplace=True)

        return features


In [19]:
class ExtendedBertClassification(torch.nn.Module):
    def __init__(self, BertModel):
        super(ExtendedBertClassification, self).__init__()
        self.bert_model = BertModel.initilize_model()
        self.linear = torch.nn.Linear(10, 1)

    def calculate(self, input_ids = None, attention_mask = None):
        logits = 0
        multiplier = 0
        for multiplier in range(7):

            val1 = torch.tensor(input_ids[0][multiplier*512:(multiplier+1)*512])
            val1 = val1.clone().detach().unsqueeze(dim=0)
            val2 = torch.tensor(attention_mask[0][multiplier*512:(multiplier+1)*512])
            val2 = val2.clone().detach().unsqueeze(dim=0)

            outputs = self.bert_model(val1, val2)

            logits = logits + outputs.logits
        logits = logits/6
        return logits

    def forward(self, input_ids, attention_mask):
        output = self.calculate(input_ids, attention_mask)
        output = self.linear(output[0])
        return output



In [20]:
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = reviewers_data['full_review']
            self.tokens = train_tokens
            self.labels = list(reviewers_data['ratings'] * 10)
        else:
            self.text_data = reviewers_test_data['full_review']
            self.tokens = test_tokens
            self.labels = list(reviewers_test_data['ratings'] * 10)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [21]:
x = Bert_Regression()

In [22]:
#Training dataset
reviewers_data = x.get_reviewer_data(['Dennis+Schwartz'])
#James+Berardinelli Dennis+Schwartz Scott+Renshaw
tokenizer = x.initilize_tokenizer()

train_tokens = tokenizer(list(reviewers_data['full_review']), padding='max_length', truncation=False, max_length=3210)
train_tokens['labels'] = reviewers_data['ratings']

['../MyDrive/MyDrive/scaledata/Dennis+Schwartz/id.Dennis+Schwartz']


In [23]:
#Test dataset
reviewers_test_data = x.get_reviewer_data(['Scott+Renshaw'])
#James+Berardinelli Dennis+Schwartz Scott+Renshaw

test_tokens = tokenizer(list(reviewers_test_data['full_review']), padding='max_length', truncation=False, max_length=3210)
test_tokens['labels'] = reviewers_test_data['ratings']

['../MyDrive/MyDrive/scaledata/Scott+Renshaw/id.Scott+Renshaw']


In [24]:
train_tokens.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [25]:
#Train data
batch_size = 1
train_data = TokenData(train = True)
train_dataset = DataLoader(train_data, shuffle=True, batch_size=batch_size)

In [26]:
#Test data
batch_size = 1
test_data = TokenData(train = False)
test_dataset = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [27]:
extended_model = ExtendedBertClassification(x)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# optim func
optimizer = torch.optim.AdamW(extended_model.parameters(), lr=0.01)
# loss func
loss_fn = torch.nn.MSELoss()

In [31]:
epochs = 3
iter_number = 0
iter_number2 = 0
for epoch in range(epochs):
    extended_model.train()
    for i, values in enumerate(train_dataset):
        #REMOVE IF BELOW IF NEEDED
        if iter_number > 20:
            break

        optimizer.zero_grad()

        outputs = extended_model(values['input_ids'], values['attention_mask'])

        actual_y = torch.tensor(values['labels'])#.type(torch.long)
        print(outputs, actual_y)
        # break
        loss = loss_fn(outputs, actual_y)

        print(loss.item())

        loss.backward()

        optimizer.step()

        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / batch_size


        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
        iter_number += 1

    extended_model.eval()
    for i, values in enumerate(test_dataset):
        #REMOVE IF BELOW IF NEEDED
        if iter_number2 > 20:
            break

        with torch.no_grad():
            outputs = extended_model(values['input_ids'], values['attention_mask'])

        actual_y = torch.tensor(values['labels'])#.type(torch.long)

        print(outputs, actual_y)

        loss = loss_fn(outputs, actual_y)

        print(loss.item())

        test_batch_loss = loss.item()
        test_last_loss = test_batch_loss / batch_size


        print('Test batch {} last loss: {}'.format(i + 1, test_last_loss))

        iter_number2 += 1


  val1 = torch.tensor(input_ids[0][multiplier*512:(multiplier+1)*512])
  val2 = torch.tensor(attention_mask[0][multiplier*512:(multiplier+1)*512])
  actual_y = torch.tensor(values['labels'])#.type(torch.long)


tensor([0.1678], grad_fn=<ViewBackward0>) tensor([4.])
14.685408592224121
Training batch 1 last loss: 14.685408592224121
tensor([1.4817], grad_fn=<ViewBackward0>) tensor([3.])
2.3052480220794678
Training batch 2 last loss: 2.3052480220794678
tensor([49.9564], grad_fn=<ViewBackward0>) tensor([4.])
2111.989013671875
Training batch 3 last loss: 2111.989013671875
tensor([5.8807], grad_fn=<ViewBackward0>) tensor([6.])
0.01423769723623991
Training batch 4 last loss: 0.01423769723623991
tensor([16.8523], grad_fn=<ViewBackward0>) tensor([9.])
61.658447265625
Training batch 5 last loss: 61.658447265625
tensor([-0.7422], grad_fn=<ViewBackward0>) tensor([3.])
14.003778457641602
Training batch 6 last loss: 14.003778457641602
tensor([-0.2009], grad_fn=<ViewBackward0>) tensor([4.])
17.64727783203125
Training batch 7 last loss: 17.64727783203125
tensor([0.5707], grad_fn=<ViewBackward0>) tensor([7.])
41.33552932739258
Training batch 8 last loss: 41.33552932739258
tensor([0.7281], grad_fn=<ViewBackward

  actual_y = torch.tensor(values['labels'])#.type(torch.long)


tensor([5.4102]) tensor([7.])
2.527327299118042
Test batch 1 last loss: 12.486634254455566
tensor([5.4102]) tensor([4.])
1.9887853860855103
Test batch 2 last loss: 12.486634254455566
tensor([5.4102]) tensor([5.])
0.1682993471622467
Test batch 3 last loss: 12.486634254455566
tensor([5.4102]) tensor([3.])
5.809271335601807
Test batch 4 last loss: 12.486634254455566
tensor([5.4102]) tensor([6.])
0.34781327843666077
Test batch 5 last loss: 12.486634254455566
tensor([5.4102]) tensor([2.])
11.62975788116455
Test batch 6 last loss: 12.486634254455566
tensor([5.4102]) tensor([3.])
5.809271335601807
Test batch 7 last loss: 12.486634254455566
tensor([5.4102]) tensor([6.])
0.34781327843666077
Test batch 8 last loss: 12.486634254455566
tensor([5.4102]) tensor([6.])
0.34781327843666077
Test batch 9 last loss: 12.486634254455566
tensor([5.4102]) tensor([8.])
6.706840991973877
Test batch 10 last loss: 12.486634254455566
tensor([5.4102]) tensor([5.])
0.1682993471622467
Test batch 11 last loss: 12.4866