In [68]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn import preprocessing

import re
import numpy as np
import transformers as transform
import torch

from torch.utils.data import DataLoader, TensorDataset

In [69]:
from sklearn.preprocessing import normalize

In [70]:
class Bert_Regression:
    """Data-loading and BERT bootstrap helpers for the reviewer-rating experiments.

    The class reads per-reviewer id / rating / review-text files from the
    ``scaledata`` and ``scale_whole_review`` directories (relative to the
    current working directory) and lazily loads a pretrained BERT tokenizer
    and model.

    Note: the ``initilize_*`` method names keep their original spelling so
    existing callers continue to work.
    """

    model_class = transform.BertModel
    tokenizer_class = transform.BertTokenizer
    pretrained_weights = 'bert-base-uncased'
    # Unset until the matching initilize_* method runs. None is the same
    # "not loaded" value that clear_model_and_tokenizer() resets these to.
    tokenizer = None
    model = None

    def initilize_tokenizer(self):
        """Load the pretrained tokenizer, store it on the instance and return it."""
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        return self.tokenizer

    def initilize_model(self):
        """Load the pretrained model, store it on the instance and return it."""
        self.model = self.model_class.from_pretrained(self.pretrained_weights)
        return self.model

    def clear_model_and_tokenizer(self):
        """Drop the instance's references to the tokenizer and the model."""
        self.tokenizer = None
        self.model = None

    @staticmethod
    def _read_review(path):
        """Read one cp1252-encoded review file and return its cleaned text.

        Cleaning replaces every non-word character with a space, removes all
        digit runs and lower-cases the result.
        """
        # The context manager closes the handle even if read() raises.
        with open(path, 'r', encoding='cp1252') as review_file:
            text = review_file.read()
        text = re.sub(r'[^\w]', ' ', text)
        text = re.sub(r'\d+', '', text)
        return text.lower()

    def get_reviewer_data(self, reviewer_names):
        """Load ids, cleaned review texts and ratings for the given reviewers.

        Parameters
        ----------
        reviewer_names : iterable of str
            Reviewer directory names, e.g. ``['Dennis+Schwartz']``. Consumed
            exactly once, so a generator is acceptable.

        Returns
        -------
        pandas.DataFrame
            Columns ``ids``, ``full_review`` and ``ratings`` with a fresh
            0..n-1 index; reviewers are stacked in the order given. Ids and
            ratings are paired purely by row position. An empty
            ``reviewer_names`` yields an empty frame with those three columns.

        Raises
        ------
        FileNotFoundError
            If an id, rating or review file is missing.
        """
        id_frames = []
        rating_frames = []

        for reviewer_name in reviewer_names:
            ids_path = 'scaledata/' + reviewer_name + '/id.' + reviewer_name
            ratings_path = 'scaledata/' + reviewer_name + '/rating.' + reviewer_name
            reviews_dir = 'scale_whole_review/' + reviewer_name + '/txt.parag'

            id_frame = pd.read_csv(ids_path, names=['ids'])
            # One text per row, in row order. Duplicate ids map to the same
            # file, so this matches a per-id masked assignment without the
            # quadratic scan.
            id_frame['full_review'] = [
                self._read_review(reviews_dir + '/' + str(review_id) + '.txt')
                for review_id in id_frame['ids']
            ]
            id_frames.append(id_frame)
            rating_frames.append(pd.read_csv(ratings_path, names=['ratings']))

        if not id_frames:
            return pd.DataFrame(columns=['ids', 'full_review', 'ratings'])

        # Concatenate once instead of growing a frame inside the loop.
        reviewer_ids = pd.concat(id_frames, axis=0, ignore_index=True)
        reviewer_ratings = pd.concat(rating_frames, axis=0, ignore_index=True)

        return pd.concat([reviewer_ids, reviewer_ratings], axis=1)

    def align_data(self, tokenized):
        """Stack variable-length token sequences into one rectangular frame.

        Parameters
        ----------
        tokenized : iterable of sequences
            One token-id sequence per sample (list, Series of lists, ...).

        Returns
        -------
        pandas.DataFrame
            One row per input sequence and one column per token position
            (0 .. longest sequence - 1); shorter sequences are padded with
            NaN. Empty input returns an empty frame.
        """
        # Iterate values directly so a Series with a non-default index works,
        # and build the frame in a single pass.
        rows = [list(sequence) for sequence in tokenized]
        return pd.DataFrame(rows)
                

In [71]:
class TextClassificationModel(torch.nn.Module):
    """Feed-forward network mapping a fixed-length feature vector to one scalar.

    Despite the class name, the last layer has a single output unit with no
    activation, so the network emits one unbounded value per sample.

    Parameters
    ----------
    input_dim : int, default 3210
        Length of each input vector; must equal the padded token length used
        when building the dataset.
    hidden_dim : int, default 1500
        Width of the first hidden layer.
    bottleneck_dim : int, default 9
        Width of the layer feeding the final scalar output.
    dropout_p : float, default 0.2
        Dropout probability applied to the input features.
    """

    def __init__(self, input_dim=3210, hidden_dim=1500, bottleneck_dim=9, dropout_p=0.2):
        super().__init__()
        # Attribute names and creation order are kept so state_dict keys and
        # seeded weight initialisation stay the same as before.
        self.dropout = torch.nn.Dropout(p=dropout_p)
        self.linear1 = torch.nn.Linear(input_dim, hidden_dim)
        self.ReLu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(hidden_dim, bottleneck_dim)
        self.linear3 = torch.nn.Linear(bottleneck_dim, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` prediction for a ``(batch, input_dim)`` input."""
        x = self.dropout(x)
        x = self.linear1(x)
        x = self.ReLu(x)
        # No activation sits between linear2 and linear3.
        x = self.linear2(x)
        x = self.linear3(x)

        return x

In [72]:
#Training dataset
x = Bert_Regression()
reviewers_data = x.get_reviewer_data(['Dennis+Schwartz'])
#James+Berardinelli Dennis+Schwartz Scott+Renshaw
tokenizer = x.initilize_tokenizer()
# Pad AND truncate every review to exactly 3210 ids (the input width of the
# network's first layer). Without truncation a longer review yields a longer
# list, and ragged lists cannot be stacked into one tensor.
reviewers_data['tokenized_data'] = reviewers_data['full_review'].apply(
    lambda review: tokenizer.encode(
        review,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=3210,
    )
)
# .tolist() hands torch plain Python lists instead of a pandas Series, so the
# conversion does not depend on the frame's index labels.
# normalize() uses its defaults here: each row is scaled to unit L2 norm.
tokens = torch.nn.functional.normalize(
    torch.tensor(reviewers_data['tokenized_data'].tolist(), dtype=torch.float32)
)
# Targets get a trailing axis so each one has shape (1,), matching the model's single output.
ratings = torch.unsqueeze(
    torch.tensor(reviewers_data['ratings'].tolist(), dtype=torch.float32), dim=1
)
tensor_dataset = TensorDataset(tokens, ratings)
data_loader = DataLoader(tensor_dataset, batch_size=1, shuffle=True)

In [73]:
#Test dataset
x_test = Bert_Regression()
reviewers_data_test = x_test.get_reviewer_data(['Scott+Renshaw'])
tokenizer = x_test.initilize_tokenizer()
# Pad AND truncate every review to exactly 3210 ids (the input width of the
# network's first layer). Without truncation a longer review yields a longer
# list, and ragged lists cannot be stacked into one tensor.
reviewers_data_test['tokenized_data'] = reviewers_data_test['full_review'].apply(
    lambda review: tokenizer.encode(
        review,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=3210,
    )
)
# .tolist() hands torch plain Python lists instead of a pandas Series, so the
# conversion does not depend on the frame's index labels.
# normalize() uses its defaults here: each row is scaled to unit L2 norm.
tokens_test = torch.nn.functional.normalize(
    torch.tensor(reviewers_data_test['tokenized_data'].tolist(), dtype=torch.float32)
)
# Targets get a trailing axis so each one has shape (1,), matching the model's single output.
ratings_test = torch.unsqueeze(
    torch.tensor(reviewers_data_test['ratings'].tolist(), dtype=torch.float32), dim=1
)
tensor_dataset_test = TensorDataset(tokens_test, ratings_test)
# NOTE(review): shuffle=True is kept as-is; shuffling is not needed for evaluation.
test_loader_pt = DataLoader(tensor_dataset_test, batch_size=1, shuffle=True)

In [74]:
# Sanity check: print the first (features, rating) batch from the test loader, then stop.
for first_batch in test_loader_pt:
    x, y = first_batch
    print(x, y)
    break

tensor([[0.0005, 0.0224, 0.0125,  ..., 0.0000, 0.0000, 0.0000]]) tensor([[0.8000]])


In [75]:
# Sanity check: print the first (features, rating) batch from the training loader, then stop.
for first_batch in data_loader:
    x, y = first_batch
    print(x, y)
    break

tensor([[0.0005, 0.0122, 0.0240,  ..., 0.0000, 0.0000, 0.0000]]) tensor([[0.2000]])


In [76]:
# Instantiate the network and print its module summary (layer names and sizes).
model = TextClassificationModel()
print(model)

TextClassificationModel(
  (dropout): Dropout(p=0.2, inplace=False)
  (linear1): Linear(in_features=3210, out_features=1500, bias=True)
  (ReLu): ReLU()
  (linear2): Linear(in_features=1500, out_features=9, bias=True)
  (linear3): Linear(in_features=9, out_features=1, bias=True)
)


In [77]:
# Sanity check: print one training batch, showing the target with an extra axis inserted at position 1.
for first_batch in data_loader:
    X, y = first_batch
    print(X)
    print(y.unsqueeze(1))
    break

tensor([[0.0005, 0.0092, 0.0098,  ..., 0.0000, 0.0000, 0.0000]])
tensor([[[0.6000]]])


In [143]:
# Number of batches the training loader yields per pass over the dataset.
len(data_loader)

1027

In [144]:
# NOTE(review): `train_loss` is not assigned in any cell visible here — presumably it comes
# from a training-loop cell elsewhere; confirm this cell still works after Restart & Run All.
train_loss

27.33661734918644