In [8]:
import random
import os
import pandas as pd
import logging
from tqdm import tqdm, trange
import time
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from transformers import AdamW, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
#df = pd.read_csv('drive/MyDrive/NLP-project-2022/yelp/clean_text_unigrams_r.csv')
df = pd.read_csv('clean_text_unigrams_r.csv')
df.head()

Unnamed: 0,sentence,sentiment,c1,a1,r1,del_sent,defr_sent
0,excellent food .,POS,food .,excellent,,<POS><CON_START>food .<START>excellent food .<...,<ATTR_WORDS>excellent<CON_START>food .<START>e...
1,superb customer service .,POS,customer service .,superb,,<POS><CON_START>customer service .<START>super...,<ATTR_WORDS>superb<CON_START>customer service ...
2,they also have daily specials and ice cream wh...,POS,they also have daily specials and ice cream wh...,,,<POS><CON_START>they also have daily specials ...,<ATTR_WORDS>nan<CON_START>they also have daily...
3,it 's a good toasted hoagie .,POS,it 's a good toasted hoagie .,,,<POS><CON_START>it 's a good toasted hoagie .<...,<ATTR_WORDS>nan<CON_START>it 's a good toasted...
4,the staff is friendly .,POS,the staff is .,friendly,hostile,<POS><CON_START>the staff is .<START>the staff...,<ATTR_WORDS>friendly<CON_START>the staff is .<...


In [5]:
from sklearn.model_selection import train_test_split
inputs = df['del_sent'].to_list()
y = [1 for i in range(len(inputs))]
train_ds, eval_ds, y_train, y_test = train_test_split(inputs, y, test_size=0.33, random_state=42)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config.vocab_size

50257

In [5]:
special_tokens = ['<POS>', '<NEG>','<CON_START>','<START>','<END>','<PAD>']
special_tokens_dict = {'additional_special_tokens': special_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
model.config.vocab_size

50262

In [6]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [9]:
def tokenize_and_encode(lines):
        '''
        This method tokenizes the input data and encodes it using the OpenAIGPTTokenizer
        :param file_path: Path of the input file, dtype: str
        :return: encoded dataset  dtype: list
        '''
        tokenized_dataset = lines
        for i, line in enumerate(tqdm(lines)):
            token = tokenizer.tokenize(line)[:512]
            tokenized_dataset[i] = tokenizer.convert_tokens_to_ids(token)
        return tokenized_dataset

In [10]:
train_dataset = tokenize_and_encode(train_ds)
eval_dataset = tokenize_and_encode(eval_ds)
input_length = max(max(len(t) for t in train_dataset), max(len(q) for q in eval_dataset))
print('input length ', input_length)
input_length = min(input_length, 85)
print('input length ', input_length)

100%|████████████████████████████████████████████████████████████████████████| 296984/296984 [01:19<00:00, 3756.64it/s]
100%|████████████████████████████████████████████████████████████████████████| 146277/146277 [00:39<00:00, 3690.67it/s]


input length  52
input length  52


In [7]:
pad_token_id = tokenizer.convert_tokens_to_ids(['<PAD>'])[0]

In [84]:
def pre_process_dataset(encoded_dataset, input_length, start_token_id,pad_token_id=pad_token_id):
        """
        This method is to create torch tensor of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :param start_token_id: id of the '<START>' token, dtype: int
        :return: torch.tensor of size [len(encoded_dataset), 2]
        """

        n_batch = len(encoded_dataset)
        input_ids = np.full(shape=(n_batch, input_length), fill_value=pad_token_id, dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length), fill_value=-100, dtype=np.int64)

        for i, tokens in enumerate(encoded_dataset):
            try:
                #tokens = tokens[:input_length]
                start_id_index = tokens.index(start_token_id)
                input_ids[i, :len(tokens)] = tokens
                start_id_index = tokens.index(start_token_id)
                lm_labels[i, start_id_index : len(tokens)-1] = tokens[start_id_index + 1: len(tokens)]
                # LM loss calculate only for tokens after <START> token in the sentence
                #lm_labels[i, :len(tokens)-1] = tokens[1:]
            except ValueError:
                print("Index {} doesn't have start token".format(i))

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        #tensor_dataset.append(torch.tensor(d) for d in all_inputs)

        return tensor_dataset

In [85]:
train_batch_size = 32
num_train_epochs = 2
learning_rate = 6.25e-5
warmup_proportion = 0.002
max_grad_norm = 1
weight_decay = 0.01
n_gpu = 1

In [86]:
train_tensor_dataset = pre_process_dataset(train_dataset, input_length, start_token_id=start_token_id)
eval_tensor_dataset = pre_process_dataset(eval_dataset, input_length, start_token_id=start_token_id)

print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
print("Training Example Language Modeling ids = {}".format(train_tensor_dataset[1][0]))
time.sleep(10)
train_data = TensorDataset(*train_tensor_dataset)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

eval_data = TensorDataset(*eval_tensor_dataset)
eval_sampler = RandomSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=train_batch_size)


Training Example Input ids= tensor([50257, 50259,   392,   340,   705,    82,  5968,    64,   922,  5145,
        50260,   392,   340,   705,    82,  5968,    64,   922,  5145, 50261,
        50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262,
        50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262,
        50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262, 50262,
        50262, 50262])
Training Example Language Modeling ids = tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          392,   340,   705,    82,  5968,    64,   922,  5145, 50261,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100])


In [14]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
num_train_optimization_steps = len(train_data) * num_train_epochs //train_batch_size
warmup_steps = num_train_optimization_steps * warmup_proportion
optimizer = AdamW(optimizer_grouped_parameters,
                           lr=learning_rate,
                           weight_decay=weight_decay)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
)



In [27]:
do_train = True
output_dir = 'yelp/models'
do_eval = True

In [None]:
if do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                #print(input_ids.shape)
                loss = model(input_ids, labels=lm_labels)[0]
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                #if(nb_tr_steps%100==0):
                tqdm_bar.desc = f"Training loss: {exp_average_loss:.2e}"

            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(output_dir, "pytorch_model_zero_grad.bin")
            config = model.module.config if hasattr(model, 'module') else model.config
            torch.save(model_to_save.state_dict(), output_model_file)

            

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training loss: 5.53e-01:  39%|███▉      | 3621/9281 [51:35<1:20:39,  1.17it/s][A
Training loss: 4.92e-01:  39%|███▉      | 3622/9281 [51:36<1:20:39,  1.17it/s][A
Training loss: 4.38e-01:  39%|███▉      | 3623/9281 [51:36<1:20:40,  1.17it/s][A
Training loss: 4.89e-01:  39%|███▉      | 3624/9281 [51:37<1:20:32,  1.17it/s][A
Training loss: 4.83e-01:  39%|███▉      | 3625/9281 [51:38<1:20:32,  1.17it/s][A
Training loss: 4.66e-01:  39%|███▉      | 3626/9281 [51:39<1:20:38,  1.17it/s][A
Training loss: 4.83e-01:  39%|███▉      | 3627/9281 [51:40<1:20:22,  1.17it/s][A
Training loss: 4.70e-01:  39%|███▉      | 3628/9281 [51:41<1:20:37,  1.17it/s][A
Training loss: 4.88e-01:  39%|███▉      | 3629/9281 [51:42<1:20:28,  1.17it/s][A
Training loss: 4.53e-01:  39%|███▉      | 3630/9281 [51:42<1:20:20,  1.17it/s][A
Training loss: 5.43e-01:  39%|███▉      | 3631/9281 [51:43<1:20:16,  1.17it/s][A
Training loss: 5.66e-01:  39%|███

In [None]:
output_model_file = os.path.join(output_dir, "pytorch_model_zero_grad.bin")

In [60]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

#model_state_dict = torch.load('yelp_models/pytorch_model_zero_grad_1.bin', map_location='cpu')
model_state_dict = torch.load('yelp_models/pytorch_model_attr_1e.bin', map_location='cpu')

special_tokens = ['<POS>', '<NEG>','<CON_START>','<START>','<END>','<ATTR_WORDS>','<PAD>']
special_tokens_dict = {'additional_special_tokens': special_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
model.resize_token_embeddings(len(tokenizer))

model.load_state_dict(model_state_dict)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [61]:
max_seq_len=70
sm = torch.nn.Softmax(dim=-1)

In [62]:
end_token_id = tokenizer.convert_tokens_to_ids(['<END>'])[0]
end_token_id 

50261

In [63]:
def preditction_with_beam_search(ref_text, beam_width=3, max_len=30,vocab_length=40483, end_token_id=end_token_id ):
    """
    This function decodes sentences using Beam Seach. 
    It will output #sentences = beam_width. This function works on a single example.
    
    ref_text : string : Input sentence
    beam_width : int : Width of the output beam
    vocab_length : int : Size of the Vocab after adding the special tokens
    """
    
    done = [False for i in range(beam_width)] # To track which beams are already decoded
    stop_decode = False
    decoded_sentences=[] # List of decoded sentences at any given time
    
    sm = torch.nn.Softmax(dim=-1) # To calculate Softmax over the final layer Logits
    tokens = tokenizer.tokenize(ref_text) # Tokenize the input text
    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) # Convert tokens to ids
    index_tokens = [indexed_tokens for i in range(beam_width)] # Replication of Input ids for all the beams

    #index_tokens = [indexed_tokens for i in range(beam_width)]
    torch_tensor = torch.tensor(index_tokens).to(device)
    beam_indexes = [[] for i in range(beam_width)] # indexes of the current decoded beams
    best_scoes = [0 for i in range(beam_width)] # A list of lists to store Probability values of each decoded token of best beams
    count = 0
    while count < max_len and not stop_decode:
        if count == 0: # For the first step when only one sentence is availabe
            with torch.no_grad():
                # Calculate output probability distribution over the Vocab,
                output = model(torch_tensor)
                preds = sm(output[0]) #  shape = [beam_bidth, len(input_sen)+1,Vocab_length]
            top_v, top_i = preds[:,-1,:].topk(beam_width) # Fatch top indexes and it's values
            [beam_indexes[i].append(top_i[0][i].tolist()) for i in range(beam_width)] # Update the Beam indexes
            # Update the best_scores, for first time just add the topk values directly
            for i in range(beam_width):
                best_scoes[i] = top_v[0][i].item()
            count += 1
        else: # After first step
            # Prepare the current_state by concating original input and decoded beam indexes
            current_state = torch.cat((torch_tensor, torch.tensor(beam_indexes).to(device)), dim=1)
            # Prediction on the current state
            with torch.no_grad():
                outputs = model(current_state)
                preds = sm(outputs[0])
            # Multiply new probability predictions with corresponding best scores
            # Total socres = beam_width * Vocab_Size
            flatten_score = (preds[:,-1,:]*torch.tensor(best_scoes).to(device).unsqueeze(1)).view(-1)
            # Fatch the top scores and indexes 
            vals, inx = flatten_score.topk(beam_width)
            # best_score_inx saves the index of best beams after multiplying the probability of new prediction
            best_scoes_inx = (inx / vocab_length).tolist()
            best_scoes = vals.tolist()
            # Unflatten the index 
            correct_inx = (inx % vocab_length).tolist()
            
            # Check if done for all the Beams
            for i in range(beam_width):
                if correct_inx[i] == end_token_id:
                    done[i] = True
            # Update the best score for each the current Beams
            for i in range(beam_width):
                if not done[i]:
                    best_scoes[i] = vals.tolist()[i]
            # Check is All the Beams are Done
            if (sum(done) == beam_width):
                stop_decode = True
            # Prepapre the new beams
            temp_lt=[0 for i in range(beam_width)]
            for i,x in enumerate(best_scoes_inx):
                temp_lt[i] = beam_indexes[i] + [correct_inx[i]]
            # Update the Beam indexes
            beam_indexes = temp_lt
            del temp_lt
            count += 1
    # Decode All the beam indexes to till <END> token only and convert into sentence
    for i in range(beam_width):
        try:
            end_index = beam_indexes[i].index(end_token_id )
        except ValueError:
            end_index = len(beam_indexes[i])
            
        decoded_sentences.append(tokenizer.decode(beam_indexes[i][:end_index]))
        
    return decoded_sentences

In [68]:
tokenizer.decode(start_token_id)

'<START>'

In [90]:
end_token_id = tokenizer.convert_tokens_to_ids(['<END>'])[0]

In [4]:
import pandas as pd
df_gpt2_cocon = pd.read_csv('yelp_models/cocon_gpt_preds.csv')
df_gpt2_cocon.head()

Unnamed: 0,input,gold,content,pred1,pred2,pred_mp,plain_pred1,plain_pred2,plain_pred_mp,both_3l_pred1,both_3l_pred2,both_3l_pred_mp
0,<POS> <CON_START> i recommend checking this pl...,i highly recommend checking this place out.,highly,.always actors fog fog. fog. fog.,always proposal great actors. fogalways actor...,small definitely. fantastic.always.they recom...,highly highly checking place. highly highly t...,recommenditageitageitageitageitageitageitagei...,highly highly checking place. highly highly t...,recommend this out.itage actorsigaitage love ...,definitely brushelled highly. highly recommen...,recommend this out you. main mindiquette love...
1,"<NEG> <CON_START> not only is there pizza, but...","not only is there pizza bad, but their custome...",bad horrible,.ues.always actors fog. actors. actors,alwaysalways actorsgreat. great actorsalways ...,.again.wrong always too rude always.att,only is pizza but customer is. Kurd not is,"even there qualified, wantingetooth wanting w...","only is pizza, their service horrible horribl...",also there'); is pizza but customer is..,only is pizza mit decrease collective decreas...,also there is pizza but customer isable <END>...
2,<POS> <CON_START> cool tram that has views goi...,cool tram that has great views going up or dow...,great,always actors actors actors actors actors Kur...,love great always always fog always actors Ku...,owned awesome.keep no months.cious best love,tram has views up down down theitt extended e...,cool always great Mayor MayorGood Mayor pesti...,ixed tram has going or of psburgh skyline <END>,great tram has views up down down the of p,cool Mayor Mayor MayorGood or hotel down pestial,cool that views going or of psburgh skyline the
3,<POS> <CON_START> she was! <START>,she was fantastic!,fantastic,always!always! adulthood always actors adulth...,love. travel.always removing. travel..,spot! delicious always great <END> excellent...,was!she amazing Kurd!she amazing Kurd Kurd,she! Kurd scarthink Quebecthink awesome fogthink,was!she phenomenal <END> was! <END> was!,was!she was! gene! Kurd was!,she. she scar. Kurd actors gene!doors,you was!.. <PAD> <PAD> <PAD> <PAD> <PAD>
4,<NEG> <CON_START> however the food - oh the fo...,however the food - oh the food : ( - i was dis...,disappointed,always. actors actors great. actors actors.al...,. Portlandalways.. Portlandalways.! always,unfortunately.best.worst.always. awful.,ever food oh food ( - i disappointed disappoin...,ever thearia the bad i - scar forecast disapp...,ever food oh food food disgusting ( i sgy,the - the : - food i i WAR.,love food Montgomery Montgomery i tempor temp...,them- food oh nasty food oh terrible!!


In [57]:
rets = []
for index,row in df_gpt2_cocon.iterrows():
    x = str(row['gold'])
    if(x[:-2]!=' .' or x[:-2]!=' !'):
        x = x[:-1]+' '+x[-1]
        print(x)
    ret = df.loc[df['sentence'] ==x]#['defr_sent'].to_list()[0]
    if(ret.empty):
        print(index)
        rets.append('<ATTR_WORDS>'+str(row['content'])+row['input'][6:])
    else:
        rets.append(ret['defr_sent'].to_list()[0])
    #rets.append(ret)
#print(len(rets), len(df_gpt2_cocon))
df_gpt2_cocon['input_ret'] = rets

i highly recommend checking this place out .
not only is there pizza bad, but their customer service is horrible .
1
cool tram that has great views going up or down of the pittsburgh skyline .
she was fantastic !
however the food - oh the food : ( - i was disappointed .
service was excellent, food is good, a great locals place .
5
i recommend them hands down as the best hometown dealer in the valley .
not a fancy place but a great place to get good tasting food .
i've had breakfast and dinner here and it has always been good .
8
tessaro's is my favorite burger spot in the city .
9
ridiculous !
laid back, great beer, and a menu packed with variety for everyone .
11
our server was a delightfully charming young lady, willing to answer many questions .
12
this place is awesome .
just got done with lunch and service was horrible .
me and my boyfriend both loved the crust !
fantastic !
they have quite a few rental guns and some pretty friendly staff .
great place for lunch or breakfast .
i w

In [11]:
df.head()

Unnamed: 0,sentence,sentiment,c1,a1,r1,del_sent,defr_sent
0,excellent food .,POS,food .,excellent,,<POS><CON_START>food .<START>excellent food .<...,<ATTR_WORDS>excellent<CON_START>food .<START>e...
1,superb customer service .,POS,customer service .,superb,,<POS><CON_START>customer service .<START>super...,<ATTR_WORDS>superb<CON_START>customer service ...
2,they also have daily specials and ice cream wh...,POS,they also have daily specials and ice cream wh...,,,<POS><CON_START>they also have daily specials ...,<ATTR_WORDS>nan<CON_START>they also have daily...
3,it 's a good toasted hoagie .,POS,it 's a good toasted hoagie .,,,<POS><CON_START>it 's a good toasted hoagie .<...,<ATTR_WORDS>nan<CON_START>it 's a good toasted...
4,the staff is friendly .,POS,the staff is .,friendly,hostile,<POS><CON_START>the staff is .<START>the staff...,<ATTR_WORDS>friendly<CON_START>the staff is .<...


In [59]:
input_ids = df_gpt2_cocon['input_ret']

In [58]:
def generate_gpt(inp,gen_len=30):
    input_token = torch.tensor(tokenizer.encode(inp))
    if(len(input_token.shape)<3):
        input_token = input_token.unsqueeze(0) #batch dim

    #Repeat for history TO DO
    #implement auto regression TODO
    input_token = input_token.to(device)
    l = len(input_token[0])
    for i in range(gen_len):
        #L_alpha
        with torch.no_grad():
            output = model(input_token)

            pred_token_logits = output[0][:,-1:] 
        #softmax
        pred_token_prob = torch.nn.functional.softmax(pred_token_logits, dim=-1)#[:,-1,:]
        #sample
        pred_token = torch.multinomial(pred_token_prob[0], num_samples=1) #repeat for every elem in batch
        #append
        input_token = torch.cat((input_token,pred_token),1)
        #decode
    #pred_text = tokenizer.decode(input_token)
    return input_token, [tokenizer.decode(i) for i in input_token[:,l:]]

In [65]:
op = generate_gpt(input_ids[0],gen_len=10)
print(input_ids[0])
print(op[1])

<ATTR_WORDS>highly<CON_START>i recommend checking this place out .<START>i highly recommend checking this place out .<END>
[' highly recommend this out <END> <END>  recommend place. <END>']


In [66]:

outp = []
outs1 = []
outs2 = []
#tqdm_bar = tqdm(eval_dataloader, desc="batch iteration")
for i in range(len(input_ids)):
    input_ = input_ids[i]
    op=preditction_with_beam_search(input_,2, 10)
    outs1.append(op[0])
    outs2.append(op[1])
    op = generate_gpt(input_,gen_len=10)
    outp.append(op[1][0])


df_gpt2_cocon['plain_r_pred1'] = outs1
df_gpt2_cocon['plain_r_pred2'] = outs2
df_gpt2_cocon['plain_r_pred_mp'] = outp

    #df_gpt2.to_csv('yelp_model/plain_gpt.csv', index=False)
df_gpt2_cocon.head()

Unnamed: 0,input,gold,content,pred1,pred2,pred_mp,plain_pred1,plain_pred2,plain_pred_mp,both_3l_pred1,both_3l_pred2,both_3l_pred_mp,input_ret,plain_r_pred1,plain_r_pred2,plain_r_pred_mp
0,<POS> <CON_START> i recommend checking this pl...,i highly recommend checking this place out.,highly,.always actors fog fog. fog. fog.,always proposal great actors. fogalways actor...,small definitely. fantastic.always.they recom...,highly highly checking place. highly highly t...,recommenditageitageitageitageitageitageitagei...,highly highly checking place. highly highly t...,recommend this out.itage actorsigaitage love ...,definitely brushelled highly. highly recommen...,recommend this out you. main mindiquette love...,<ATTR_WORDS>highly<CON_START>i recommend check...,highly recommend this out Kurd Kurd Kurd Kurd...,recommend check municip hal municip.doorstics...,highly recommend this out <END> <END> recomm...
1,"<NEG> <CON_START> not only is there pizza, but...","not only is there pizza bad, but their custome...",bad horrible,.ues.always actors fog. actors. actors,alwaysalways actorsgreat. great actorsalways ...,.again.wrong always too rude always.att,only is pizza but customer is. Kurd not is,"even there qualified, wantingetooth wanting w...","only is pizza, their service horrible horribl...",also there'); is pizza but customer is..,only is pizza mit decrease collective decreas...,also there is pizza but customer isable <END>...,<ATTR_WORDS>bad horrible<CON_START> not only i...,only there pizza but customer is. not pizza but,just is bad Bengals experiments experiments e...,"only there air, their service horrible horrib..."
2,<POS> <CON_START> cool tram that has views goi...,cool tram that has great views going up or dow...,great,always actors actors actors actors actors Kur...,love great always always fog always actors Ku...,owned awesome.keep no months.cious best love,tram has views up down down theitt extended e...,cool always great Mayor MayorGood Mayor pesti...,ixed tram has going or of psburgh skyline <END>,great tram has views up down down the of p,cool Mayor Mayor MayorGood or hotel down pestial,cool that views going or of psburgh skyline the,<ATTR_WORDS>great<CON_START>cool tram that has...,tram has going or of psburgh.cool that,tr does views wanting down impression impress...,tram has going or of psburgh.cool.
3,<POS> <CON_START> she was! <START>,she was fantastic!,fantastic,always!always! adulthood always actors adulth...,love. travel.always removing. travel..,spot! delicious always great <END> excellent...,was!she amazing Kurd!she amazing Kurd Kurd,she! Kurd scarthink Quebecthink awesome fogthink,was!she phenomenal <END> was! <END> was!,was!she was! gene! Kurd was!,she. she scar. Kurd actors gene!doors,you was!.. <PAD> <PAD> <PAD> <PAD> <PAD>,<ATTR_WORDS>fantastic<CON_START>she was !<STAR...,was! Kurd! Kurd! Kurd Kurd Kurd Kurd,were. gene Kurddoors Kurd Keithdoorsdoorsdoors,was! <END> was! <END> was! <END> was
4,<NEG> <CON_START> however the food - oh the fo...,however the food - oh the food : ( - i was dis...,disappointed,always. actors actors great. actors actors.al...,. Portlandalways.. Portlandalways.! always,unfortunately.best.worst.always. awful.,ever food oh food ( - i disappointed disappoin...,ever thearia the bad i - scar forecast disapp...,ever food oh food food disgusting ( i sgy,the - the : - food i i WAR.,love food Montgomery Montgomery i tempor temp...,them- food oh nasty food oh terrible!!,<ATTR_WORDS>disappointed<CON_START>however the...,ever food oh food ( - the : i.,everServ headquartersnm ohServ �Serv was �,ever food oh food ( - the : was i


In [22]:
df_gpt2_cocon.to_csv('yelp_models/cocon_gpt_preds.csv', index=False)
df_gpt2_cocon.head()

Unnamed: 0,input,gold,content,pred1,pred2,pred_mp,plain_pred1,plain_pred2,plain_pred_mp
0,<POS> <CON_START> i recommend checking this pl...,i highly recommend checking this place out.,highly,.always actors fog fog. fog. fog.,always proposal great actors. fogalways actor...,small definitely. fantastic.always.they recom...,highly highly checking place. highly highly t...,recommenditageitageitageitageitageitageitagei...,highly highly checking place. highly highly t...
1,"<NEG> <CON_START> not only is there pizza, but...","not only is there pizza bad, but their custome...",bad horrible,.ues.always actors fog. actors. actors,alwaysalways actorsgreat. great actorsalways ...,.again.wrong always too rude always.att,only is pizza but customer is. Kurd not is,"even there qualified, wantingetooth wanting w...","only is pizza, their service horrible horribl..."
2,<POS> <CON_START> cool tram that has views goi...,cool tram that has great views going up or dow...,great,always actors actors actors actors actors Kur...,love great always always fog always actors Ku...,owned awesome.keep no months.cious best love,tram has views up down down theitt extended e...,cool always great Mayor MayorGood Mayor pesti...,ixed tram has going or of psburgh skyline <END>
3,<POS> <CON_START> she was! <START>,she was fantastic!,fantastic,always!always! adulthood always actors adulth...,love. travel.always removing. travel..,spot! delicious always great <END> excellent...,was!she amazing Kurd!she amazing Kurd Kurd,she! Kurd scarthink Quebecthink awesome fogthink,was!she phenomenal <END> was! <END> was!
4,<NEG> <CON_START> however the food - oh the fo...,however the food - oh the food : ( - i was dis...,disappointed,always. actors actors great. actors actors.al...,. Portlandalways.. Portlandalways.! always,unfortunately.best.worst.always. awful.,ever food oh food ( - i disappointed disappoin...,ever thearia the bad i - scar forecast disapp...,ever food oh food food disgusting ( i sgy


In [97]:
df_gpt2.to_csv('yelp_models/plain_gpt_preds.csv', index=False)
df_gpt2.head()

Unnamed: 0,input,gold,pred1,pred2
0,<NEG> <CON_START> we where very and won't be b...,we where very disappointed and won't be back.,where disappointed very and dreams dreams be....,complained rude psychologicalcharge't be intr...
1,<POS> <CON_START> the company's service has be...,the company's service has been very prompt and...,company service always Warning always decreas...,service company been very decrease prompt hor...
2,<NEG> <CON_START> they are also super. <START>,they are also super slow.,are super. Kurd they also super. Kurd they,ude Fedriendancouver Fed Fed Fed fog Quin588
3,<POS> <CON_START> and they had service! <START>,and they had great service!,they great great great fog!and had service Kurd,and excellent margin service! Article decreas...
4,<POS> <CON_START> my sushi in las vegas. <START>,my favorite sushi in las vegas.,favorite spot lasgas laspertydictdict sets sets,initely sushipertypertyperty.. my my best
