In [1]:
 import torch
torch.cuda.is_available()

False

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
import logging
import math
import os

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers.activations import gelu_new
from transformers import GPT2Model, GPT2Config
import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple
import json
#import wget
import math

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.distributed import get_rank, get_world_size
from tqdm import tqdm, trange

from torch.nn import functional as F
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules import ModuleList
import copy

from transformers import AdamW, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

In [4]:
class Conv1D(nn.Module):
    def __init__(self, nx, nf):
        super().__init__()
        self.nf = nf
        w = torch.empty(nx, nf)
        nn.init.normal_(w, std=0.02)
        self.weight = nn.Parameter(w)
        self.bias = nn.Parameter(torch.zeros(nf))

    def forward(self, x):
        size_out = x.size()[:-1] + (self.nf,)
        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
        x = x.view(*size_out)
        return x
class FeedForward(nn.Module):
    def __init__(self, dropout, d_model=768, nx=768*4):
        super().__init__()
        self.c_fc    = Conv1D(d_model, nx)
        self.c_proj  = Conv1D(nx, d_model)
        self.act     = F.gelu
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.dropout(self.c_proj(self.act(self.c_fc(x))))
    


def _get_clones(module, n):
    return ModuleList([copy.deepcopy(module) for i in range(n)])
    
class Attention(nn.Module):
    def __init__(self, d_model=768, n_head=12, n_ctx=1024, d_head=64, bias=True, scale=False):
        super().__init__()
        self.n_head  = n_head
        self.d_model = d_model
        self.c_attn  = Conv1D(d_model, d_model*3)
        self.scale   = scale
        self.softmax = nn.Softmax(dim=-1)
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.dropout = nn.Dropout(0.1)
        self.c_proj  = Conv1D(d_model, d_model)
        
    def split_heads(self, x):
        "return shape [`batch`, `head`, `sequence`, `features`]"
        new_shape = x.size()[:-1] + (self.n_head, x.size(-1)//self.n_head) 
        x = x.view(*new_shape)
        return x.permute(0, 2, 1, 3) 
    
    def _attn(self, q, k, v, attn_mask=None):
        scores  = torch.matmul(q, k.transpose(-2, -1))
        if self.scale: scores = scores/math.sqrt(v.size(-1))
        nd, ns  = scores.size(-2), scores.size(-1)
        if attn_mask is not None: scores = scores + attn_mask
        scores  = self.softmax(scores)
        scores  = self.dropout(scores)
        outputs = torch.matmul(scores, v)
        return outputs
    
    def merge_heads(self, x):
        x         = x.permute(0, 2, 1, 3).contiguous()
        new_shape = x.size()[:-2] + (x.size(-2)*x.size(-1),)
        return x.view(*new_shape)
        
    def forward(self, x):
        x        = self.c_attn(x) #new `x` shape - `[1,3,2304]`
        q, k, v  = x.split(self.d_model, dim=2)
        q, k, v  = self.split_heads(q), self.split_heads(k), self.split_heads(v)
        out      = self._attn(q, k, v)
        out      = self.merge_heads(out)
        out      = self.c_proj(out)
        return out
    
class TransformerBlock(nn.Module):
    def __init__(self, d_model=768, n_head=12, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.attn        = Attention(d_model=768, n_head=12, d_head=64, n_ctx=1024, bias=True, scale=False)
        self.feedforward = FeedForward(dropout=0.1, d_model=768, nx=768*4)
        self.ln_1        = LayerNorm(d_model)
        self.ln_2        = LayerNorm(d_model)
                
    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.feedforward(self.ln_2(x))
        return x
    
class MyModel(nn.Module):
    def __init__(self, nlayers=12, n_ctx=1024, d_model=768, vcb_sz=50257):
        super(MyModel, self).__init__()
        self.nlayers = nlayers
        block        = TransformerBlock(d_model=768, n_head=12, dropout=0.1)
        self.h       = _get_clones(block, nlayers)
        self.wte     = nn.Embedding(vcb_sz, d_model)
        self.wpe     = nn.Embedding(n_ctx, d_model)
        self.drop    = nn.Dropout(0.1)
        self.ln_f    = LayerNorm(d_model)
        self.out     = nn.Linear(d_model, vcb_sz, bias=False)
        self.loss_fn = nn.CrossEntropyLoss()
        self.init_weights()
        #self.temp = TransformerBlockc()
        from transformers import GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        #self.n  = torch.tensor(tokenizer.encode('Negative')).type(torch.LongTensor)
    
    def init_weights(self):
        self.out.weight = self.wte.weight
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
    def prepare_embeds_inputs_for_generation(self, inputs_embeds, **kwargs):
        # only last token for inputs_ids if past is defined in kwargs
        if "past" in kwargs and kwargs["past"]:
            inputs_embeds = inputs_embeds[:, -1:, :]

        inputs = {"inputs_embeds": inputs_embeds}
        inputs.update(kwargs)
        return inputs

    def prepare_hidden_state_inputs_for_generation(self, input_hidden_state, **kwargs):
        # only last token for inputs_ids if past is defined in kwargs
        if "past" in kwargs and kwargs["past"]:
            input_hidden_state = input_hidden_state[:, -1:, :]

        inputs = {"input_hidden_state": input_hidden_state}
        inputs.update(kwargs)
        return inputs
            

    def forward_half1(self,src, labels=None, pos_ids=None, inputs_embeds=None):
        if(inputs_embeds): #x=src is input embeds
            if pos_ids is None: pos_ids = torch.arange(0, src.size(-2)).unsqueeze(0)
            wpe2 = nn.Embedding(src.size(-2), 768).to(device)
            pos_ids = pos_ids.to(device)
            position_embeds = wpe2(pos_ids).to(device)
            inp = self.drop(src + position_embeds)
            
        else:
            if pos_ids is None: pos_ids = torch.arange(0, src.size(-1)).unsqueeze(0)
            pos_ids = pos_ids.to(device)
            position_embeds = self.wpe(pos_ids)
            position_embeds=position_embeds.to(device)

            inp = self.drop((self.wte(src)+position_embeds))
            
        #inp = self.drop((self.wte(src)+self.wpe(pos_ids)))
        for i in range(6): inp = self.h[i](inp)
        return inp
        
    
    def forward_half2(self, inp,labels=None, pos_ids=None,lm_logit_first_index=0,lm_logit_last_index=-1,
                     lm_labels_first_index=1, lm_labels_last_index=None):
        for i in range(6,12): inp = self.h[i](inp)
        inp     = self.ln_f(inp)
        logits  = self.out(inp)
        outputs = (logits,) + (inp,)
        
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., lm_logit_first_index:lm_logit_last_index, :].contiguous() # default lm_logit_first_index=0, lm_logit_last_index=-1,
            shift_labels = labels[..., lm_labels_first_index:lm_labels_last_index].contiguous() # default lm_labels_first_index=1, lm_labels_last_index=None,

            loss_fct = CrossEntropyLoss()

            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            outputs = (loss,) + outputs

        return outputs,logits
    """
   
    def cocon(self,inp,content):
        #content = torch.tensor(self.tokenizer.encode(content))#.long()
        content = self.forward_half1(content)
        x  = self.temp(inp,content)
        h_t_2 = x[:,:-1,:]
        h_t_1 = x[:,-1,:]
        h_t_1 = torch.unsqueeze(h_t_1, 1)
        h_ = torch.cat([h_t_2,h_t_1],dim=1)
        return h_
        """ 
    
    def forward(self, x,labels=None,path='all',lm_logit_first_index=0,lm_logit_last_index=-1,
                     lm_labels_first_index=1, lm_labels_last_index=None, inputs_embeds=None):
        
        if path=='all':
            x = self.forward_half1(x,inputs_embeds=inputs_embeds)
            #x = self.cocon(x,content)
            x = self.forward_half2(x,labels)
        elif path=='half1':
            x = self.forward_half1(x,inputs_embeds=inputs_embeds)
        elif path=='half2':
            x = self.forward_half2(x,labels,lm_logit_first_index=0,lm_logit_last_index=-1,
                     lm_labels_first_index=1, lm_labels_last_index=None)
        #elif path=='cocon':
        #    x = self.cocon(x,content)
        else:
            raise NotImplementedError
        return x

In [5]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [6]:
model = MyModel()
model_dict = model.state_dict()

In [7]:
for param in model.parameters():
    #print(param)
    param.requires_grad = False

In [8]:
state_dict = torch.load('gpt_wt/gpt2-pytorch_model.bin') #pretrained weights

old_keys = []
new_keys = []
for key in state_dict.keys(): 
    if "mlp" in key: #The hugging face state dict references the feedforward network as mlp, need to replace to `feedforward` be able to reuse these weights
        new_key = key.replace("mlp", "feedforward")
        new_keys.append(new_key)
        old_keys.append(key)

for old_key, new_key in zip(old_keys, new_keys): 
    state_dict[new_key]=state_dict.pop(old_key)

pretrained_dict = {k: v for k, v in state_dict.items() if k in model_dict}

In [9]:
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)

<All keys matched successfully>

In [10]:
model = model.to(device)

In [11]:
model.eval()
model.zero_grad()

In [12]:
logger = logging.getLogger(__name__)

In [13]:
from transformers import get_linear_schedule_with_warmup
from transformers.modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer



In [14]:
class MLP(nn.Module):
    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        nx = config.n_embd
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.act = gelu_new
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)

In [15]:
class CoconAttention(nn.Module):
    def __init__(self, nx, n_ctx, config, scale=False):
        super().__init__()
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))

        self_token_mask = torch.ones(n_ctx, n_ctx)
        self_token_mask.fill_diagonal_(0)
        self.register_buffer("self_token_mask", self_token_mask.view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        self.ref_source_attn = Conv1D(n_state * 2, nx)
        self.c_attn = Conv1D(n_state * 3, nx) # input has dim of nx
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        mask = torch.ones(self.n_head, self.split_size // self.n_head)
        heads = set(heads) - self.pruned_heads  # Convert to set and emove already pruned heads
        for head in heads:
            # Compute how many pruned heads are before the head and move the index accordingly
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
        self.n_head = self.n_head - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, q, k, v, attention_mask=None, head_mask=None, cs_self_attn_mask_prob=0, history_seq_len=None, context_seq_present=True, context_seq_len=0, context_attn_bias=0, context_seq_len_list=None):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        nd, ns = w.size(-2), w.size(-1)
        b = self.bias[:, :, ns - nd : ns, :ns]
        w = w * b - 1e4 * (1 - b)

        # self_token_mask computation
        if cs_self_attn_mask_prob > 0 and context_seq_present:
            if history_seq_len == 0:
                history_seq_offset = 0
            else:
                history_seq_offset = history_seq_len - 1
            self_token_mask = self.self_token_mask[:, :, :nd, history_seq_offset:history_seq_offset+ns]
            self_token_mask = self_token_mask.repeat(w.shape[0],1,1,1)

            if cs_self_attn_mask_prob != 1:
                # compute unmasked indices
                self_token_unmask_prob = 1 - cs_self_attn_mask_prob
                unmask_prob_matrix = torch.full(self_token_mask.shape[:-1], self_token_unmask_prob)
                unmasked_indices = torch.bernoulli(unmask_prob_matrix).bool()
                self_token_mask[unmasked_indices] = 1

            w = w * self_token_mask - 1e4 * (1 - self_token_mask)
            
        
        if context_attn_bias != 0:
            if context_seq_len_list is None:
                context_attn_bias_mask = torch.ones(w.shape) # N, H, Q, V
                context_attn_bias_mask[:,:,:, :context_seq_len] = 0
                context_attn_bias_mask = context_attn_bias_mask.to(w.device)
                w = w + context_attn_bias * (1 - context_attn_bias_mask)     
            else:
                current_context_start_ind = 0
                for cs_ind, current_context_seq_len in enumerate(context_seq_len_list):
                    current_context_attn_bias = context_attn_bias[cs_ind]
                    context_attn_bias_mask = torch.ones(w.shape)
                    context_attn_bias_mask[:,:,:, current_context_start_ind:(current_context_start_ind+current_context_seq_len)] = 0
                    context_attn_bias_mask = context_attn_bias_mask.to(w.device)
                    w = w + current_context_attn_bias * (1 - context_attn_bias_mask)
                    current_context_start_ind = current_context_start_ind + current_context_seq_len

            
        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [torch.matmul(w, v)]
        if self.output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(self, x, context_seq, layer_past=None, attention_mask=None, head_mask=None, cs_self_attn_mask_prob=0, history_seq_len=None, context_attn_bias=0, context_seq_len_list=None):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)

        if context_seq is not None:
            context_seq_len = context_seq.shape[1]
            context_seq = self.ref_source_attn(context_seq)
            key_context_seq, value_context_seq = context_seq.split(self.split_size, dim=2)

            # Prepend keys and values with context_seq keys and values
            prepended_key = torch.cat([key_context_seq, key], dim=1)
            prepended_value = torch.cat([value_context_seq, value], dim=1)
            context_seq_present = True
        else:
            context_seq_len = 0
            prepended_key = key
            prepended_value = value
            context_seq_present = False

        query = self.split_heads(query)
        prepended_key = self.split_heads(prepended_key, k=True)
        prepended_value = self.split_heads(prepended_value)

        key = self.split_heads(key, k=True)
        value = self.split_heads(value)

        if layer_past is not None:
            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)

        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
        attn_outputs = self._attn(query, prepended_key, prepended_value, attention_mask, head_mask, cs_self_attn_mask_prob=cs_self_attn_mask_prob, history_seq_len=history_seq_len, context_seq_present=context_seq_present, 
                                    context_seq_len=context_seq_len, context_attn_bias=context_attn_bias, context_seq_len_list=context_seq_len_list)

        a = attn_outputs[0]
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)

        outputs = [a, present] + attn_outputs

        return outputs

In [16]:
class CoconBlock(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        super().__init__()
        logger.info( "CoconBlock initialized")
        nx = config.n_embd
        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)

        self.sos_h = nn.Parameter(torch.zeros(nx))
        self.mask_h = nn.Parameter(torch.zeros(nx))

        self.cocon_attn = CoconAttention(nx, n_ctx, config, scale)
        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * nx, config)
        self.instance_norm = nn.InstanceNorm1d(nx, affine=False, track_running_stats=False)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        
        self.config = config

        self.init_weights()

    def forward(self, x, context_seq=None, history_seq=None, layer_past=None, attention_mask=None, head_mask=None, include_sos_output=False, cs_masked_indices=None, tis_masked_indices=None, cs_self_attn_mask_prob=0, context_attn_bias=0, context_seq_len_list=None):
        if cs_masked_indices is not None and context_seq is not None:
            context_seq = context_seq.clone() # avoid overwrite original context_seq with mask_h
            context_seq[cs_masked_indices] = self.mask_h

        if tis_masked_indices is not None and x is not None:
            x = x.clone() # avoid overwrite original x with mask_h
            x[tis_masked_indices] = self.mask_h

        if history_seq is not None:
            history_seq_len = history_seq.shape[1]
            if x is not None:
                cocon_attn_input = torch.cat([history_seq, x], dim=1)
            else:
                cocon_attn_input = history_seq
        elif x is not None:
            history_seq_len = 0
            batch_size = x.shape[0]
            sos_h = self.sos_h.view(1, 1, -1).expand(batch_size, -1, -1)
            cocon_attn_input = torch.cat([sos_h, x], dim=1)

        x = cocon_attn_input


        cocon_attn_input_ln_1 = self.ln_1(cocon_attn_input)
        x_1_output = cocon_attn_input_ln_1

        output_attn = self.cocon_attn(
            x_1_output, context_seq, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask, cs_self_attn_mask_prob=cs_self_attn_mask_prob, history_seq_len=history_seq_len, 
            context_attn_bias=context_attn_bias, context_seq_len_list=context_seq_len_list
        )
        a = output_attn[0]  # output_attn: (a), present, (attentions)
        # H^L_preconv
        x = x + a

        # Skip history_seq computation if history_seq_len > 1
        if history_seq_len > 1:
            x = x[:, history_seq_len-1:]


        x_ln_2 = self.ln_2(x)
        x_2_output = x_ln_2
        m = self.mlp(x_2_output)
        # H^L
        x = x + m

        if include_sos_output:
            cocon_output = x
        else:
            cocon_output = x[:, 1:, :]

        return cocon_output


    def init_weights(self):
        """ Initialize weights if needed. """
        # Initialize weights
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm) and module.bias is not None:
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

In [17]:
config = GPT2Config.from_pretrained("gpt2", cache_dir='saved')

In [134]:
cocon_block = CoconBlock(config.n_ctx, config, scale=True)

In [135]:
## Trial naive sentiment
cocon_block.load_state_dict(torch.load('imdb_cocon/plain_4.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [20]:
cocon_block.load_state_dict(torch.load('imdb_cocon/modified_4.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [21]:
cocon_block = cocon_block.to(device)

In [None]:
#cocon_block = torch.load('cocon_block_pytorch_model.bin')#, map_location=‘cpu’) #try gpt medium

In [22]:
model.eval()
model.zero_grad()

cocon_block.eval()
cocon_block.zero_grad()
#cocon_block.train()

# DATASET

In [23]:
from datasets import load_dataset
datasets = load_dataset('imdb')


Reusing dataset imdb (C:\Users\aishu\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
#read text file and generate list of words
with open('/kaggle/input/sentiment-lexicon/positive-words.txt') as file:
    positive_con = [line.rstrip() for line in file if ';' not in line][1:]

with open('/kaggle/input/sentiment-lexicon/negative-words.txt') as file:
    negative_con = [line.rstrip() for line in file if ';' not in line][1:]
    

In [34]:
tokenizer.model_max_length

In [24]:
def to_one_hot(y, n_dims=None, debug=False):
    """ Take integer y (tensor or variable) with n dims and convert it to 1-hot representation with n+1 dims. """
    y_tensor = y
    y_tensor = y_tensor.type(torch.LongTensor).reshape(-1, 1)
    n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1
    y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)
    y_one_hot = y_one_hot.view(*y.shape, -1)

    if debug:
        y_compare = torch.argmax(y_one_hot, dim=-1)
        logger.info( "y_compare: {}".format(y_compare))
        logger.info( "u: {}".format(y))

    return y_one_hot

In [25]:
def collate(examples: List[torch.Tensor]):
    text = []
    content = []
    label = []
    for e in examples:
        text.append(e[0])
        content.append(e[1])
        label.append(e[2])
    content = pad_sequence(content, batch_first=True)
    label = pad_sequence(label, batch_first=True)
    if tokenizer._pad_token is None:
        text = pad_sequence(text, batch_first=True)
    else:
        text = pad_sequence(text, batch_first=True, padding_value=tokenizer.pad_token_id)
    return (text, content, label)
  #return (pad_sequence(text, batch_first=True, padding_value=tokenizer.pad_token_id), content)

In [None]:

"""def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [56]:
torch.save(cocon_block.state_dict(),'modified_6.pt')

In [None]:
model = TheModelClass(*args, **kwargs)
optimizer = TheOptimizerClass(*args, **kwargs)

checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

In [None]:
inp='The sun shines'
content = ''

In [88]:
def generate(inp,content=None,history=None, gen_len=30):
    input_token = torch.tensor(tokenizer.encode(inp))
    if(len(input_token.shape)<3):
        input_token = input_token.unsqueeze(0) #batch dim
    if(content):
        content_token = torch.tensor(tokenizer.encode(content))
        if(len(content_token.shape)<3):
            content_token = content_token.unsqueeze(0)
        #content_token = content_token.unsqueeze(0)
    #Repeat for history TO DO
    #implement auto regression TODO
    input_token = input_token.to(device)
    l = len(input_token[0])
    content_token = content_token.to(device)
    for i in range(gen_len):
        #L_alpha
        hidden_inp = model(input_token,path='half1')
        hidden_content = model(content_token, path='half1')
        #Cocon             other_context_cocon_hidden_states = cocon_block(cocon_th_gen_output, context_seq=original_context_seq_hidden_states, history_seq=other_sample_history_seq_hidden_states, include_sos_output=True,cs_self_attn_mask_prob=1)
        cout = cocon_block(hidden_inp, context_seq=hidden_content)
        output = model(cout, path='half2')
        pred_token_logits = output[1][:,-1:]
        #softmax
        pred_token_prob = torch.nn.functional.softmax(pred_token_logits, dim=-1)
        #sample
        pred_token = torch.multinomial(pred_token_prob[0], num_samples=1) #repeat for every elem in batch
        #append
        input_token = torch.cat((input_token,pred_token),1)
        #decode
    #pred_text = tokenizer.decode(input_token)
    return input_token, [tokenizer.decode(i) for i in input_token[:,l:]]

In [92]:
it, decoded = generate('The sun shines in the',content='positive')

In [93]:
decoded

[' proof comic a outtime Sith fro what includes Nero ofzech Bok took old many laganda threat for one mentioned and the AX Nero sub able because movies']

In [49]:
it.shape

In [35]:
decoded #observe model learnt to used words related to movies :)

['The sun shines in the Lawrenceivan came very before list and we season droppedTrYou hit president believe directorEnjoy observers kill was I everyone I hand the biears Everything guys leaveasted of simply willor though tiny big it thorionart rockediii Police. god a Nekingers throughout ablek thataris again hit mod! funny istt a were I u loved" members of The art or iterations is they noticed one the<|endoftext|> Away everything for R special before Im episodes can good LordH series take prettydissin who Studios']

In [61]:
it, decoded = generate('The sun shines in the',content='excellent perfect good lovely')
decoded #6

In [53]:
it, decoded = generate('The sun shines in the',content='excellent perfect good lovely')
decoded #4

In [44]:
it, decoded = generate('The sun shines in the',content='excellent perfect good lovely')
decoded #2

In [None]:

"""def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [28]:
class IMDBDatasettest(Dataset):
    def __init__(self, tokenizer: tokenizer, dataset=datasets['train'], 
                 cs_len=20, hs_len=10, tis_len=20, block_size=tokenizer.model_max_length, text_json_key="text", 
                 evaluate=False, prepended_text_to_remove=None):#, positive_con=positive_con, negative_con=negative_con):

        self.cs_len = cs_len
        self.hs_len = hs_len
        self.tis_len = tis_len

        if block_size is None:
            block_size = hs_len + max(cs_len, tis_len)
        self.block_size = block_size

        if evaluate and text_json_key != 'text':
            cached_features_file = os.path.join(
                'temp_data', "gpt2" + "_cached_cocon_" + str(block_size) + text_json_key + "_" + 'imdb'
            )
        else:
            cached_features_file = os.path.join(
                'temp_data',"gpt2" + "_cached_cocon_test" + str(block_size) + "_" + 'imdb'
            )
            cached_label_file = os.path.join(
                'temp_data',"gpt2" + "_cached_cocon_test" + str(block_size) + "_" + 'imdb_senti_naive'
            )

        if os.path.exists(cached_features_file):# and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            lines = dataset['text']
            logger.info("Creating features from dataset file at %s", 'temp_data')
            prepended_texts = None
            logger.info("Encoding with tokenizer")
            self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=None)["input_ids"]
            
            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

        if os.path.exists(cached_label_file):# and not args.overwrite_cache:
            logger.info("Loading labels from cached file %s", cached_label_file)
            with open(cached_label_file, "rb") as handle:
                self.labels = pickle.load(handle)
        else:
            logger.info("Creating labels from dataset file at %s", 'temp_data')
            prepended_texts = None

            labels = dataset['label']
            content = []
            for i in labels:
                if(i==0):
                    neg_content = random.sample(negative_con, 10)
                else:
                    neg_content = random.sample(positive_con,10)
                neg_content = ' '.join(neg_content) 
                content.append(neg_content)
            

            logger.info("Encoding with tokenizer")
            self.labels = tokenizer.batch_encode_plus(content, add_special_tokens=True, max_length=10, truncation=True)["input_ids"]

            logger.info("Saving labels into cached file %s", cached_label_file)
            with open(cached_label_file, "wb") as handle:
                pickle.dump(self.labels, handle, protocol=pickle.HIGHEST_PROTOCOL)


          
        pos  =tokenizer.encode('positive')
        neg = tokenizer.encode('negative')
        sent = {0:neg, 1:pos}
        self.targets = [sent[i] for i in dataset['label']]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        example = self.examples[item]
        labels = self.labels[item]
        targets = self.targets[item]
        overflow_len = len(example) - self.block_size
        if overflow_len > 0:
            random_ind = random.randint(0, overflow_len) # random integer between 0 and overflow_len (both inclusive)
        else:
            random_ind = 0
        example_block = example[random_ind:random_ind+self.block_size]
        """

        overflow_len = len(labels) - 10#self.block_size
        if overflow_len > 0:
            random_ind = random.randint(0, overflow_len) # random integer between 0 and overflow_len (both inclusive)
        else:
            random_ind = 0
        content_block = labels[random_ind:random_ind+10]
        """

        return torch.tensor(example_block, dtype=torch.long), torch.tensor(labels, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

In [29]:
test_dataset = IMDBDatasettest(tokenizer, dataset=datasets['test'])

In [30]:
test_batch_size = 1 #memory error for 32
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler,batch_size=test_batch_size, collate_fn=collate)
    

In [64]:
config.vocab_size


In [66]:

tokenizer.special_tokens["<END>"]

In [31]:
def beam_decoder(test_dataloader = test_dataloader, beam_width=3, gen_length=20):
    epoch_iterator = tqdm(test_dataloader, desc="Iteration")
    vocab_len = config.vocab_size
    targets=[]
    for step, batch in enumerate(epoch_iterator):
        if(step==5):
            break
        inputs, content, target = batch
    #lm_labels = inputs
        #inputs = inputs
    #lm_labels = lm_labels.to(device)
        #content = content.to(device)
        #target = target
    
        done = [False for i in range(beam_width)]
        stop_decode = False
        decoded_sentences = []
        targets.append(target[0]*beam_width)
        
        sm = torch.nn.Softmax(dim=-1)
        #replicate input for all beams
        #print(inputs)
        index_tokens = torch.tensor([inputs for i in range(beam_width)])
        content_tokens = torch.tensor([content for i in range(beam_width)])
        #indexed_tokens = torch.tensor([inputs.numpy()])
        #indexd_tokens = torch.tensor(index_tokens).squeeze(1).to(device)
        #content_tokens = torch.tensor(content_tokens).squeeze(1).to(device)
        #print(indexd_tokens.shape)
        beam_indexes = [[] for i in range(beam_width)]
        print(beam_indexes)
        best_scores = [0 for i in range(beam_width)]
        count=0
    
        #for i in range(gen_len):
        while(count<gen_length and not stop_decode):
            with torch.no_grad():
            #L_alpha
                hidden_inp = model(indexd_tokens,path='half1')
                hidden_content = model(content_tokens, path='half1')
                #Cocon             other_context_cocon_hidden_states = cocon_block(cocon_th_gen_output, context_seq=original_context_seq_hidden_states, history_seq=other_sample_history_seq_hidden_states, include_sos_output=True,cs_self_attn_mask_prob=1)
                #print(hidden_inp.shape, hidden_content.shape)
                cout = cocon_block(hidden_inp, context_seq=hidden_content)
                output = model(cout, path='half2')
                pred_token_logits = output[1][:,-1:]
                #softmax
                pred_token_prob = torch.nn.functional.softmax(pred_token_logits, dim=-1)
                #sample
                #print(pred_token_prob.shape)
                #pred_token = torch.multinomial(pred_token_prob[0], num_samples=1) #repeat for every elem in batch
            if(count==0):
                top_v, top_i = pred_token_prob[:,-1,:].topk(beam_width)
                [beam_indexes[i].append(top_i[0][i].tolist()) for i in range(beam_width)]
                for i in range(beam_width):
                    best_scores[i] = top_v[0][i].item()
                count += 1
            else:
                flatten_score = (pred_token_prob[:,-1,:]*torch.tensor(best_scores).to(device).unsqueeze(1)).view(-1) #beam width*vocab_size
                vals, inx = flatten_score.topk(beam_width)
                best_scores_inx = (inx/vocab_len).tolist()
                best_scores = vals.tolist()
                correct_inx = (inx%vocab_len).tolist()
                #update
                temp_lt = [0 for i in range(beam_width)]
                for i,x in enumerate(best_scores_inx):
                    temp_lt[i] = beam_indexes[i] + [correct_inx[i]]
                beam_indexes = temp_lt
                del temp_lt
                count += 1
            #for i in range(beam_width):
            #    if correct_inx[i] == tokenizer.special_tokens['<END>']:
            #        done[i] = True
                for i in range(beam_width):
                    if not done[i]:
                        best_scores[i] = vals.tolist()[i]
                        
            #if(sum(done)==beam_width):
            #    stop_decode=True
                        
            indexd_tokens = torch.cat((indexd_tokens,torch.tensor(beam_indexes).to(device)),1)
                    
        for i in range(beam_width):
            #try:
            #    end_index = beam_index[i].index(tokenizer.special_tokens["<END>"])
            #except ValueError:
            end_index = len(beam_indexes[i])
            print(tokenizer.decode(beam_indexes[i][:end_index]))
            decoded_sentences.append(tokenizer.decode(beam_indexes[i][:end_index]))
        #append
        
        #decode
    #pred_text = tokenizer.decode(input_token)
    return decoded_sentences,targets

In [32]:
d = beam_decoder(test_dataloader, beam_width=3, gen_length=20)




[[], [], []]


Iteration:   0%|                                                                 | 1/25000 [00:57<401:38:29, 57.84s/it]

 I. a a I. I I I I I.. a I. a a a a
 the a the. I., you you you, the I of the a. the with.
 of the a film film a the I I I you film, I with with I I a the
[[], [], []]


Iteration:   0%|                                                                | 1/25000 [01:58<823:25:21, 118.58s/it]


KeyboardInterrupt: 

In [130]:
len(d[0])

In [None]:
import pandas as pd
df_cocon_test_senti = pd.DataFrame()
df_cocon_test_senti['decoded']

In [51]:
tokenizer.decode(inputs[0])

"Wow! This movie is almost too bad for words. Obviously the writers wanted to somehow link this to the Ghoulies franchise, so they got Pete Liapis from the first one to reprise his role as Jonathan...only now, he's a cop and has no similar character traits as he did in the first one. The ghoulies in this one aren't the ghoulies from the last ones. The cheap looking puppets have been replaced with even cheaper looking costumed little people. Instead of being any main antagonist or being evil, they are more like the comic relief characters that appeared out of nowhere for no reason.<br /><br />When watching this film for the first time, it felt like I'd seen it before. Why was this? Because everything in this was stolen from another movie. All the cheesy cop lines and action scenes were from Lethal Weapon. The ghoulies were pretty much like Bugs Bunny and Daffy Duck, except they weren't amusing at all. Even scenes from the original Ghoulies film were sprinkled throughout this flick.<br /

In [68]:
import pandas as pd

In [133]:
for step, batch in enumerate(epoch_iterator):
    inputs, content, target = batch
    print(tokenizer.decode(inputs[0,:20]))
    break

I thought that this movie was going to be totally lame based on the advertisements that I saw in theaters


In [138]:
#COCON naive sentiment
epoch_iterator = tqdm(test_dataloader, desc="Evaluating")
test_inputs = []
test_outputs = []
test_targets = []
test_preds = []
test_content = []

for step, batch in enumerate(epoch_iterator):
    if(step==100):
        break
    inputs, content, target = batch
    #print(inputs.shape, content.shape, target.shape)
    test_inputs.append(tokenizer.decode(inputs[0,:30]))#.numpy())
    test_outputs.append(tokenizer.decode(inputs[0,30:60]))
    test_content.append(tokenizer.decode(content[0]))#.numpy())

    test_targets.append(tokenizer.decode(target[0]))
    tokens, decoded = generate2(inputs[:,:30], content)
    #print(tokenizer.decode(inputs[0][:20]))
    #print('---------------------------------')
    #print(decoded[0])
    test_preds.append(decoded[0])
    #pass tokens to sentiment classifier and calculate accuracy by comparing to targets
    
df = pd.DataFrame()
df['input_text'] = test_inputs
df['target_labels'] = test_targets
df['cocon_plain'] = test_preds
df['content'] = test_content
df['real_output'] = test_outputs


Evaluating:   0%|▎                                                              | 100/25000 [08:03<33:24:32,  4.83s/it]


In [139]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_plain,content,real_output
0,Forget all those sappy romantic movies involvi...,positive,happen not when is many but Juliusts extremel...,flatteringly intelligible peerless indebted la...,over-simplified unrealistic romance. Forget a...
1,I watched the movie about 13 yrs ago while liv...,positive,"a. "" When wasS I this Film w very's of, encou...",multi-purpose examplar cleaner unassailable,in the back that most don't bother to browse....
2,"This movie is on the level with ""Welcome Home ...",negative,one see only crown j less v rumors Civil the ...,gaff destains defamation detested forbidden gr...,"guys weren't Adam Sandler's gay friends, this..."
3,If there were two parts that the physically to...,positive,my. pretty head of from are the was killings ...,pleasurably entrust liked deft improvements entr,"play, it must surely have been Cyrano de Berg..."
4,"Like many a child born in the 1980's, I grew u...",positive,buff very. in Iattery lovevisible system loop...,conciliate pleasurably intelligent astonishing...,Saddles and History of the World part 1 (I sa...


In [140]:
#cocon mod
cocon_block = CoconBlock(config.n_ctx, config, scale=True)
cocon_block.load_state_dict(torch.load('imdb_cocon/modified_4.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [97]:
df['input_text'][0]

'Oh, man, they sure knew how to make them back then. Hollywood has forgotten the basic ingredients'

In [141]:
#df = df.reset_index()  # make sure indexes pair with number of rows
test_plain_outputs = []
for index, row in df.iterrows():
    inp = df['input_text'][index]
    content = df['content'][index]
    it, decoded = generate(inp,content=content)
    test_plain_outputs.append(decoded[0])
    #print(test_plain_outputs)
    
    
df['cocon_senti_naive'] = test_plain_outputs

In [142]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_plain,content,real_output,cocon_senti_naive
0,Forget all those sappy romantic movies involvi...,positive,happen not when is many but Juliusts extremel...,flatteringly intelligible peerless indebted la...,over-simplified unrealistic romance. Forget a...,you made 2009ames a you. host wouldone slight...
1,I watched the movie about 13 yrs ago while liv...,positive,"a. "" When wasS I this Film w very's of, encou...",multi-purpose examplar cleaner unassailable,in the back that most don't bother to browse....,reference aiced sod placesor placezit he conc...
2,"This movie is on the level with ""Welcome Home ...",negative,one see only crown j less v rumors Civil the ...,gaff destains defamation detested forbidden gr...,"guys weren't Adam Sandler's gay friends, this...",it like bears down. This thoughtii you seems ...
3,If there were two parts that the physically to...,positive,my. pretty head of from are the was killings ...,pleasurably entrust liked deft improvements entr,"play, it must surely have been Cyrano de Berg...",9 iswas withR flashed me point that worstS end...
4,"Like many a child born in the 1980's, I grew u...",positive,buff very. in Iattery lovevisible system loop...,conciliate pleasurably intelligent astonishing...,Saddles and History of the World part 1 (I sa...,",ab, originally I unknown.inr first we fascina..."


In [143]:
test_gpt_outputs = []
for index, row in df.iterrows():
    inp = df['input_text'][index]
    #content = df['content'][index]
    it, decoded = generate_gpt(inp)
    test_gpt_outputs.append(decoded[0])
    #print(test_gpt_outputs)
    
    
    
df['gpt'] = test_gpt_outputs

In [144]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_plain,content,real_output,cocon_senti_naive,gpt
0,Forget all those sappy romantic movies involvi...,positive,happen not when is many but Juliusts extremel...,flatteringly intelligible peerless indebted la...,over-simplified unrealistic romance. Forget a...,you made 2009ames a you. host wouldone slight...,how low low of me. bodyguard whatever project...
1,I watched the movie about 13 yrs ago while liv...,positive,"a. "" When wasS I this Film w very's of, encou...",multi-purpose examplar cleaner unassailable,in the back that most don't bother to browse....,reference aiced sod placesor placezit he conc...,shelves. unbelievable? Survival vS forgot.Hid...
2,"This movie is on the level with ""Welcome Home ...",negative,one see only crown j less v rumors Civil the ...,gaff destains defamation detested forbidden gr...,"guys weren't Adam Sandler's gay friends, this...",it like bears down. This thoughtii you seems ...,"members of Andy."" Picture that chrome Her一 fo..."
3,If there were two parts that the physically to...,positive,my. pretty head of from are the was killings ...,pleasurably entrust liked deft improvements entr,"play, it must surely have been Cyrano de Berg...",9 iswas withR flashed me point that worstS end...,Usire a celebrated that in less These everybo...
4,"Like many a child born in the 1980's, I grew u...",positive,buff very. in Iattery lovevisible system loop...,conciliate pleasurably intelligent astonishing...,Saddles and History of the World part 1 (I sa...,",ab, originally I unknown.inr first we fascina...",cADLY on a comingcitizens a the L's on you are...


In [111]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
data = ["I love you", "I hate you"]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [113]:
s = sentiment_pipeline(data)

In [115]:
[i['label'].lower() for i in s]

['positive', 'negative']

In [119]:
[round(i['score'],3) for i in s]

[1.0, 0.999]

In [112]:
list(df['input_text'])[:3]

['Oh, man, they sure knew how to make them back then. Hollywood has forgotten the basic ingredients',
 'I got stuck in traffic (I live in Sicily) on the way to the theater (at a',
 'I enjoyed this film very much. I found it to be very entertaining for me in that I feel']

In [120]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_senti_naive,content,cocon_plain,gpt
0,"Oh, man, they sure knew how to make them back ...",negative,",was I fromIi Dempl wayino is posedorious SA a...",condescend bewitch hiliarious absence culp,in very acquaintancen because to scene saidPi...,? The here he knows me without B- here. I loo...
1,I got stuck in traffic (I live in Sicily) on t...,negative,"Read for outfit,Charlie no Has complete totall...",addicts stark problems awkwardness dragging di...,ision most from one shot graduating Unicode so...,) near Mittlebusters is theory crafted III. (...
2,I enjoyed this film very much. I found it to b...,positive,H good atold before theis It begins in enjoy ...,resilient congratulatory pamper swift magnificent,she set pervasive applause http experience.30...,"aged AWE.com Click enjoyed much me, & It. She..."
3,I was in this movie as an extra in the Dallas ...,negative,would H we I possible B taken of In announced...,pitifully devilment discouraging inaccuracy ch...,year thats of life probablyyg retro unnamed J...,object.ies in: HOU my the as 2014 Dec it. Dec...
4,I've been a devoted IMDB visitor for a few yea...,positive,"around,Fable you can I nothing whyAgain Poste...",plush excel smile top-notch aspire state,"seemed so really W I scared by, a Mumbai The ...","creator anyway a government"" "" db 2 grim digi..."


In [145]:
sentiment_results = sentiment_pipeline(list(df['input_text']))
df['pred_input'] = [i['label'].lower() for i in sentiment_results]
sentiment_results = sentiment_pipeline(list(df['real_output']))
df['pred_output'] = [i['label'].lower() for i in sentiment_results]

In [152]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_plain,content,real_output,cocon_senti_naive,gpt,pred_input,pred_output,pred_cocon_senti_naive,pred_cocon_plain,pred_gpt
0,Forget all those sappy romantic movies involvi...,positive,happen not when is many but Juliusts extremel...,flatteringly intelligible peerless indebted la...,over-simplified unrealistic romance. Forget a...,you made 2009ames a you. host wouldone slight...,how low low of me. bodyguard whatever project...,negative,negative,positive,negative,negative
1,I watched the movie about 13 yrs ago while liv...,positive,"a. "" When wasS I this Film w very's of, encou...",multi-purpose examplar cleaner unassailable,in the back that most don't bother to browse....,reference aiced sod placesor placezit he conc...,shelves. unbelievable? Survival vS forgot.Hid...,negative,positive,negative,positive,negative
2,"This movie is on the level with ""Welcome Home ...",negative,one see only crown j less v rumors Civil the ...,gaff destains defamation detested forbidden gr...,"guys weren't Adam Sandler's gay friends, this...",it like bears down. This thoughtii you seems ...,"members of Andy."" Picture that chrome Her一 fo...",negative,negative,negative,negative,negative
3,If there were two parts that the physically to...,positive,my. pretty head of from are the was killings ...,pleasurably entrust liked deft improvements entr,"play, it must surely have been Cyrano de Berg...",9 iswas withR flashed me point that worstS end...,Usire a celebrated that in less These everybo...,negative,positive,negative,positive,negative
4,"Like many a child born in the 1980's, I grew u...",positive,buff very. in Iattery lovevisible system loop...,conciliate pleasurably intelligent astonishing...,Saddles and History of the World part 1 (I sa...,",ab, originally I unknown.inr first we fascina...",cADLY on a comingcitizens a the L's on you are...,negative,negative,negative,negative,negative


In [146]:
(df['target_labels'] == df['pred_input']).sum() #out of 100

70

In [147]:
(df['target_labels'] == df['pred_output']).sum() #out of 100

77

In [148]:
sentiment_results = sentiment_pipeline(list(df['cocon_senti_naive']))
df['pred_cocon_senti_naive'] = [i['label'].lower() for i in sentiment_results]

(df['target_labels'] == df['pred_cocon_senti_naive']).sum() 

50

In [149]:
sentiment_results = sentiment_pipeline(list(df['cocon_plain']))
df['pred_cocon_plain'] = [i['label'].lower() for i in sentiment_results]

(df['target_labels'] == df['pred_cocon_plain']).sum() 

47

In [150]:
sentiment_results = sentiment_pipeline(list(df['gpt']))
df['pred_gpt'] = [i['label'].lower() for i in sentiment_results]

(df['target_labels'] == df['pred_gpt']).sum() 

40

In [151]:
df.to_csv('model_generations2.csv', index=False)

In [158]:
df.columns

Index(['input_text', 'target_labels', 'cocon_plain', 'content', 'real_output',
       'cocon_senti_naive', 'gpt', 'pred_input', 'pred_output',
       'pred_cocon_senti_naive', 'pred_cocon_plain', 'pred_gpt'],
      dtype='object')

In [229]:
min(len(cocon_plain_pred),30)

30

In [207]:
from datasets import load_metric
metric = load_metric("bleu")

In [235]:
for index, row in df.iterrows():
    print(len(tokenizer.encode(df['gold'][index])))

KeyError: 'gold'

In [239]:
glue_gpt = []
glue_plain = []
glue_senti = []
for index, row in df.iterrows():   
    cocon_plain_pred = tokenizer.encode(df['cocon_plain'][index])
    cocon_senti_pred = tokenizer.encode(df['cocon_senti_naive'][index])
    gold = tokenizer.encode(df['real_output'][index])
    gpt_pred = tokenizer.encode(df['gpt'][index])
    gi = min(len(cocon_plain_pred),len(cocon_senti_pred),30,len(gold), len(gpt_pred))
    cocon_plain_pred = tokenizer.encode(df['cocon_plain'][index])[:gi]
    gold = tokenizer.encode(df['real_output'][index])[:gi]
    cocon_senti_pred = tokenizer.encode(df['cocon_senti_naive'][index])[:gi]
    gpt_pred = tokenizer.encode(df['gpt'][index])[:gi]
    
        
    plain_a = metric.compute(predictions=cocon_plain_pred, references=gold)
    senti_a = metric.compute(predictions=cocon_senti_pred, references=gold)
    gpt_a = metric.compute(predictions=gpt_pred, references=gold)
    glue_gpt.append(gpt_a['accuracy'])
    glue_plain.append(plain_a['accuracy'])
    glue_senti.append(senti_a['accuracy'])

df['glue_gpt'] = glue_gpt
df['glue_senti'] = glue_senti
df['glue_plain'] = glue_plain
    
    

In [240]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_plain,content,real_output,cocon_senti_naive,gpt,pred_input,pred_output,pred_cocon_senti_naive,pred_cocon_plain,pred_gpt,glue_gpt,glue_senti,glue_plain
0,Forget all those sappy romantic movies involvi...,positive,happen not when is many but Juliusts extremel...,flatteringly intelligible peerless indebted la...,over-simplified unrealistic romance. Forget a...,you made 2009ames a you. host wouldone slight...,how low low of me. bodyguard whatever project...,negative,negative,positive,negative,negative,0.0,0.0,0.0
1,I watched the movie about 13 yrs ago while liv...,positive,"a. "" When wasS I this Film w very's of, encou...",multi-purpose examplar cleaner unassailable,in the back that most don't bother to browse....,reference aiced sod placesor placezit he conc...,shelves. unbelievable? Survival vS forgot.Hid...,negative,positive,negative,positive,negative,0.0,0.0,0.0
2,"This movie is on the level with ""Welcome Home ...",negative,one see only crown j less v rumors Civil the ...,gaff destains defamation detested forbidden gr...,"guys weren't Adam Sandler's gay friends, this...",it like bears down. This thoughtii you seems ...,"members of Andy."" Picture that chrome Her一 fo...",negative,negative,negative,negative,negative,0.0,0.0,0.0
3,If there were two parts that the physically to...,positive,my. pretty head of from are the was killings ...,pleasurably entrust liked deft improvements entr,"play, it must surely have been Cyrano de Berg...",9 iswas withR flashed me point that worstS end...,Usire a celebrated that in less These everybo...,negative,positive,negative,positive,negative,0.0,0.0,0.0
4,"Like many a child born in the 1980's, I grew u...",positive,buff very. in Iattery lovevisible system loop...,conciliate pleasurably intelligent astonishing...,Saddles and History of the World part 1 (I sa...,",ab, originally I unknown.inr first we fascina...",cADLY on a comingcitizens a the L's on you are...,negative,negative,negative,negative,negative,0.0,0.0,0.034483


In [244]:
sum(df['glue_plain']>0), sum(df['glue_senti']>0), sum(df['glue_gpt']>0)

(17, 7, 12)

In [175]:
[df['real_output'][0]]

[' over-simplified unrealistic romance. Forget all those shameless "dog gives its life to save its family" flicks (although I have to admit']

In [245]:
from datasets import load_metric
metric = load_metric("code_eval")
print(metric.inputs_description)

Downloading builder script:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/2.25k [00:00<?, ?B/s]


Calculates how good are predictions given some references, using certain scores
Args:
    predictions: list of candidates to evaluate. Each candidates should be a list
        of strings with several code candidates to solve the problem.
    references: a list with a test for each prediction. Each test should evaluate the
        correctness of a code candidate.
    k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
    num_workers: number of workers used to evaluate the canidate programs (Default: 4).
    timeout:
Returns:
    pass_at_k: dict with pass rates for each k
    results: dict with granular results of each unittest
Examples:
    >>> code_eval = datasets.load_metric("code_eval")
    >>> test_cases = ["assert add(2,3)==5"]
    >>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]
    >>> pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2])
    >>> print(pass_at_k)
    {'pass@1': 

In [185]:
from datasets import load_metric
metric = load_metric("bleurt")

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

INFO:tensorflow:Reading checkpoint C:\Users\aishu\.cache\huggingface\metrics\bleurt\default\downloads\extracted\4f6af8a094ab2435d43ed5ff8d3b150764d255482ae3e5c25ab09aaba351d0cd\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


In [211]:
print(metric.inputs_description)


Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'accuracy': 1.0}

    >>> glue_metric = datasets.load_metric('glue', 'mrpc')  # 'mrpc' or 'qqp'
    >>> references = [0, 1]

In [210]:
from datasets import load_metric
metric = load_metric("glue",'sst2')


In [217]:
df.columns

Index(['input_text', 'target_labels', 'cocon_plain', 'content', 'real_output',
       'cocon_senti_naive', 'gpt', 'pred_input', 'pred_output',
       'pred_cocon_senti_naive', 'pred_cocon_plain', 'pred_gpt'],
      dtype='object')

In [253]:
[df['cocon_plain'][0][:30]], [df['real_output'][0][:30]]

([' happen not when is many but J'], [' over-simplified unrealistic r'])

In [254]:
from datasets import load_metric
metric = load_metric("perplexity")
print(metric.inputs_description)

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]


Args:
    model_id (str): model used for calculating Perplexity
            NOTE: Perplexity can only be calculated for causal language models.
                    This includes models such as gpt2, causal variations of bert,
                    causal versions of t5, and more (the full list can be found
                    in the AutoModelForCausalLM documentation here:
                    https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )

    input_texts (list of str): input text, each separate text snippet
        is one list entry. Perplexity returned will be an average of
        the perplexity for each list entry.
    stride (int): stride size, defaults to 512
    device (str): device to run on, defaults to 'cuda' when available
Returns:
    perplexity: dictionary containing the average perplexity score for the text
        in the input list.
Examples:
    Example 1:
        >>> perplexity = datasets.load_metric("perplexity")
 

In [256]:
results = metric.compute(model_id='gpt2',input_texts=df['cocon_plain'].tolist(),stride=1)
results

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/100 [00:00<?, ?it/s]

{'perplexity': 676.169189453125}

In [257]:
results_gpt = metric.compute(model_id='gpt2',input_texts=df['gpt'].tolist(),stride=1)
results_gpt

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/100 [00:00<?, ?it/s]

{'perplexity': 366.1953430175781}

In [258]:
results_senti = metric.compute(model_id='gpt2',input_texts=df['cocon_senti_naive'].tolist(),stride=1)
results_senti

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/100 [00:00<?, ?it/s]

{'perplexity': 846.583740234375}

In [259]:
results_real = metric.compute(model_id='gpt2',input_texts=df['real_output'].tolist(),stride=1)
results_real

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/100 [00:00<?, ?it/s]

{'perplexity': 20.682506561279297}

In [262]:
from datasets import load_metric
metric = load_metric("rouge")
print(metric.inputs_description)


Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Returns:
    rouge1: rouge_1 (precision, recall, f1),
    rouge2: rouge_2 (precision, recall, f1),
    rougeL: rouge_l (precision, recall, f1),
    rouge

In [261]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


You should consider upgrading via the 'c:\users\aishu\miniconda\python.exe -m pip install --upgrade pip' command.


In [252]:
metric.compute(predictions=[df['cocon_plain'][0][:30]], references=[df['real_output'][0][:30]], k=[1])


ValueError: Got a string but expected a list instead: ' happen not when is many but J'

In [214]:
tokenizer.encode(df['cocon_plain'][0])

[1645,
 407,
 618,
 318,
 867,
 475,
 5979,
 72,
 436,
 82,
 4457,
 1662,
 356,
 71,
 22032,
 45129,
 617,
 6293,
 11,
 618,
 2646,
 3807,
 314,
 475,
 11,
 3807,
 373,
 11,
 286,
 14570,
 547,
 0]

In [212]:
y = torch.tensor(tokenizer.encode(df['cocon_plain'][0]))
#y = y.type(torch.int64)
y

tensor([ 1645,   407,   618,   318,   867,   475,  5979,    72,   436,    82,
         4457,  1662,   356,    71, 22032, 45129,   617,  6293,    11,   618,
         2646,  3807,   314,   475,    11,  3807,   373,    11,   286, 14570,
          547,     0])

In [206]:
final_score = metric.compute()
final_score

InvalidArgumentError: cannot compute __inference_pruned_6256 as input #0(zero-based) was expected to be a int64 tensor but is a int32 tensor [Op:__inference_pruned_6256]

In [None]:
for model_input, gold_references in evaluation_dataset:
    model_predictions = model(model_inputs)
    metric.add_batch(predictions=model_predictions, references=gold_references)
final_score = metric.compute()

In [129]:
df.head()

Unnamed: 0,input_text,target_labels,cocon_senti_naive,content,cocon_plain,gpt,pred_input,pred_cocon_senti_naive,pred_cocon_plain,pred_gpt
0,"Oh, man, they sure knew how to make them back ...",negative,",was I fromIi Dempl wayino is posedorious SA a...",condescend bewitch hiliarious absence culp,in very acquaintancen because to scene saidPi...,? The here he knows me without B- here. I loo...,negative,negative,negative,negative
1,I got stuck in traffic (I live in Sicily) on t...,negative,"Read for outfit,Charlie no Has complete totall...",addicts stark problems awkwardness dragging di...,ision most from one shot graduating Unicode so...,) near Mittlebusters is theory crafted III. (...,negative,positive,negative,negative
2,I enjoyed this film very much. I found it to b...,positive,H good atold before theis It begins in enjoy ...,resilient congratulatory pamper swift magnificent,she set pervasive applause http experience.30...,"aged AWE.com Click enjoyed much me, & It. She...",positive,positive,positive,positive
3,I was in this movie as an extra in the Dallas ...,negative,would H we I possible B taken of In announced...,pitifully devilment discouraging inaccuracy ch...,year thats of life probablyyg retro unnamed J...,object.ies in: HOU my the as 2014 Dec it. Dec...,positive,positive,positive,negative
4,I've been a devoted IMDB visitor for a few yea...,positive,"around,Fable you can I nothing whyAgain Poste...",plush excel smile top-notch aspire state,"seemed so really W I scared by, a Mumbai The ...","creator anyway a government"" "" db 2 grim digi...",positive,negative,positive,negative


In [104]:
def generate_gpt(inp,history=None, gen_len=30):
    input_token = torch.tensor(tokenizer.encode(inp))
    if(len(input_token.shape)<3):
        input_token = input_token.unsqueeze(0) #batch dim

    #Repeat for history TO DO
    #implement auto regression TODO
    input_token = input_token.to(device)
    l = len(input_token[0])
    for i in range(gen_len):
        #L_alpha
        hidden_inp = model(input_token,path='half1')
        #hidden_content = model(content_token, path='half1')
        #Cocon             other_context_cocon_hidden_states = cocon_block(cocon_th_gen_output, context_seq=original_context_seq_hidden_states, history_seq=other_sample_history_seq_hidden_states, include_sos_output=True,cs_self_attn_mask_prob=1)
        #cout = cocon_block(hidden_inp, context_seq=hidden_content)
        output = model(hidden_inp, path='half2')
        pred_token_logits = output[1][:,-1:]
        #softmax
        pred_token_prob = torch.nn.functional.softmax(pred_token_logits, dim=-1)
        #sample
        pred_token = torch.multinomial(pred_token_prob[0], num_samples=1) #repeat for every elem in batch
        #append
        input_token = torch.cat((input_token,pred_token),1)
        #decode
    #pred_text = tokenizer.decode(input_token)
    return input_token, [tokenizer.decode(i) for i in input_token[:,l:]]

In [106]:
it, decoded = generate_gpt('The sun shines in the')
decoded

[' sky on Malfoy hood of the Cloud } might be 0: represents sq one q: stops The restructuring if few be several. Curtores 0 vanish in']

In [78]:
def generate2(input_token,content_token=None,history=None, gen_len=30):
    #input_token = torch.tensor(tokenizer.encode(inp))
    #if(len(input_token.shape)<3):
    #    input_token = input_token.unsqueeze(0) #batch dim
    #if(content_token != None):
        #content_token = torch.tensor(tokenizer.encode(content))
        #if(len(content_token.shape)<3):
         #   content_token = content_token.unsqueeze(0)
        #content_token = content_token.unsqueeze(0)
    #Repeat for history TO DO
    #implement auto regression TODO
    input_token = input_token.to(device)
    l = len(input_token[0])
    #print('l ',l)
    content_token = content_token.to(device)
    #print(input_token.shape, content_token.shape)
    for i in range(gen_len):
        #L_alpha
        hidden_inp = model(input_token,path='half1')
        hidden_content = model(content_token, path='half1')
        #Cocon             other_context_cocon_hidden_states = cocon_block(cocon_th_gen_output, context_seq=original_context_seq_hidden_states, history_seq=other_sample_history_seq_hidden_states, include_sos_output=True,cs_self_attn_mask_prob=1)
        cout = cocon_block(hidden_inp, context_seq=hidden_content)
        output = model(cout, path='half2')
        pred_token_logits = output[1][:,-1:]
        #softmax
        pred_token_prob = torch.nn.functional.softmax(pred_token_logits, dim=-1)
        #sample
        pred_token = torch.multinomial(pred_token_prob[0], num_samples=1) #repeat for every elem in batch
        #append
        input_token = torch.cat((input_token,pred_token),1)
        #decode
    #pred_text = tokenizer.decode(input_token)
    return input_token, [tokenizer.decode(i) for i in input_token[:,l:]]

In [None]:
example = datasets['test'][0:2]
example

In [None]:
len(example)

In [None]:
sent[example['label']]

In [None]:
out, decoded = generate('I love sci-fi and am willing to put up',content=content, gen_len=20)
decoded

In [None]:
out, decoded = generate('I love sci-fi and am willing to put up',content='is perfect', gen_len=20)
decoded