In [1]:
import os
try:
  !pip install transformers
  if not os.path.exists('/content/tweet-sentiment-extraction.zip'):
    ! mkdir ~/.kaggle
    ! mv kaggle.json ~/.kaggle/
    ! chmod 600 ~/.kaggle/kaggle.json
    ! kaggle competitions download -c tweet-sentiment-extraction
except:
  pass

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 2.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 15.2MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 22.7MB/s 
[?25hCollecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9

In [2]:
def data_directory():
  try:
    if not os.path.exists('/content/twet-sentiment-extraction'):
      !unzip tweet-sentiment-extraction.zip
      ! mkdir twet-sentiment-extraction
      ! mkdir /content/twet-sentiment-extraction/train_data
      ! mkdir /content/twet-sentiment-extraction/test_data
      ! mkdir /content/twet-sentiment-extraction/sample_data
      ! unzip train.csv.zip -d /content/twet-sentiment-extraction/train_data
      !mv test.csv /content/twet-sentiment-extraction/test_data
      !mv sample_submission.csv /content/twet-sentiment-extraction/sample_data
  except:
    pass

  return {
      'train_data_path':'/content/twet-sentiment-extraction/train_data/train.csv',
      'test_data_path':'/content/twet-sentiment-extraction/test_data/test.csv',
      'sample_data_path':'/content/twet-sentiment-extraction/sample_data/sample_submission.csv'
    }

In [3]:
import transformers
import config
import torch
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from sklearn import model_selection
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup 

In [4]:
import numpy as np
class SentimentDataset:
  def __init__(self,text,selected_text,sen_len):
    self.text=text
    self.selected_text=selected_text
    self.max_len=config.MAX_LEN
    self.tokenizer=config.TOKENIZER
    self.sen_len=sen_len

  def __len__(self):
    return len(self.text)

  def __getitem__(self,item):
    tweet=self.text[item]
    selected_text=self.text[item]
    sen_len=self.sen_len[item]
    
    strt_index=-1
    end_index=-1
    char_targets=[0] * len(tweet)

    for index in (i for i,k in enumerate(tweet) if k in selected_text[0]): 
      if tweet[index:index+len(selected_text)]==selected_text:
        strt_index=index
        end_index=index+len(selected_text)-1
        break
        
    if strt_index!=1 and end_index!=1:
      for j in range(strt_index,end_index+1):
        char_targets[j]=1
    
    tok_tweet=self.tokenizer.encode(tweet)
    tweet_token=tok_tweet.tokens
    tweet_ids=tok_tweet.ids
    tweet_offsets=tok_tweet.offsets[1:-1]



    targets=[0]* (len(tweet_token)-2)

    for start,(off1,off2) in enumerate(tweet_offsets):
      if sum(char_targets[off1:off2])>0:
        targets[start]==1
    targets=[0]+targets+[0]

    tar_start=[0] * len(targets)
    tar_end=[0] * len(targets)

    non_zero=np.nonzero(targets)[0]

    if len(non_zero)>0:
      tar_start[non_zero[0]]=1
      tar_end[non_zero[-1]]=1

    mask=[1] * len(tweet_ids)
    token_type_ids=[0] * len(tweet_ids)

    padding_len=sen_len-len(tweet_ids)

    ids = tweet_ids+[0]*padding_len
    mask = mask+[0]*padding_len
    token_type_ids = token_type_ids+[0]*padding_len
    targets = targets+[0]*padding_len
    tar_start = tar_start+[0]*padding_len
    tar_end = tar_end +[0]*padding_len

    return {
        'ids':torch.tensor(ids,dtype=torch.long),
        'mask':torch.tensor(mask,dtype=torch.long),
        'token_type_ids':torch.tensor(token_type_ids,dtype=torch.long),
        'targets':torch.tensor(targets,dtype=torch.long),
        'tar_start':torch.tensor(tar_start,dtype=torch.float),
        'tar_end':torch.tensor(tar_end,dtype=torch.float),
        'tweet_token':' '.join(tweet_token),
        'selected_orig_tweet':selected_text
    }

In [5]:
#model evaluation
class SnetimentModel(nn.Module):
  def __init__(self):
    super(SnetimentModel,self).__init__()

    self.bert=transformers.BertModel.from_pretrained(config.BASE_MODEL_PATH,return_dict=True)
    self.bert_drop=nn.Dropout(0.3)
    self.out=nn.Linear(768,2)
    

  def forward(self,ids,mask,token_type_ids):
    output=self.bert(ids,attention_mask=mask,token_type_ids=token_type_ids)
    seq_output=self.bert_drop(output.last_hidden_state)
    logits=self.out(seq_output)
    
    start_logits,end_logits=logits.split(1,dim=-1)

    start_logits=start_logits.squeeze(-1)
    end_logits=end_logits.squeeze(-1)

    return start_logits,end_logits

In [6]:
#Engine part

class AverageMeter:
  def __init__(self):
    self.reset()

  def reset(self):
    self.val = 0
    self.avg = 0
    self.sum = 0
    self.count = 0

  def update(self,val,n=1):
    self.val = val
    self.sum += val * n
    self.count += n
    self.avg = self.sum / self.count

def loss_fn(o1,o2,t1,t2):
  l1=nn.BCEWithLogitsLoss()(o1,t1)
  l2=nn.BCEWithLogitsLoss()(o2,t2)
  return l1+l2

def train_fn(data_loader,model,optimizer,device,scheduler):
  model.train() 
  losses=AverageMeter()
  tqdm_train=tqdm(data_loader,total=len(data_loader))
  for data in tqdm_train:
    ids=data['ids'].to(device,dtype=torch.long)
    token_type_ids=data['token_type_ids'].to(device,dtype=torch.long)
    mask=data['mask'].to(device,dtype=torch.long)
    target_start=data['tar_start'].to(device,dtype=torch.float)
    target_end=data['tar_end'].to(device,dtype=torch.float)

    optimizer.zero_grad()

    output1,output2=model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids )
    
    loss=loss_fn(output1,output2,target_start,target_end)
    
    loss.backward()
    optimizer.step()
    scheduler.step()
    losses.update(loss.item(),ids.size(0))
    tqdm_train.set_postfix(loss=losses.avg)

def eval_fn(data_loader,model,device):
  losses_val=AverageMeter()
  model.eval()
  fin_output_start=[]
  fin_output_end=[]
  token=[]
  selected_token=[]
  tqdm_val=tqdm(data_loader,total=len(data_loader))
  for data in tqdm_val:
    ids=data['ids'].to(device,dtype=torch.long)
    token_type_ids=data['token_type_ids'].to(device,dtype=torch.long)
    mask=data['mask'].to(device,dtype=torch.long)
    target_start=data['tar_start'].to(device,dtype=torch.float)
    target_end=data['tar_end'].to(device,dtype=torch.float)
    tar_token=data['tweet_token']
    selected_text=data['selected_orig_tweet']

    model.bert_drop=nn.Dropout(0)
    output1,output2=model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids )
    
    loss=loss_fn(output1,output2,target_start,target_end)
    
    losses_val.update(loss.item(),ids.size(0))
    tqdm_val.set_postfix(loss=losses_val.avg)


In [7]:
def smart_batch(dataset,batch_size):
  dataset['text']=dataset.text.apply(lambda x:' '.join(str(x).split()))
  dataset['tokenized_tweet']=dataset.text.apply(lambda x:config.TOKENIZER.encode(x).tokens)
  dataset['sen_len']=dataset.tokenized_tweet.apply(lambda tokenized_tweet:len(tokenized_tweet))

  dataset.sort_values(by='sen_len',ignore_index=True,inplace=True)
  len_batches=np.append(dataset.sen_len[batch_size::batch_size].values,dataset.sen_len.max())
  data_batches=list(range(batch_size,dataset.shape[0],batch_size))+[dataset.shape[0]]
  dataset['batch']=0
  for k,(i,l) in enumerate(zip(data_batches,len_batches)):
    dataset.batch.iloc[batch_size * k:i].replace(0,l,inplace=True)
  return dataset

In [14]:
def get_device():
  if torch.cuda.is_available():
    return torch.device('cuda:0')
  else:
    return torch.device('cpu')

In [15]:

def run( ):

  df=pd.read_csv(data_directory()['train_data_path'],nrows=13740).dropna().reset_index(drop=True)

  df_train,df_valid=model_selection.train_test_split(
      df,
      test_size=0.1,
      random_state=42)

  sort_train_df=smart_batch(df_train,config.TRAIN_BATCH_SIZE)
  sort_valid_df=smart_batch(df_valid,config.TEST_BATCH_SIZE)
  
  Train_dataset=SentimentDataset(
                                  sort_train_df.text,
                                  sort_train_df.selected_text,
                                  sort_train_df.batch 
                                )
  
  Test_dataset=SentimentDataset(
                                  sort_valid_df.text,
                                  sort_valid_df.selected_text,
                                  sort_valid_df.batch 
                                )
  
  train_data_loader=torch.utils.data.DataLoader(
                                                  Train_dataset,
                                                  batch_size=config.TRAIN_BATCH_SIZE
                                               )

  test_data_loader=torch.utils.data.DataLoader(
                                                  Test_dataset,
                                                  batch_size=config.TEST_BATCH_SIZE
                             
                                               )
  device=get_device()

  model=SnetimentModel()
  model.to(device)

  parameters_opt=list(model.named_parameters())
  no_decay=['bias','LayerNorm.bias','LayerNorm.weight']
  optimizer_parametrs=[{
                        'params':[parameters for name,parameters in parameters_opt if not any(nd in name for nd in no_decay)],
                        'weight_decay':0.001},
                       {
                        'params':[parameters for name,parameters in parameters_opt if any(nd in name for nd in no_decay)],
                        'weight_decay':0.0
                       }]
  num_train_steps=int(len(Train_dataset)/config.TRAIN_BATCH_SIZE * config.EPOCHS)
  optimizer=AdamW(optimizer_parametrs,lr=3e-5)
  scheduler=get_linear_schedule_with_warmup(
                                            optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_train_steps
  )

  for epoch in range(config.EPOCHS):
    train_loss=train_fn(train_data_loader,model,optimizer,device,scheduler)
    test_loss=eval_fn(test_data_loader,model,device)
    torch.save(model.state_dict(),config.MODEL_PATH)



In [None]:
if __name__=='__main__':
  run()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




 89%|████████▊ | 343/387 [36:36<07:50, 10.70s/it, loss=0.0269]

In [1]:
def predict():
  sentence='''
    Ritu is going to india
  '''
  sentence=' '.join(sentence.split())
  tokenized_sent=config.TOKENIZER.encode(sentence)
  sen_len=len(tokenized_sent)
  data={
    
    'ids':torch.tensor(tokenized_sent.ids,dtype=torch.long),
    'mask':torch.tensor([1] * sen_len,dtype=torch.long),
    'token_type_ids':torch.tensor([0]* sen_len,dtype=torch.long)
  }
  

  device=get_device()
  model=SnetimentModel()
  model.bert_drop=nn.Dropout(0)
  model.load_state_dict(torch.load(config.MODEL_PATH))
  model.to(device)
  # print("Model's state_dict:")
  # for param_tensor in model.state_dict():
  #     print(param_tensor, "\t", model.state_dict()[param_tensor])
  # print(**data)

  with torch.no_grad():
    for k,v in data.items():
      data[k]=v.to(device).unsqueeze(0)

    
    output1,output2=model(**data)
    strt=output1.argmax(dim=-1).cpu().numpy()
    end=output2.argmax(dim=-1).cpu().numpy()
    print(tokenized_sent[end:strt].replace( ' ##',''))