In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import tokenizers
import tensorflow as tf
import tensorflow.keras.backend as K
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
from transformers import *
from keras.models import Model, load_model
from sklearn.model_selection import StratifiedKFold
from keras.layers import Dense, Flatten, Conv1D, Dropout, Input
from keras.callbacks import ModelCheckpoint, EarlyStopping



In [78]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [79]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [82]:


tokenizer = tokenizers.ByteLevelBPETokenizer(
                vocab_file= 'vocab.json',
                merges_file = 'merges.txt',
                lowercase =True,
                add_prefix_space = True
)

sentiment_id = {'neutral' : tokenizer.encode('neutral').ids[0],
                'negative' : tokenizer.encode('negative').ids[0],
                'positive' : tokenizer.encode('positive').ids[0]}



Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [83]:

temp = train_data.shape[0]
input_ids = np.ones((temp,max_length),dtype='int32')
attention_mask = np.zeros((temp, max_length),dtype='int32')
token_type = np.zeros((temp,max_length),dtype='int32')
start_mask = np.zeros((temp,max_length),dtype='int32')
end_mask = np.zeros((temp,max_length),dtype='int32')

for i in range(temp):
    text1 = " "+" ".join(train_data.loc[i,'text'].split())
    text2 = " ".join(train_data.loc[i, 'selected_text'].split())
    index = text1.find(text2)
    text2_loc = np.zeros((len(text1)))
    text2_loc[index:index+len(text2)]=1
    
    if text1[index-1]==" ": 
        text2_loc[index-1]=1
    encode_text1 = tokenizer.encode(text1)
    
    s_text_token_index = []
    

   
        
    
    for k,(x,y) in enumerate(encode_text1.offsets):
        sum_val = np.sum(text2_loc[x:y])
        if sum_val > 0:
            s_text_token_index.append(k)
        
        
    senti_token = sentiment_id[train_data.loc[i,'sentiment']]
    input_ids[i,:len(encode_text1.ids)+5] = [0] +encode_text1.ids+[2,2]+[senti_token]+[2]
    attention_mask[i,:len(encode_text1.ids)+5]=1
    
    if len(s_text_token_index) > 0:
        start_mask[i,s_text_token_index[0]+1] = 1
        end_mask[i, s_text_token_index[-1]+1] = 1

In [84]:
def loss_function(answer,prediction):
    loss = tf.keras.losses.categorical_crossentropy(answer,prediction, from_logits = False , label_smoothing = 0.2)
    loss = tf.reduce_mean(loss)
    return loss



In [85]:
def build_model():
    ids = tf.keras.layers.Input((max_length,), dtype=tf.int32)
    attention = tf.keras.layers.Input((max_length,), dtype=tf.int32)
    token = tf.keras.layers.Input((max_length,), dtype=tf.int32)
    PATH = '../NLP/'
    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    roberta_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = roberta_model(ids,attention_mask=attention,token_type_ids=token)
    
    x1 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x1 = tf.keras.layers.Conv1D(128,2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(64,2,padding='same')(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x2 = tf.keras.layers.Conv1D(128,2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Conv1D(64,2,padding='same')(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)
    
    
    model = tf.keras.models.Model(inputs=[ids,attention,token], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5)
    model.compile(loss=loss_function,optimizer=optimizer)
    
    return model

In [86]:
temp2 = test_data.shape[0]

input_ids_test = np.ones((temp2,max_length),dtype='int32')
attention_mask_test = np.zeros((temp2, max_length),dtype='int32')
token_type_test = np.zeros((temp2,max_length),dtype='int32')

for i in range(temp2):
    text1 = " "+" ".join(test_data.loc[i,'text'].split())
    encode_text1 = tokenizer.encode(text1)
    
    senti_token = sentiment_id[test_data.loc[i,'sentiment']]
    input_ids_test[i,:len(encode_text1.ids)+5] = [0] + encode_text1.ids + [2,2] + [senti_token] + [2]
    attention_mask_test[i,:len(encode_text1.ids)+5] = 1

    



In [90]:
prediction_start= np.zeros((input_ids_test.shape[0],max_length))
prediction_end= np.zeros((input_ids_test.shape[0],max_length))


all_result = []

for i in range(5):
    K.clear_session()
    model = build_model()
    model.load_weights('v4-roberta-%i.h5'%4)
    prediction = model.predict([input_ids_test,attention_mask_test,token_type_test],verbose = 1)
    prediction_start = prediction_start + prediction[0]/3
    prediction_end = prediction_end + prediction[1]/3






In [91]:
for i in range(input_ids_test.shape[0]):
    x = np.argmax(prediction_start[i,])
    y = np.argmax(prediction_end[i,])
    if x>y:
        update = test_data.loc[i,'text']
    else : 
        text1 = " "+" ".join(test_data.loc[i,'text'].split())
        encode_1 = tokenizer.encode(text1)
        update = tokenizer.decode(encode.ids[x-1:y])
    all_result.append(encode)
test_data['selected_text'] = all_result
test_data.head()
        

Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,"Encoding(num_tokens=16, attributes=[ids, type_..."
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,"Encoding(num_tokens=31, attributes=[ids, type_..."
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"Encoding(num_tokens=20, attributes=[ids, type_..."
3,01082688c6,happy bday!,positive,"Encoding(num_tokens=4, attributes=[ids, type_i..."
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"Encoding(num_tokens=17, attributes=[ids, type_..."


In [None]:
test_data[['textID','selected_text']].to_csv('submission.csv', index=False)
