In [1]:
import pandas as pd, numpy as np
import re, string
import operator
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold, ShuffleSplit
from transformers import *
from featureEng import *
import tokenizers
from keras.utils import to_categorical
import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
# Report the TensorFlow version in the cell output.
print('TF version',tf.__version__)
# Turn on memory growth for every GPU TensorFlow can see.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

TF version 2.2.0


Using TensorFlow backend.


In [2]:
from tensorflow.keras.layers import Input, Dropout, Embedding, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, LeakyReLU, Dense, Flatten, Activation, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |intersection| / |union| of the resulting word sets.  When both
    strings contain no words at all the score is defined as 0.5.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = words1 & words2
    union_size = len(words1) + len(words2) - len(shared)
    return float(len(shared)) / union_size

In [3]:
def data_aug(train, multiple = 1):
    """Create augmented tweets by trimming context around ``selected_text``.

    For each row whose ``text`` contains ``selected_text``, ``multiple``
    new tweets are sampled.  Each one keeps ``selected_text`` intact and
    surrounds it with a random suffix of the words before it and a random
    prefix of the words after it.  The variant with no surrounding words
    (i.e. the bare ``selected_text``) is never produced.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``text``, ``selected_text``,
        ``sentiment`` and ``textID``.
    multiple : int, default 1
        Number of augmented tweets drawn per source row (sampled with
        replacement, so duplicates are possible).

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``selected_text``, ``sentiment``, ``textID``;
        rows with missing values are dropped.

    Notes
    -----
    Sampling uses NumPy's global random state; seed it beforehand for
    reproducible output.  Context is re-joined with single spaces, so
    the original whitespace is not preserved.
    """
    def aug(row):
        tweet = row['text']
        selected_text = row['selected_text']
        idx = tweet.find(selected_text)
        if idx < 0:
            return None
        prev = tweet[:idx].split()
        after = tweet[idx+len(selected_text):].split()
        # (r1, r2): keep prev[r1:] on the left and after[:r2] on the right.
        # (len(prev), 0) keeps nothing on either side and is excluded.
        pool = [(i, j)
                for i in range(len(prev)+1)
                for j in range(len(after)+1)
                if (i, j) != (len(prev), 0)]
        if not pool:
            return None
        new_tweets = []
        for r in np.random.choice(len(pool), multiple):
            r1, r2 = pool[r]
            # r1 is a slice start: r1 == 0 keeps ALL preceding words and
            # r1 == len(prev) keeps none (and must not add a stray space).
            start = ' '.join(prev[r1:]) + ' ' if r1 < len(prev) else ''
            end = ' ' + ' '.join(after[:r2]) if r2 > 0 else ''
            new_tweets.append(start + selected_text + end)
        return new_tweets or None

    train_aug = {'text':[],'selected_text':[], 'sentiment':[], 'textID':[]}
    for _, row in train.iterrows():
        new_tweets = aug(row)
        if not new_tweets:
            continue
        for new_tweet in new_tweets:
            train_aug['text'].append(new_tweet)
            train_aug['selected_text'].append(row['selected_text'])
            train_aug['sentiment'].append(row['sentiment'])
            train_aug['textID'].append(row['textID'])
    return pd.DataFrame(train_aug).dropna()

In [4]:
def unique_word_fraction(text):
    """Return the share of distinct words among all words in ``text``.

    The text is split on single spaces, punctuation characters are
    stripped from every piece and empty pieces are discarded.  Returns 0
    when no words remain.
    """
    words = []
    for piece in text.split(' '):
        cleaned = ''.join(ch for ch in piece if ch not in string.punctuation)
        if cleaned:
            words.append(cleaned)
    if not words:
        return 0
    return len(set(words)) / len(words)

eng_stopwords = set(stopwords.words("english"))
def stopwords_count(text):
    """Return the fraction of words in ``text`` that are English stopwords.

    The text is lower-cased, split on single spaces and stripped of
    punctuation; empty pieces are ignored.  Returns 0 when no words
    remain.
    """
    words = []
    for piece in text.lower().split(' '):
        cleaned = ''.join(ch for ch in piece if ch not in string.punctuation)
        if cleaned:
            words.append(cleaned)
    n_stop = sum(1 for word in words if word in eng_stopwords)
    if not words:
        return 0
    return n_stop / len(words)


def punctuations_fraction(text):
    """Return the fraction of characters in ``text`` that are punctuation.

    Returns 0 for an empty string.
    """
    total_chars = len(text)
    if total_chars == 0:
        return 0
    n_punct = sum(1 for ch in text if ch in string.punctuation)
    return n_punct / total_chars


def char_count(text):
    """Return the number of characters in ``text``."""
    n_chars = len(text)
    return n_chars

def _pos_tag_fraction(text, tags):
    """Return the fraction of words in ``text`` whose POS tag is in ``tags``.

    The text is split on single spaces, punctuation is stripped from each
    piece and empty pieces are dropped.  Returns 0 when no words remain;
    in that case the tagger is not invoked at all.
    """
    words = [''.join(c for c in s if c not in string.punctuation) for s in text.split(' ')]
    words = [w for w in words if w]
    if not words:
        return 0
    tagged = nltk.pos_tag(words)
    matches = sum(1 for _, tag in tagged if tag in tags)
    return matches / len(words)

def fraction_noun(text):
    """Return the fraction of words in ``text`` tagged as nouns (NN, NNP, NNPS, NNS)."""
    return _pos_tag_fraction(text, ('NN','NNP','NNPS','NNS'))

def fraction_adj(text):
    """Return the fraction of words in ``text`` tagged as adjectives (JJ, JJR, JJS)."""
    return _pos_tag_fraction(text, ('JJ','JJR','JJS'))

def fraction_verbs(text):
    """Return the fraction of words in ``text`` tagged as verbs (VB, VBD, VBG, VBN, VBP, VBZ)."""
    return _pos_tag_fraction(text, ('VB','VBD','VBG','VBN','VBP','VBZ'))


In [5]:
from collections import Counter
from nltk import FreqDist

class CustomizedBertForQA:
    """RoBERTa-based start/end token classifier for selected-text extraction.

    The class turns tweets into token-id arrays, builds a Keras model that
    combines the transformer output with extra hand-crafted features,
    trains it with cross-validation and decodes predicted start/end
    positions back into text.
    """

    def __init__(self, tokenizer, config_file, bert_model_file, max_length, n_splits):
        """Store configuration and pre-compute the sentiment token ids.

        Parameters
        ----------
        tokenizer : object exposing ``encode(text).ids`` and ``decode(ids)``.
        config_file : path handed to ``RobertaConfig.from_pretrained``.
        bert_model_file : path handed to ``TFRobertaModel.from_pretrained``.
        max_length : int, fixed length of every model input row.
        n_splits : int, number of folds (1 means a single 75/25 split).
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        # First token id the tokenizer produces for each sentiment word.
        self.sentiment_id = {s:self.tokenizer.encode(s).ids[0] for s in ['positive', 'negative', 'neutral']}
        self.sentiment_mapping = {'positive':0,'negative':1,'neutral':2}
        self.config_file = config_file
        self.bert_model_file = bert_model_file
        self.n_splits = n_splits

    def bert_data_transform(self, data, train=True):
        """Transform a DataFrame into the integer arrays fed to the model.

        Parameters
        ----------
        data : pandas.DataFrame with ``text`` and ``sentiment`` columns
            (plus ``selected_text`` when ``train`` is True).
        train : bool, whether to also build one-hot start/end targets.

        Returns
        -------
        ``(input_ids, input_ids_nosent, attention_mask)`` and, when
        ``train`` is True, additionally ``start_tokens`` and
        ``end_tokens``; every array has shape ``(len(data), max_length)``.

        Raises
        ------
        ValueError
            If a tokenized tweet plus the 5 special tokens does not fit
            into ``max_length``.
        """
        data = data.reset_index(drop=True)
        ct = data.shape[0]
        # Rows are pre-filled with 1 (presumably the <pad> id of the
        # RoBERTa vocabulary -- TODO confirm against the vocab file).
        input_ids = np.ones((ct,self.max_length),dtype='int32')
        input_ids_nosent = np.ones((ct,self.max_length),dtype='int32')
        attention_mask = np.zeros((ct,self.max_length),dtype='int32')
        if train:
            start_tokens = np.zeros((ct,self.max_length),dtype='int32')
            end_tokens = np.zeros((ct,self.max_length),dtype='int32')

        for k in range(ct):
            # Normalize whitespace and add the leading space the tokenizer expects.
            text1 = " "+" ".join(data.loc[k,'text'].split())
            enc = self.tokenizer.encode(text1)
            s_tok = self.sentiment_id[data.loc[k,'sentiment']]
            n_used = len(enc.ids)+5
            if n_used > self.max_length:
                raise ValueError(
                    'row %i needs %i tokens but max_length is %i'
                    % (k, n_used, self.max_length))
            # Layout: 0, text tokens, 2, 2, sentiment token, 2
            # (0 and 2 are presumably <s> and </s> -- TODO confirm).
            input_ids[k,:n_used] = [0] + enc.ids + [2,2] + [s_tok] + [2]
            # Same layout with the sentiment token replaced by 2.
            input_ids_nosent[k,:n_used] = [0] + enc.ids + [2,2] + [2] + [2]
            attention_mask[k,:n_used] = 1

            if train:
                # Character mask of the selected text inside text1.
                text2 = " ".join(data.loc[k,'selected_text'].split())
                idx = text1.find(text2)
                chars = np.zeros((len(text1)))
                # Only mark characters when the selection was actually
                # found; a -1 result must not be used as an index.
                if idx >= 0:
                    chars[idx:idx+len(text2)]=1
                    # Include the space in front of the selection, without
                    # wrapping around to the end of the string at idx == 0.
                    if idx > 0 and text1[idx-1]==' ': chars[idx-1] = 1

                # Character offsets of each token, from its decoded length.
                offsets = []; pos=0
                for t in enc.ids:
                    w = self.tokenizer.decode([t])
                    offsets.append((pos,pos+len(w)))
                    pos += len(w)

                # Tokens overlapping at least one selected character.
                toks = [i for i,(a,b) in enumerate(offsets) if np.sum(chars[a:b])>0]

                if len(toks)>0:
                    # +1 accounts for the leading 0 token in input_ids.
                    start_tokens[k,toks[0]+1] = 1
                    end_tokens[k,toks[-1]+1] = 1
        if train:
            return (input_ids, input_ids_nosent, attention_mask, start_tokens, end_tokens)
        else:
            return (input_ids, input_ids_nosent, attention_mask)

    def customized_features(self, data, input_ids, version, fold):
        """Compute the additional (non-transformer) model inputs.

        You can design any type of features here; ``build_model`` and
        every place that consumes the features must be changed to match.

        Parameters
        ----------
        data : pandas.DataFrame passed to ``featureEng``.
        input_ids, version, fold : currently unused; kept so existing
            callers keep working (the sentiment-model based features that
            consumed them are disabled).

        Returns
        -------
        ``(add_features, add_features_2D)`` where ``add_features`` holds
        the values of the engineered feature frame and ``add_features_2D``
        is an empty ``(len(data), max_length, 0)`` placeholder.
        """
        ct = data.shape[0]
        add_features_2D = np.zeros((ct,self.max_length,0),dtype='float')

        # featureEng is presumably provided by `from featureEng import *`
        # -- its exact output columns are not visible here.
        features = featureEng(data,training=False)
        # Keep only engineered columns; test data has no 'selected_text',
        # hence errors='ignore'.
        features = features.drop(['textID','text','selected_text','sentiment'],axis=1,errors='ignore')
        add_features = features.values
        return add_features, add_features_2D

    def build_model(self, add_features_shape, add_features_2D_shape):
        """Build and compile the start/end prediction model on top of RoBERTa.

        Parameters
        ----------
        add_features_shape : shape of the 2-D feature array
            ``(n_samples, n_features)``; only ``n_features`` is used.
        add_features_2D_shape : shape of the 3-D feature array
            ``(n_samples, max_length, n_channels)``.

        Returns
        -------
        A compiled Keras ``Model`` with inputs
        ``[ids, att, add_features, add_features_2D]`` and two softmax
        outputs named ``'start'`` and ``'end'``.
        """
        ids = Input((self.max_length,), dtype=tf.int32)
        att = Input((self.max_length,), dtype=tf.int32)

        add_features = Input((add_features_shape[1],),dtype=tf.float32)
        add_features_2D = Input((add_features_2D_shape[1],add_features_2D_shape[2]), dtype=tf.float32)

        config = RobertaConfig.from_pretrained(self.config_file)
        bert_model = TFRobertaModel.from_pretrained(self.bert_model_file,config=config)

        x = bert_model(ids,attention_mask=att)

        def output_layer(bert_output, add_features, add_features_2D, name='start'):
            # Per-token score derived from the transformer output.
            x_bert = Dropout(0.3)(bert_output)
            x_bert = Conv1D(128, 3,padding='same')(x_bert)
            x_bert = LeakyReLU()(x_bert)
            x_bert = Conv1D(64, 3,padding='same')(x_bert)
            x_bert = Dense(1)(x_bert)
            x_bert = Reshape((self.max_length, 1))(x_bert)

            # Project the flat features to one value per token position.
            # The input size is inferred from the tensor, so no
            # input_dim argument is needed.
            x_add = Dense(self.max_length, activation = 'relu')(add_features)
            x_add = Reshape((self.max_length, 1))(x_add)
            x_add_2D = add_features_2D
            x_combined = Concatenate()([x_bert, x_add, x_add_2D])
            x_combined = Conv1D(128, 4, padding='same')(x_combined)
            x_combined = Dense(64, activation = 'linear')(x_combined)
            x_combined = Dense(1, activation = 'relu')(x_combined)
            x_combined = Flatten()(x_combined)
            x_output = Activation('softmax', name=name)(x_combined)

            return x_output

        x1_output = output_layer(x[0], add_features, add_features_2D, name='start')
        x2_output = output_layer(x[0], add_features, add_features_2D, name='end')

        model = Model(inputs=[ids, att, add_features, add_features_2D], outputs=[x1_output,x2_output])
        optimizer = Adam(learning_rate=3e-5)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics=['acc'])

        return model

    def train_model(self, train, epochs=3, batch_size=32, version='v0', model_path=None, verbose=1):
        """Train (or load) one model per fold and print Jaccard scores.

        Parameters
        ----------
        train : pandas.DataFrame with ``text``, ``selected_text`` and
            ``sentiment`` columns.
        epochs, batch_size, verbose : forwarded to ``model.fit`` /
            ``model.predict`` (use ``verbose=1`` for interactive runs).
        version : prefix of the checkpoint file names.
        model_path : when None a new model is trained and checkpointed to
            ``'<version>-roberta-<fold>.h5'``; otherwise weights are
            loaded from ``'<model_path>/<version>-roberta-<fold>.h5'``.

        Notes
        -----
        After training, predictions are made with the weights of the last
        epoch; the best checkpoint is only written to disk.  No early
        stopping is applied.
        """
        jacs = []

        if self.n_splits == 1:
            rs = ShuffleSplit(n_splits=1, test_size=0.25, random_state=777)
        else:
            rs = StratifiedKFold(n_splits=self.n_splits,shuffle=True,random_state=777)

        # These do not depend on the fold, so compute them once.
        train = train.reset_index(drop=True)
        input_ids, input_ids_nosent, attention_mask, start_tokens, end_tokens = self.bert_data_transform(train, train=True)

        for fold,(idxT,idxV) in enumerate(rs.split(train.text,train.sentiment.values)):
            print('#'*25)
            print('### FOLD %i'%(fold+1))
            print('#'*25)
            oof_start = np.zeros((input_ids.shape[0],self.max_length))
            oof_end = np.zeros((input_ids.shape[0],self.max_length))
            # Release the previous fold's graph before building a new model.
            K.clear_session()

            add_features, add_features_2D = self.customized_features(train, input_ids, version, fold)
            model = self.build_model(add_features.shape, add_features_2D.shape)

            if model_path is None:
                print('Training model...')

                sv = ModelCheckpoint('%s-roberta-%i.h5'%(version,fold), monitor='val_loss', verbose=1, save_best_only=True,
                                     save_weights_only=True, mode='auto', save_freq='epoch')
                model.fit([input_ids[idxT,], attention_mask[idxT,], add_features[idxT,], add_features_2D[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]],
                    epochs=epochs, batch_size=batch_size, verbose=verbose, callbacks=[sv],
                    validation_data=([input_ids[idxV,],attention_mask[idxV,], add_features[idxV,],add_features_2D[idxV,]],
                    [start_tokens[idxV,], end_tokens[idxV,]]))
            else:
                print('Loading model...')
                model.load_weights('%s/%s-roberta-%i.h5'%(model_path,version,fold))

            # 'INF' scores the rows the model was fitted on, 'OOF' the held-out rows.
            for label, idx in {'INF':idxT, 'OOF':idxV}.items():
                print('Predicting %s...'%label)
                oof_start[idx,],oof_end[idx,] = model.predict([input_ids[idx,],
                                                               attention_mask[idx,],
                                                               add_features[idx,],
                                                               add_features_2D[idx,]],verbose=verbose)

                # DISPLAY FOLD JACCARD
                all_selected_text = self.predict_decode(train, oof_start, oof_end, idx)
                all_jac = [jaccard(all_selected_text[i],train.loc[idx[i],'selected_text'])
                           for i in range(len(idx))]
                print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all_jac))

                # Only held-out scores count towards the CV average.
                if label == 'OOF':
                    jacs.append(np.mean(all_jac))

        print('>>>> OVERALL %i Fold CV Jaccard ='%self.n_splits,np.mean(jacs))

    def predict_test(self, test, version='v0', model_path=None, verbose=1):
        """Predict selected text for unlabeled data, averaging all folds.

        Parameters
        ----------
        test : pandas.DataFrame with ``text`` and ``sentiment`` columns.
        version : prefix of the checkpoint file names.
        model_path : directory holding the checkpoints, or None to read
            them from the working directory.
        verbose : forwarded to ``model.predict``.

        Returns
        -------
        ``(all_selected_text, test)``: a ``pandas.Series`` of predicted
        strings and the index-reset copy of ``test`` it aligns with.
        """
        test = test.reset_index(drop=True)
        input_ids, input_ids_nosent, attention_mask = self.bert_data_transform(test, train=False)
        preds_start = np.zeros((input_ids.shape[0],self.max_length))
        preds_end = np.zeros((input_ids.shape[0],self.max_length))

        for fold in range(self.n_splits):
            K.clear_session()
            add_features, add_features_2D = self.customized_features(test, input_ids_nosent, version, fold)
            model = self.build_model(add_features.shape, add_features_2D.shape)

            if model_path is None:
                model.load_weights('%s-roberta-%i.h5'%(version,fold))
            else:
                model.load_weights('%s/%s-roberta-%i.h5'%(model_path,version,fold))
            print('Predicting Test...')
            preds = model.predict([input_ids,attention_mask,add_features,add_features_2D],verbose=verbose)
            # Accumulate the mean of the per-fold probabilities.
            preds_start += preds[0]/self.n_splits
            preds_end += preds[1]/self.n_splits

        all_selected_text = self.predict_decode(test, preds_start, preds_end, test.index)

        return all_selected_text, test

    def predict_decode(self, text_data, vec_start, vec_end, vec_idx):
        """Convert start/end score vectors back into text spans.

        Parameters
        ----------
        text_data : pandas.DataFrame with a ``text`` column.
        vec_start, vec_end : arrays indexed by row label whose rows hold
            per-position scores; the argmax is taken as the position.
        vec_idx : iterable of row labels to decode.

        Returns
        -------
        ``pandas.Series`` (default 0..n-1 index) with one string per
        entry of ``vec_idx``.  When the predicted start lies after the
        predicted end the full tweet text is returned.
        """
        all_selected_text = []
        for k in vec_idx:
            text1 = " "+" ".join(text_data.loc[k,'text'].split())
            enc = self.tokenizer.encode(text1)
            a = np.argmax(vec_start[k,])
            b = np.argmax(vec_end[k,])
            if a>b:
                st = text_data.loc[k,'text']
            else:
                # Positions are shifted by one because of the leading
                # special token.  Clamp at 0 so a start prediction on that
                # token does not turn into the negative index -1.
                st = self.tokenizer.decode(enc.ids[max(a-1,0):b])
            all_selected_text.append(st)
        return pd.Series(all_selected_text)

    def build_sentiment_model(self):
        """Build and compile a small CNN sentiment classifier over token ids.

        The embedding size comes from the module-level ``VOCAB_SIZE``,
        which must be defined before this method is called.  Returns a
        compiled Keras ``Model`` with a 3-way softmax output.
        """
        ids = Input((self.max_length,), dtype=tf.int32)
        x = Embedding(input_dim=VOCAB_SIZE, input_length=self.max_length, output_dim=512, name="Embedding-1")(ids)
        x = Dropout(0.2)(x)
        x = Conv1D(128, 2,padding='same')(x)
        x = LeakyReLU()(x)
        x = Conv1D(self.max_length, 2,padding='same',name='conv1d-2')(x)
        x = GlobalMaxPooling1D(name="MaxPooling1D")(x)
        x = Dense(3)(x)
        out = Activation('softmax',name='softmax')(x)
        sentiment_model = Model(inputs=[ids], outputs=[out])
        optimizer = Adam(learning_rate=8e-5)
        sentiment_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["acc"])

        return sentiment_model

In [6]:
# Load the competition CSVs; missing values are replaced with empty strings.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [8]:
# Preview the first rows of the test data (no selected_text column).
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [9]:
# Directory holding the RoBERTa vocabulary, merges, config and weights.
# PATH already ends with '/', so the two tokenizer paths below contain '//'.
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer that lower-cases its input and adds a prefix space.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'/vocab-roberta-base.json', 
    merges_file=PATH+'/merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)

# Embedding input size read by CustomizedBertForQA.build_sentiment_model.
VOCAB_SIZE = 50156
SAMPLE_RUN = False # Set True if you just want to debug implementation. Otherwise False.
MODEL_PATH = None # Set None if you want to train a new model, otherwise specify the PATH of trained model
N_SPLITS = 1 # 1 means train validation split at portion 0.75:0.25
# Fixed length of every model input row (passed as max_length).
MAX_LEN = 108

In [10]:
# Run the whole pipeline: train (or load) the model, predict the test set.
if SAMPLE_RUN:
    # Debug path: tiny random samples, results are only printed.
    sample_train = train.sample(100)
    sample_test = test.sample(3)

    sample_model = CustomizedBertForQA(tokenizer, config_file=PATH+'config-roberta-base.json', 
                                       bert_model_file=PATH+'pretrained-roberta-base.h5', 
                                       max_length=MAX_LEN, n_splits=N_SPLITS)
    sample_model.train_model(sample_train, epochs=3, batch_size=32, version='v0', model_path=MODEL_PATH, verbose=1)
    
    # predict_test returns the index-reset frame the predictions align with.
    sample_selected_text, sample_test = sample_model.predict_test(sample_test, version='v0', model_path=MODEL_PATH, verbose=1)
    sample_test['selected_text'] = sample_selected_text
    pd.set_option('max_colwidth', 60)
    print(sample_test)
else:
    # Full path: train on all data and write submission.csv.
    full_model = CustomizedBertForQA(tokenizer, config_file=PATH+'config-roberta-base.json',
                                     bert_model_file=PATH+'pretrained-roberta-base.h5', 
                                     max_length=MAX_LEN, n_splits=N_SPLITS)
    full_model.train_model(train, epochs=5, batch_size=32, version='v0', model_path=MODEL_PATH, verbose=1)

    # NOTE: this rebinds the global `test` to the index-reset copy.
    all_selected_text, test = full_model.predict_test(test, version='v0', model_path=MODEL_PATH, verbose=1)
    test['selected_text'] = all_selected_text
    test[['textID','selected_text']].to_csv('submission.csv',index=False)
    pd.set_option('max_colwidth', 60)
    print(test.sample(25))

#########################
### FOLD 1
#########################


  out=out, **kwargs)


Training model...
Epoch 1/5
Epoch 00001: val_loss improved from inf to 1.73523, saving model to v0-roberta-0.h5
Epoch 2/5
Epoch 00002: val_loss improved from 1.73523 to 1.68600, saving model to v0-roberta-0.h5
Epoch 3/5
Epoch 00003: val_loss improved from 1.68600 to 1.67550, saving model to v0-roberta-0.h5
Epoch 4/5
Epoch 00004: val_loss did not improve from 1.67550
Epoch 5/5
Epoch 00005: val_loss did not improve from 1.67550
Predicting INF...
>>>> FOLD 1 Jaccard = 0.8260213964936318
Predicting OOF...
>>>> FOLD 1 Jaccard = 0.6995461784276381
>>>> OVERALL 1 Fold CV Jaccard = 0.6995461784276381


KeyError: "['selected_text'] not found in axis"