In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**If you like this notebook, please upvote it.**

# import packages

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import re
from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Restrict the transformers library's log output to errors only.
transformers.logging.set_verbosity_error()
# NOTE(review): presumably turns off the Weights & Biases integration so no
# login/network access is attempted during training — confirm it is still needed,
# since this notebook trains with Keras `model.fit`, not the HF `Trainer`.
os.environ["WANDB_DISABLED"] = "true"

# load data

In [None]:
# Read the competition's train, test and sample-submission CSV files
# from the Kaggle input directory.
train=pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
test=pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')
submission=pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv')

In [None]:
train


# cpc tags

In [None]:
# Load the CPC titles table and rename its "code" column to "context" so it
# shares a key column name with the competition frames.
cpc_codes = (
    pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
    .rename(columns={"code": "context"})
)
cpc_codes

In [None]:
train['context'].value_counts()

In [None]:
test.head()

In [None]:
test['context'].value_counts()

In [None]:
# Attach the CPC title to every row by left-joining on the "context" code,
# so rows are kept even when a code has no matching title.
cpc_titles = cpc_codes[["context", "title"]]
train_data = train.merge(cpc_titles, on="context", how="left")
test_data = test.merge(cpc_titles, on="context", how="left")

In [None]:
train_data.head()

In [None]:
# Sequence length used by the tokenizer sanity check further down; it is
# reassigned before the real training data is encoded.
max_len=128

# Strip ';' and ',' from the CPC titles. The titles come from a left merge, so a
# context without a matching code yields NaN; fill those with an empty string
# first so the cleanup (and the later string concatenation) does not fail on
# non-string values. The vectorised `.str.replace` replaces the per-row
# `re.sub` lambda and produces the same result for string titles.
train_data["title"] = train_data["title"].fillna("").str.replace('[;,]', '', regex=True)
test_data["title"] = test_data["title"].fillna("").str.replace('[;,]', '', regex=True)

In [None]:
# Build the model input as "anchor</s>target</s>title".
# The tokenizer used by this notebook is RoBERTa's, whose separator token is
# '</s>'; '[SEP]' is a BERT token that RoBERTa would split into ordinary
# sub-word pieces instead of treating it as a separator.
train_data['text'] = train_data['anchor'] + '</s>' + train_data['target'] + '</s>' + train_data['title']
test_data['text'] = test_data['anchor'] + '</s>' + test_data['target'] + '</s>' + test_data['title']

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel
# tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
# model = TFAutoModel.from_pretrained("readerbench/RoBERT-base")
# inputs = tokenizer("exemplu de propoziție", return_tensors="tf")
# outputs = model(inputs)

In [None]:
# Load the RoBERTa tokenizer from the local ../input/roberta-base/ directory.
tokenizer = RobertaTokenizer.from_pretrained("../input/roberta-base/")

In [None]:
# Sanity check: encode one sample sentence, padded/truncated to `max_len`,
# so the tokenizer output can be inspected in the next cell.
d=tokenizer.batch_encode_plus(['my name is Gopal Goyal'], max_length=max_len,
                                        padding='max_length',
                                        truncation=True)

In [None]:
d


# Create data for fine-tuning the model

In [None]:
# Sequence length used for the real encoding and for the model inputs
# (replaces the value set earlier for the tokenizer sanity check).
max_len=256
def create_data(id_,train_data,train=True):
    """Tokenise the ``text`` column and collect model inputs, ids and labels.

    Args:
        id_: Sequence (e.g. a pandas Series or list) of row identifiers, in the
            same order as the rows of ``train_data``.
        train_data: DataFrame with a ``text`` column and, when ``train`` is
            True, a ``score`` column.
        train: When True, labels are read from ``train_data['score']``;
            otherwise the returned label list is empty.

    Returns:
        A tuple ``(features, labels)`` where ``features`` is a dict with the
        keys ``"input_ids"``, ``"attention_mask"`` and ``"ids"`` (each a list
        with one entry per row) and ``labels`` is a list of scores.
    """
    # Hand the tokenizer a plain list of strings rather than a Series.
    texts = train_data['text'].tolist()
    tok_text = tokenizer.batch_encode_plus(texts,
                                           max_length=max_len,
                                           padding='max_length',
                                           truncation=True)

    # Convert positionally: the previous `id_[i]` / `train_data['score'][i]`
    # lookups were label-based and broke (or misaligned rows) whenever the
    # frame did not carry a default 0..n-1 index.
    features = {"input_ids": list(tok_text['input_ids']),
                "attention_mask": list(tok_text['attention_mask']),
                "ids": list(id_)}
    labels = train_data['score'].tolist() if train else []
    return features, labels

In [None]:
# Tokenise the training frame.
# NOTE: this rebinds `train_data` from the merged DataFrame to the feature dict
# returned by create_data, so re-running this cell on its own fails (the dict
# has no 'id' key); re-run the merge/text cells first.
train_data, train_labels = create_data(train_data['id'], train_data, train=True)

In [None]:
# Tokenise the test frame; with train=False the returned `test_labels` is an empty list.
# NOTE: `test_data` is rebound from a DataFrame to the feature dict here, so this
# cell cannot be re-run without first re-creating the DataFrame.
test_data, test_labels = create_data(test_data['id'], test_data, train=False)

In [None]:
train_data['ids'][1]

In [None]:
from tensorflow.keras import activations

def build_model():
    """Build and compile the RoBERTa-based regression model.

    The model takes two int32 inputs of length ``max_len`` (token ids and
    attention mask), runs them through the pretrained RoBERTa encoder loaded
    from ``../input/roberta-base/``, mean-pools the last hidden state over the
    non-padding tokens, and maps the pooled vector through
    Dense(32) -> Dense(16) -> Dense(8) ReLU layers to a single linear output.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and MSE as both
        loss and metric.
    """
    model_ids = Input(shape=(max_len, ), dtype = tf.int32)
    model_mask = Input(shape=(max_len, ), dtype = tf.int32)

    roberta_model = TFRobertaModel.from_pretrained("../input/roberta-base/")

    encoded = roberta_model(input_ids = model_ids,
                            attention_mask = model_mask)

    # Every sequence is padded to max_len, so an unmasked average would be
    # dominated by padding positions. Passing the attention mask makes the
    # pooling layer average over the real tokens only.
    pooling_mask = tf.cast(model_mask, tf.bool)
    x = tf.keras.layers.GlobalAveragePooling1D()(encoded.last_hidden_state,
                                                 mask=pooling_mask)

    # Small regression head on top of the pooled representation.
    x = tf.keras.layers.Dense(32,activation=activations.relu)(x)
    x = tf.keras.layers.Dense(16,activation=activations.relu)(x)
    x = tf.keras.layers.Dense(8,activation=activations.relu)(x)
    outputs = Dense(1)(x)

    model = tf.keras.Model(inputs = [model_ids, model_mask], outputs = outputs)

    # Adam is created with its default learning rate.
    # NOTE(review): the LearningRateScheduler callback passed to `fit` appears
    # to set the effective rate each epoch — confirm it is always supplied.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = "mse",
        metrics=["mse"])
    return model

In [None]:
def scheduler(epoch):
    """Return the learning rate to use for the given epoch index.

    Epoch 0 is a warm-up at 5% of the base rate (2e-5); every later epoch uses
    the base rate multiplied by ``0.9 ** epoch``.
    """
    base_lr = 2e-5
    if epoch != 0:
        # Exponential decay for all epochs after the warm-up.
        return base_lr * (0.9**epoch)
    # Warm-up epoch.
    return base_lr * 0.05
    
# Keras callback that applies `scheduler` to obtain the learning rate for each epoch.
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
model = build_model()

In [None]:
model.summary()

In [None]:
import timeit

# Report whether TensorFlow can see a GPU. The "Found GPU" message is printed
# only when a device was actually found (previously it was printed
# unconditionally, even right after "GPU device not found").
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

In [None]:
# Feature arrays in the order the model's inputs were declared:
# token ids first, then the attention mask.
fit_inputs = (
    np.array(train_data['input_ids']),
    np.array(train_data['attention_mask']),
)
fit_targets = np.array(train_labels).ravel()

# Stop after 3 epochs without val_mse improvement (restoring the best weights),
# checkpoint the best weights to disk, and apply the per-epoch LR schedule.
fit_callbacks = [
    EarlyStopping(monitor='val_mse', patience=3, restore_best_weights=True),
    ModelCheckpoint('roberta_uspppm.h5', monitor='val_mse',
                    save_best_only=True, save_weights_only=True),
    callback_lr,
]

model.fit(
    fit_inputs,
    fit_targets,
    epochs=10,
    shuffle=True,
    callbacks=fit_callbacks,
    batch_size=16,
    validation_split=0.2,
)

In [None]:
# Predict scores for the test set; the inputs are passed in the same
# (input_ids, attention_mask) order used when the model was built.
test_pred=model.predict((np.array(test_data['input_ids']),np.array(test_data['attention_mask'])))

In [None]:
# Flatten the predictions to one value per row and clamp them to [0, 1] in a
# single vectorised step (replaces two element-wise `apply` passes), then
# write the submission file.
# NOTE(review): this assumes the rows of `submission` are in the same order as
# the test rows that were predicted — confirm against the ids.
submission['score'] = np.ravel(test_pred)
submission['score'] = submission['score'].clip(lower=0, upper=1)
submission.to_csv('submission.csv',index=False)
submission