In [117]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import os
import tensorflow_datasets as tfds
from tqdm import tqdm
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from sklearn.model_selection import train_test_split
print("Hello")

Hello


In [119]:
# Pool the three labelled CSVs into one frame, then carve a fixed, seeded
# 50-row hold-out off it; everything else is kept for training.
ref1 = pd.read_csv("final_data/test.csv")
ref2 = pd.read_csv("final_data/val.csv")
train_df = pd.concat([pd.read_csv("final_data/train.csv"), ref1, ref2])
print(train_df.shape)

train_df, test_df = train_test_split(
    train_df,
    test_size=50,      # integer => absolute number of hold-out rows
    random_state=0,
    shuffle=True,
)
print(train_df.shape, test_df.shape, sep="\n")

(17494, 2)
(17444, 2)
(50, 2)


In [120]:
# Append the rows from best_semi_supervised_data.csv to the training frame,
# printing the size of the extra data and of the combined frame.
# NOTE(review): presumably these are pseudo-labelled rows produced by an
# earlier model run -- confirm the provenance of this file.
# NOTE(review): train_df is rebound from itself here, so running this cell
# twice without re-running the loading cell appends ref3 twice.
ref3=pd.read_csv("final_data/best_semi_supervised_data.csv")
print(ref3.shape)
train_df=pd.concat([train_df,ref3])
print(train_df.shape)

(5688, 2)
(23132, 2)


In [121]:
# Show the label distribution and the "balanced" per-class weights derived
# from it.
print(train_df.label.value_counts())
# `classes` and `y` are passed by keyword: newer scikit-learn releases make
# them keyword-only, and the keyword form works on older releases as well.
print(compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=train_df.label))

1    13378
0     9754
Name: label, dtype: int64
[1.18576994 0.86455374]


In [122]:
# def augmentText(df):
#     from tqdm import tqdm
#     aug = naw.SynonymAug(aug_src='wordnet')
#     output=[]
#     for i in tqdm(df.text):
#         output.append(aug.augment(i))
#     df['text']=output
#     return df

In [123]:
# ref=augmentText(train_df)

In [124]:
# train_df=pd.concat([train_df,ref])
# print(train_df.shape)

In [125]:
def convert_example_to_feature(x):
    """Encode one review with the module-level tokenizer.

    Before encoding, the last 30% of the characters of ``x`` are prepended
    to the full text, separated by a single space. Texts too short to have
    a non-empty 30% tail (fewer than 4 characters) are encoded unchanged.

    Args:
        x: Review text as a ``str``.

    Returns:
        The result of ``tokenizer.encode_plus`` (padded to ``max_length=300``);
        callers read its ``input_ids`` and ``attention_mask`` entries.
    """
    threshold = int(0.3 * len(x))
    # Guard the short-text case: when threshold is 0, x[-0:] is the *whole*
    # string, which used to duplicate the review instead of adding a tail.
    if threshold > 0:
        x = x[-threshold:] + " " + x
    # NOTE(review): newer transformers releases deprecate pad_to_max_length in
    # favour of padding='max_length' (plus truncation=True); left as-is so the
    # call keeps working with the version this notebook was written against.
    bert_input = tokenizer.encode_plus(
        x,
        add_special_tokens=True,
        max_length=300,
        pad_to_max_length=True,
        return_attention_mask=True,
    )
    return bert_input

In [126]:
def map_example_to_dict(input_ids, attention_masks, label):
    """Repackage a flat (ids, mask, label) triple as a (features, label) pair.

    The features dict is keyed by "input_ids" and "attention_mask"; the label
    is passed through untouched.
    """
    features = dict(input_ids=input_ids, attention_mask=attention_masks)
    return features, label

In [127]:
# Raw (review text, label) pairs for training, as string / int32 tensors.
training_dataset = tf.data.Dataset.from_tensor_slices((
    tf.cast(train_df.text.values, tf.string),
    tf.cast(train_df.label.values, tf.int32),
))

In [128]:
# Raw (review text, label) pairs for the hold-out rows, built the same way
# as the training pairs.
test_dataset = tf.data.Dataset.from_tensor_slices((
    tf.cast(test_df.text.values, tf.string),
    tf.cast(test_df.label.values, tf.int32),
))

In [129]:
def encode_examples(ds, limit=-1):
    """Tokenise a dataset of (text, label) pairs into model-ready features.

    Args:
        ds: Dataset yielding (bytes text, label) pairs.
        limit: When positive, only the first ``limit`` examples are encoded;
            otherwise the whole dataset is used.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``({"input_ids", "attention_mask"}, label)`` pairs.
    """
    if limit > 0:
        ds = ds.take(limit)

    # Column-wise accumulators; the final dataset is sliced from these.
    ids_rows, mask_rows, label_rows = [], [], []
    for raw_text, raw_label in tqdm(tfds.as_numpy(ds)):
        encoded = convert_example_to_feature(raw_text.decode())
        ids_rows.append(encoded['input_ids'])
        mask_rows.append(encoded['attention_mask'])
        label_rows.append([raw_label])

    sliced = tf.data.Dataset.from_tensor_slices((ids_rows, mask_rows, label_rows))
    return sliced.map(map_example_to_dict)


In [130]:
# Tokenise the hold-out pairs and group them into batches of 16 (no shuffling).
test_ds = encode_examples(test_dataset).batch(16)

50it [00:00, 314.44it/s]


In [131]:
# Tokenise the training pairs, shuffle them, and group into batches of 16.
# The shuffle buffer covers the whole dataset: tf.data only mixes elements
# inside the buffer, so the previous 10000-element buffer (smaller than the
# training set) kept the rows appended last to train_df out of the early
# batches of every epoch.
train_ds = encode_examples(training_dataset).shuffle(len(train_df)).batch(16)

23132it [00:33, 688.10it/s]


In [134]:
# Instantiate the sequence-classification model from the 'roberta-base'
# pretrained checkpoint.
# NOTE(review): no configuration overrides (e.g. label count, dropout) are
# passed here -- confirm the library defaults are what is intended.
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

In [135]:
# model.config

In [160]:
# Fine-tuning hyper-parameters and optimiser.
learning_rate = 1e-5
number_of_epochs = 3
optimizer = tf.keras.optimizers.Nadam(learning_rate=learning_rate)

# Integer class labels scored against raw logits (hence from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# NOTE(review): patience equals number_of_epochs, so with a 3-epoch run this
# callback can never actually stop training early -- confirm the intent.
early = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=3,
    verbose=1,
    mode='auto',
)
# Save weights only: every consumer of these .h5 files in this notebook uses
# model.load_weights(), and whole-model HDF5 export is not generally supported
# for subclassed Keras models such as the transformers TF classes.
checkpoint = ModelCheckpoint(
    "robert/models/checkpoint-{epoch:02d}-{val_accuracy:.4f}.h5",
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    save_freq='epoch',
)

# Keras documents `metrics` as a list of metrics.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [161]:
# Start from previously saved "baseline" weights before any training below.
# NOTE(review): this checkpoint is not produced by any visible cell -- it must
# already exist on disk for the notebook to run.
model.load_weights("robert/models/checkpoint-baseline-0.9480.h5")

In [162]:
# Display the active model configuration.
# NOTE(review): the recorded output shows both dropout probabilities at 0.2,
# yet the visible from_pretrained call passes no overrides -- confirm where
# the config was changed (possibly a cell that has since been removed).
model.config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.2,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [165]:
# Fine-tune, validating on the hold-out batches after every epoch.
# Class weights are derived from the current training labels instead of being
# pasted in as literals: the literals previously passed here
# (1.1865453 / 0.86414203) did not match the weights this notebook prints for
# the current train_df (1.18576994 / 0.86455374).
balanced_weights = compute_class_weight(
    class_weight='balanced', classes=np.array([0, 1]), y=train_df.label)
bert_history = model.fit(train_ds, epochs=number_of_epochs, validation_data=test_ds,
                         callbacks=[early, checkpoint],
                         class_weight={0: float(balanced_weights[0]),
                                       1: float(balanced_weights[1])})

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 0.92000, saving model to robert/models/checkpoint-01-0.9200.h5
Epoch 2/3
Epoch 00002: val_accuracy improved from 0.92000 to 0.94000, saving model to robert/models/checkpoint-02-0.9400.h5
Epoch 3/3
Epoch 00003: val_accuracy did not improve from 0.94000


In [166]:
# Load the reviews to be scored for the submission and preview the first rows.
df=pd.read_csv("final_data/submission_test.csv")
df.head()

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...


In [167]:
# Tokenise every submission review with the same preprocessing used for the
# training data, collecting ids and masks column-wise (token_type_ids are
# deliberately not collected).
input_data = {'input_ids': [], 'attention_mask': []}
for review_text in tqdm(df.user_review):
    encoded = convert_example_to_feature(review_text)
    for field in ('input_ids', 'attention_mask'):
        input_data[field].append(encoded[field])

100%|██████████| 8045/8045 [00:08<00:00, 972.85it/s] 


In [169]:
# Turn the collected Python lists into NumPy arrays before prediction
# (token_type_ids are not used).
input_data.update(
    input_ids=np.array(input_data['input_ids']),
    attention_mask=np.array(input_data['attention_mask']),
)

In [176]:
# NOTE(review): load_model is imported here but not used in any visible cell.
from tensorflow.keras.models import load_model
# NOTE(review): this reloads the baseline checkpoint, so the predictions below
# come from the baseline weights rather than from the fine-tuning run earlier
# in the notebook -- confirm that is intended.
model.load_weights("robert/models/checkpoint-baseline-0.9480.h5")
# Score every submission review.
pred=model.predict(input_data)

In [177]:
# max_array=pred[0].max(axis=1)
# min_array=pred[0].min(axis=1)
# diff=max_array-min_array
# labels=np.argmax(pred[0],axis=1)

# df['diff']=diff
# df['new_labels']=labels
# df.to_csv("final_data/semi_supervised_data.csv",index=None)

In [178]:
# Pick the highest-scoring class per row; pred[0] is assumed to hold the
# per-class scores with one row per review -- TODO confirm.
final_preds=np.argmax(pred[0],axis=1)

In [179]:
# Attach the predicted class to each review as the submission column.
df['user_suggestion']=final_preds

In [180]:
# Display the reviews together with their predicted labels for a visual check.
df

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B...",1
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...,0
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...,0
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...,0
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...,1
...,...,...,...,...,...
8040,25198,GUNS UP!,2017.0,Early Access ReviewGuns UP!Positive: Good Idea...,1
8041,25199,GUNS UP!,2018.0,"After 170 hrs, my review. DO NOT SPEND ANY MON...",1
8042,25200,GUNS UP!,2018.0,Pros:-Fun GameplayCons: -Micro %$#*ing transac...,0
8043,25201,GUNS UP!,2018.0,"Actualy saucy, I definetly suggest to players ...",1


In [181]:
# Write the submission file: only the review id and its predicted label.
output = df.loc[:, ['review_id', 'user_suggestion']]
output.to_csv("final_data/final_submit.csv", index=None)