In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
!pip install seaborn
!pip install transformers
!pip install imblearn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load datasets

In [None]:
# Read only the columns needed for modelling from the training CSV.
train_path = '/kaggle/input/read-books-mla2/Train.csv'
train_columns = ['review_id', 'rating', 'review_text']
train_df = pd.read_csv(train_path, usecols = train_columns)
train_df.head(5)

In [None]:
# Read the test CSV; it is loaded without a rating column.
test_path = '/kaggle/input/read-books-mla2/Test.csv'
test_columns = ['review_id', 'review_text']
test_df = pd.read_csv(test_path, usecols = test_columns)
test_df.head(5)

In [None]:
# Keep the test review ids; they are needed again when building the submission.
test_df_ids = test_df['review_id']

## 2. Investigate train dataset a bit

From the dataset description we know that:

**user_id** - Id of user<br/>
**book_id** - Id of Book<br/>
**review_id** - Id of review<br/>
**rating** - rating from 0 to 5<br/>
**review_text** - review text<br/>
**date_added** - date added<br/>
**date_updated** - date updated<br/>
**read_at** - read at<br/>
**started_at** - started at<br/>
**n_votes** - no. of votes<br/>
**n_comments** - no. of comments<br/>

In [None]:
# Report the number of rows in the train and test frames.
print(f'Len of train dataset: {len(train_df)}')
print(f'Len of test dataset: {len(test_df)}')

In [None]:
# Bar chart of how many training reviews fall into each rating value.
sns.countplot(x = train_df.rating)

In [None]:
# Count missing values per column in the training frame.
train_df.isna().sum()

In [None]:
# Count missing values per column in the test frame.
test_df.isna().sum()

## 3. Clear the data

In [None]:
# Split the target off the training frame. `pop` removes the column in place,
# so the guard keeps this cell safe to re-run: on a second execution the column
# is already gone and the previously extracted `y` is left untouched instead of
# raising a KeyError.
if 'rating' in train_df.columns:
    y = train_df.pop('rating')

In [None]:
# Stack train and test review texts (train first) into one Series with a fresh
# 0..n-1 index so both splits go through identical preprocessing.
review_columns = [train_df['review_text'], test_df['review_text']]
common_df = pd.concat(review_columns, ignore_index = True)

In [None]:
# to lower
def to_lower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

In [None]:
#remove stopwords
#from nltk.corpus import stopwords
#stop = set(stopwords.words('english'))
stop = {'a', 'the', 'www', 'http', 'https', 'com'}

def remove_stopwords(text):
    """Drop every whitespace-separated token that is listed in ``stop``.

    The comparison is exact (case-sensitive); the surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
import re
from urllib.parse import urlparse

def remove_url(text):
    """Delete every ``http://`` / ``https://`` URL (up to the next whitespace)."""
    return re.sub(r'https?://\S+', r'', text)

In [None]:
import string

# Every ASCII punctuation character except the period. Periods are treated
# separately below so that decimal numbers such as "3.5" survive intact.
period = '.'
remove = string.punctuation.replace(period, '')

# Compiled once here instead of on every call of remove_punctuation.
_PUNCTUATION_RE = re.compile(r"[{}]".format(re.escape(remove)))


def _is_number(token):
    """Return True when ``token`` can be parsed by ``float``."""
    try:
        float(token)
    except ValueError:
        # Only the "not a number" case is expected; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return False
    return True


def remove_punctuation(text):
    """Replace punctuation with spaces while keeping decimal numbers.

    For each whitespace-separated word:
      * every punctuation character other than '.' becomes a space;
      * leading/trailing periods are stripped;
      * remaining periods become spaces unless the word parses as a number.

    Args:
        text: input string.

    Returns:
        The processed words joined by single spaces. Spaces introduced inside
        a word are kept, so runs of spaces may appear in the result.
    """
    cleaned = []
    for word in text.split():
        # remove all punctuation except periods, then trim outer periods
        new_word = _PUNCTUATION_RE.sub(r' ', word).strip(period)

        if not _is_number(new_word):
            new_word = new_word.replace(period, ' ')

        cleaned.append(new_word)

    return ' '.join(cleaned)

In [None]:
# remove all non-alphabetic chars (punctuation, numbers, emojies ...)
def remove_non_alphabetic(text):
    """Replace each character outside ``a-z`` / ``A-Z`` with a single space."""
    return re.sub(r'[^a-zA-Z]', r' ', text)

In [None]:
def remove_quotes(text):
    """Replace each double-quoted span (backslash escapes allowed) with a space."""
    quoted_span = r'"([^"\\]|\\.)*"'
    return re.sub(quoted_span, r' ', text)

In [None]:
def remove_spoiler_alert(text):
    """Replace every ``(view spoiler)...(hide spoiler)`` section with a space.

    The match is non-greedy, so several spoiler sections in one review are
    removed independently. ``re.DOTALL`` lets the section span line breaks;
    without it a spoiler containing a newline was silently left in the text.

    Args:
        text: review text.

    Returns:
        ``text`` with each spoiler section replaced by a single space.
    """
    spoiler = re.compile(r'(\(view spoiler\).*?\(hide spoiler\))', re.DOTALL)
    return spoiler.sub(r' ', text)

In [None]:
def remove_small_words(text):
    """Keep only whitespace-separated words longer than two characters."""
    long_enough = filter(lambda word: len(word) > 2, text.split())
    return ' '.join(long_enough)

In [None]:
# After applying the usual text preprocessing steps (listed below)
# I found out that many reviews became empty! (because not all of them are meaningful 🤷)
# So almost all of these steps are disabled to avoid empty reviews in both train and test sets.

def process_text(text):
    """Run the active cleaning steps on one review and return the result.

    Active steps, in order: to_lower, remove_punctuation, remove_stopwords.

    Disabled steps, with their original position in the pipeline:
      * remove_url            - after to_lower
      * remove_spoiler_alert  - after remove_punctuation
      * remove_quotes         - after remove_spoiler_alert
      * remove_non_alphabetic - after remove_quotes
      * remove_small_words    - after remove_stopwords
    """
    for step in (to_lower, remove_punctuation, remove_stopwords):
        text = step(text)
    return text

In [None]:
# Clean every review with process_text. Note that this overwrites common_df,
# so re-running the cell processes the already-cleaned text again.
common_df = common_df.apply(process_text)

In [None]:
# check min number of words
# Smallest word count over all reviews.
common_df.map(lambda review: len(review.split())).min()

In [None]:
import pickle

# Cache the cleaned reviews in the working directory.
with open('common_df.pkl', 'wb') as f:
    pickle.dump(common_df, f)

In [None]:
# Reload the cached reviews from common_df.pkl.
# NOTE(review): pickle.load executes arbitrary code from the file - only use
# it on a pickle produced by this notebook, never on an untrusted one.
with open('common_df.pkl', 'rb') as f:
    common_df = pickle.load(f)

In [None]:
# Spot-check one cleaned review.
common_df[12]

## 5. TPU setup

In [None]:
import tensorflow as tf

# TPU detection is switched off: `tpu` is hard-wired to None, so the TPU
# branch below never runs. The original detection code is kept for reference.
# try:
#    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
# except ValueError:
tpu = None
gpus = tf.config.experimental.list_logical_devices("GPU")
gpu_names = [device.name for device in gpus]

# Pick a distribution strategy that matches the available hardware.
if tpu:
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU')
elif len(gpu_names) > 1:
    strategy = tf.distribute.MirroredStrategy(gpu_names)
    print('Running on multiple GPUs ', gpu_names)
else:
    # Zero or one GPU: fall back to the default strategy.
    strategy = tf.distribute.get_strategy()
    if len(gpu_names) == 1:
        print('Running on single GPU ', gpu_names[0])
    else:
        print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

## 6. XLM-RoBERTa model

In [None]:
# The first len(y) rows of common_df are the training reviews, the rest are test.
n_train = len(y)
X_train, X_test = common_df.iloc[:n_train], common_df.iloc[n_train:]

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the ratings for use with categorical cross-entropy.
y_cat = to_categorical(y)
# Display the encoded shape; the last dimension is used later as the number of classes.
y_cat.shape

In [None]:
from transformers import TFAutoModel, AutoTokenizer

MAX_LEN = 100 # every review is padded / truncated to 100 tokens before prediction
model_name = 'xlm-roberta-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)


def _encode(texts):
    """Tokenize a Series of reviews into fixed-length NumPy arrays."""
    return tokenizer(texts.to_list(),
                     max_length = MAX_LEN,
                     padding = 'max_length',
                     truncation = True,
                     return_tensors = 'np')


# Train and test reviews are tokenized with identical settings.
train_inputs = _encode(X_train)
test_inputs = _encode(X_test)

In [None]:
# Cache both tokenized input sets in a single pickle file (inputs.pkl).
# Earlier attempt, kept for reference:
# np.savez('train_test_inputs.npz', train_inputs=train_inputs, test_inputs=test_inputs)
with open('inputs.pkl', 'wb') as file:
    pickle.dump({'train_inputs': train_inputs, 'test_inputs': test_inputs}, file)

In [None]:
# fine-tune the model
# (The line above was bare prose inside a code cell, which is a SyntaxError.)
# NOTE: the `model` built here is overwritten by the custom-head model that the
# following cell assigns to the same name.
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels = y_cat.shape[-1])
    # The Hugging Face classification head returns raw logits (no softmax), so
    # the loss must be told so; the plain 'categorical_crossentropy' string
    # would treat the logits as probabilities.
    model.compile(Adam(learning_rate = 5e-5),
                  loss = tf.keras.losses.CategoricalCrossentropy(from_logits = True),
                  metrics=['accuracy'], steps_per_execution = 200)

model.summary()

In [None]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

# Build the classifier: pretrained encoder followed by a small dense head
# (dropout -> 128 -> dropout -> 32 -> dropout -> softmax over the rating classes).
with strategy.scope():
    encoder = TFAutoModel.from_pretrained(model_name)

    input_word_ids = Input(shape = (MAX_LEN, ), dtype = tf.int32, name = "input_ids")
    input_mask = Input(shape = (MAX_LEN, ), dtype = tf.int32, name = "attention_mask")

    # Element 1 of the encoder output is the pooled_output.
    pooled = encoder([input_word_ids, input_mask])[1]

    hidden = Dropout(0.3)(pooled)
    for units in (128, 32):
        hidden = Dense(units, activation = 'relu', kernel_regularizer = regularizers.L2(0.1))(hidden)
        hidden = Dropout(0.3)(hidden)
    class_probs = Dense(y_cat.shape[1], activation = 'softmax')(hidden)

    model = Model(inputs = [input_word_ids, input_mask],
                  outputs = class_probs)

    model.compile(Adam(learning_rate = 1e-5),
                  loss = 'categorical_crossentropy',
                  metrics=['accuracy'], steps_per_execution = 200)

model.summary()

In [None]:
# callbacks
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks watch the validation loss (spelled out here; it is also the Keras default).
early_stop = EarlyStopping(monitor = 'val_loss',
                           patience = 5,
                           restore_best_weights = True,
                           verbose = 1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss',
                              mode = 'min',
                              factor = 0.1,
                              patience = 2,
                              verbose = 1)

In [None]:
# Fine-tune the model.
#
# The tokenizer output is wrapped in dict(...) because of
# https://github.com/huggingface/transformers/issues/20709 :
# Keras recognizes plain dict objects but not the BatchEncoding returned by
# the tokenizer, even though BatchEncoding is a subclass of dict.

# Batch size scales with the number of replicas in the distribution strategy.
global_batch_size = 64 * strategy.num_replicas_in_sync

history = model.fit(
    dict(train_inputs),
    y_cat,
    batch_size = global_batch_size,
    epochs = 50,
    validation_split = 0.1,
    callbacks = [reduce_lr, early_stop],
    verbose = 1,
)

In [None]:
# Per-epoch metrics recorded by model.fit.
loss = history.history['loss']
acc = history.history['accuracy']
val_loss = history.history['val_loss']
val_acc = history.history['val_accuracy']

epochs = range(1, len(loss) + 1)

# Explicit Axes interface: one panel for accuracy, one for loss, each with a
# title and axis labels so the figure can be read on its own.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 5))

# accuracy
ax_acc.plot(epochs, acc, 'bo', label = 'Training accuracy')
ax_acc.plot(epochs, val_acc, 'r', label = 'Validation accuracy')
ax_acc.set(title = 'Accuracy per epoch', xlabel = 'Epoch', ylabel = 'Accuracy')
ax_acc.legend()

# loss (legend label typo "Trainig" fixed)
ax_loss.plot(epochs, loss, 'bo', label = 'Training loss')
ax_loss.plot(epochs, val_loss, 'r', label = 'Validation loss')
ax_loss.set(title = 'Loss per epoch', xlabel = 'Epoch', ylabel = 'Loss')
ax_loss.legend()

plt.show()

## 7. Submission

In [None]:
# Build the submission: one predicted rating per test review id.
sub = pd.DataFrame()
sub['review_id'] = test_df_ids
# Predicted rating = index of the highest class probability, taken for all
# rows at once instead of a Python-level loop over the predictions.
sub['rating'] = np.argmax(model.predict(dict(test_inputs)), axis = 1)
sub.to_csv("submission2.csv", index = False)
# Preview goes last so the notebook actually displays it.
sub.head()