# Описание

В этом ноутбуке загружаются две модели RoBERTa: одна модель обучена на отзывах на отели, другая модель обучена на отзывах на фильмы. Модели загружаются из Google Drive. Сами модели были обучены и сохранены в Google Drive в следующих ноутбуках:

roberta_applied_to_movies_21_december.ipynb

roberta_applied_to_hotels_21_december.ipynb

**В конце ноутбука приведен dataframe с результатами тестирования каждой из моделей на каждом из датасетов (один датасет с отзывами на отели, другой датасет – с отзывами на фильмы).**

In [None]:
!pip install tensorflow-text
!pip install transformers
import tensorflow_text as text

import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from keras.models import load_model
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

from transformers import RobertaTokenizerFast

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# String identifiers for the two datasets and the two fine-tuned models.
# They are used as dictionary keys and as labels in the results table.
hotel_reviews, movie_reviews = 'hotel_reviews', 'movie_reviews'
(
    roberta_trained_on_movie_reviews,
    roberta_trained_on_hotel_reviews,
) = (
    'roberta_trained_on_movie_reviews',
    'roberta_trained_on_hotel_reviews',
)

In [None]:
# Load both saved models from Google Drive, keyed by their model identifier.
# The movie-review model is loaded first, then the hotel-review model.
models_dict = {
    roberta_trained_on_movie_reviews: load_model(
        '/content/drive/MyDrive/data_for_colab/roberta_trained_on_imdb_21_december'
    ),
    roberta_trained_on_hotel_reviews: load_model(
        '/content/drive/MyDrive/data_for_colab/roberta_trained_on_hotel_reviews_21_december'
    ),
}

In [None]:
# Locations of the raw review CSV files on the mounted Google Drive.
path_to_movie_reviews_dataset, path_to_hotel_reviews_dataset = (
    '/content/drive/MyDrive/data_for_colab/IMDB Dataset.csv',
    '/content/drive/MyDrive/data_for_colab/tripadvisor_hotel_reviews.csv',
)

Redo the text preprocessing: set up the tokenizer and the maximum sequence length

In [None]:
# Sequence length passed as `max_length` when the reviews are tokenized.
MAX_LEN = 128

# Tokenizer for the "roberta-base" checkpoint, shared by both models under test.
tokenizer_roberta = RobertaTokenizerFast.from_pretrained(
    "roberta-base"
)

In [None]:
def test_model_on_dataset_and_write_results_to_dataframe(model_name=None, dataset_name=None, dataframe_to_write_answer=None):

    global hotel_reviews, movie_reviews
    global bert_trained_on_movie_reviews
    global bert_trained_on_hotel_reviews
    global models_dict
    global path_to_hotel_reviews_dataset
    global path_to_movie_reviews_dataset
    global MAX_LEN, tokenizer_roberta

    

    def tokenize_roberta(data,max_len=MAX_LEN) :
        input_ids = []
        attention_masks = []
        for i in range(len(data)):
            encoded = tokenizer_roberta.encode_plus(
                data[i],
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                return_attention_mask=True
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
        return np.array(input_ids),np.array(attention_masks)


    if model_name is None or dataset_name is None or dataframe_to_write_answer is None:
        raise ValueError("Wrong arguments passed to function: there are none arguments!")
    if model_name != roberta_trained_on_movie_reviews and model_name != roberta_trained_on_hotel_reviews:
        raise ValueError("Wrong model_name!")
    if dataset_name != movie_reviews and dataset_name != hotel_reviews:
        raise ValueError("Wrong dataset_name!")

    model = models_dict[model_name] # загрузили пользователем заданную модель 
    # (она уже обучена на определенном датасете)

    if dataset_name == movie_reviews: # if you edit this line, edit the next line too!
        df = pd.read_csv(path_to_movie_reviews_dataset)
        df['is_positive'] = (df['sentiment'] == 'positive').astype(int)
        df.drop(columns=['sentiment'], inplace=True)
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        parameter_const = 2.2 # подобрали
        df['DATA_COLUMN'] = df['DATA_COLUMN'].apply(lambda x: x[:round(MAX_LEN * parameter_const)])
        df_positive = df[df['LABEL_COLUMN']==1]
        df_negative = df[df['LABEL_COLUMN']==0]
        # Для тестовой выборки берем последние 10% негативных отзывов и последние 10% позитивных отзывов
        n_test = df_negative.shape[0] // 10 # в оригинале df_negative.shape[0] // 10
        df_negative_test = df_negative.tail(n_test)
        n_test = df_positive.shape[0] // 10 # в оригинале df_positive.shape[0] // 10
        df_positive_test = df_positive.tail(n_test)
        df_balanced_test = pd.concat([df_negative_test, df_positive_test])

    elif dataset_name == hotel_reviews: # if you edit this line, edit the next line too!
        df = pd.read_csv(path_to_hotel_reviews_dataset)
        df = df[df.Rating != 3]
        df['is_positive'] = (df['Rating'] >= 4).astype(int)
        df.drop(columns=['Rating'], inplace=True)
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        parameter_const = 3 # подобрали
        df['DATA_COLUMN'] = df['DATA_COLUMN'].apply(lambda x: x[:round(MAX_LEN * parameter_const)])
        df_positive = df[df['LABEL_COLUMN']==1]
        df_negative = df[df['LABEL_COLUMN']==0]
        # Тестовая выборка
        n_test = (df_negative.shape[0] // 4) * 3 #  в оригинале (df_negative.shape[0] // 4) * 3
        df_negative_test = df_negative.tail(n_test)
        n_test = (df_positive.shape[0] // 20) * 3 #  в оригинале  (df_positive.shape[0] // 20) * 3
        df_positive_test = df_positive.tail(n_test)
        df_balanced_test = pd.concat([df_negative_test, df_positive_test])

    X_test = df_balanced_test['DATA_COLUMN'].squeeze().values
    y_test = df_balanced_test['LABEL_COLUMN'].squeeze().values
    test_input_ids, test_attention_masks = tokenize_roberta(X_test, MAX_LEN)

    y_predicted = model.predict([test_input_ids,test_attention_masks])
    y_predicted = np.where(y_predicted > 0.5, 1, 0)

    row_name_in_dataframe_to_write_answer = model_name + ' tested on ' + dataset_name + ' dataset'

    dataframe_to_write_answer.loc[row_name_in_dataframe_to_write_answer, 'accuracy'] = accuracy_score(y_test, y_predicted)
    dataframe_to_write_answer.loc[row_name_in_dataframe_to_write_answer, 'precision'] = precision_score(y_test, y_predicted)
    dataframe_to_write_answer.loc[row_name_in_dataframe_to_write_answer, 'recall'] = recall_score(y_test, y_predicted)
    dataframe_to_write_answer.loc[row_name_in_dataframe_to_write_answer, 'f1_score'] =  f1_score(y_test, y_predicted)
    print(model_name, ' on dataset', dataset_name, 'result', accuracy_score(y_test, y_predicted), precision_score(y_test, y_predicted), recall_score(y_test, y_predicted), f1_score(y_test, y_predicted) )

In [None]:
# Empty results table; one row per (model, dataset) evaluation is added later.
answer_dataframe_roberta = pd.DataFrame(
    columns=[
        'accuracy',
        'precision',
        'recall',
        'f1_score',
    ],
)

In [None]:
# Evaluate every model on every dataset, hotel-trained model first,
# and within each model the hotel dataset before the movie dataset.
model_dataset_pairs = [
    (model_key, dataset_key)
    for model_key in [roberta_trained_on_hotel_reviews, roberta_trained_on_movie_reviews]
    for dataset_key in [hotel_reviews, movie_reviews]
]
for cur_model_name, cur_dataset_name in model_dataset_pairs:
    test_model_on_dataset_and_write_results_to_dataframe(
        cur_model_name,
        cur_dataset_name,
        answer_dataframe_roberta,
    )

roberta_trained_on_hotel_reviews  on dataset hotel_reviews result 0.9257118390066367 0.9318899413622012 0.9133510167992926 0.9225273498548784
roberta_trained_on_hotel_reviews  on dataset movie_reviews result 0.6938 0.8794048551292091 0.4492 0.5946518400847233
roberta_trained_on_movie_reviews  on dataset hotel_reviews result 0.7096981374438022 0.626183844011142 0.9938107869142352 0.7682843472317157
roberta_trained_on_movie_reviews  on dataset movie_reviews result 0.8378 0.8376649340263894 0.838 0.8378324335132973


In [None]:
# Display the collected metrics table: one row per (model, dataset) combination.
answer_dataframe_roberta

Unnamed: 0,accuracy,precision,recall,f1_score
roberta_trained_on_hotel_reviews tested on hotel_reviews dataset,0.925712,0.93189,0.913351,0.922527
roberta_trained_on_hotel_reviews tested on movie_reviews dataset,0.6938,0.879405,0.4492,0.594652
roberta_trained_on_movie_reviews tested on hotel_reviews dataset,0.709698,0.626184,0.993811,0.768284
roberta_trained_on_movie_reviews tested on movie_reviews dataset,0.8378,0.837665,0.838,0.837832
