# В этом ноутбуке представлена вторая гипотеза для оценки сходства датасетов, а также реализован метод сравнения датасетов в соответствии со второй гипотезой.

**Вторая гипотеза** для метода оценки сходства датасетов:

_Сходство датасетов может быть связано со схожестью их текстового содержания, которое отражается в близких средних значениях эмбеддингов._

-------------------

Поэтому **метод сравнения датасетов** следующий:

Сделать векторные представления каждого текста из датасета (например, с помощью BERT) и усреднить их. Для разных датасетов сравнить полученные усреднённые эмбеддинги, например, с помощью cosine_similarity. Чем больше полученное значение, тем более похожими являются усреднённые эмбеддинги и, по нашей гипотезе, тем больше сходство сравниваемых датасетов.

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
# Mount Google Drive: every CSV path used in this notebook starts with /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
need_to_conduct_research = False  # If True, the time-consuming study is run and its results are saved to Google Drive.
# If False, the previously saved results of the study are loaded from Google Drive (MyDrive).
# False is the recommended setting.

In [3]:
# The embedding model and its dependencies are only set up when the study is actually re-run;
# with need_to_conduct_research == False neither `cosine_similarity` nor `model` is defined.
if need_to_conduct_research:
    from sklearn.metrics.pairwise import cosine_similarity
    # IPython shell magic: installs simpletransformers into the notebook runtime.
    !pip install simpletransformers
    from simpletransformers.language_representation import RepresentationModel

    # `model` is the module-level encoder that get_vector_representations() calls.
    # use_cuda=False requests CPU execution.
    model = RepresentationModel(
        model_type="bert",
        model_name="bert-base-uncased",
        use_cuda=False
    )

In [4]:
# Human-readable dataset identifiers. They double as row/column labels of the
# similarity table and as fragments of the index labels in the experiment CSVs.
hotel_reviews, movie_reviews = 'hotel reviews', 'movie reviews'
spam_sms, spam_emails = 'spam sms', 'spam emails'

# Canonical ordering of the datasets used throughout the notebook.
datasets_names = [
    hotel_reviews,
    movie_reviews,
    spam_sms,
    spam_emails,
]

In [5]:
def _head_tail_split(df_class, train_divisor, test_divisor, test_multiplier=1):
    """Split the rows of one class into a leading train part and a trailing test part.

    The train part is the first ``n // train_divisor`` rows and the test part is
    the last ``(n // test_divisor) * test_multiplier`` rows, where ``n`` is the
    number of rows in ``df_class``.

    Returns:
        A ``(train_part, test_part)`` tuple of DataFrames.
    """
    n_rows = df_class.shape[0]
    train_part = df_class.head(n_rows // train_divisor)
    test_part = df_class.tail((n_rows // test_divisor) * test_multiplier)
    return train_part, test_part


def get_dataset_in_correct_form(dataset_name, max_items=100):
    """Load a dataset from Google Drive and return its texts as a list of strings.

    The texts are ordered as: negative train, positive train, negative test,
    positive test. Only the first ``max_items`` texts of that sequence are
    returned, because encoding the complete datasets takes too long.

    Args:
        dataset_name: one of ``hotel_reviews``, ``movie_reviews``, ``spam_sms``
            or ``spam_emails``.
        max_items: maximum number of texts to return; ``None`` returns all of
            them. The default of 100 matches the value used for the study.

    Returns:
        A list with the values of the text column.

    Raises:
        ValueError: if ``dataset_name`` is not one of the known datasets.
    """
    # Each branch prepares `df` (with a 'DATA_COLUMN' text column), the name of
    # the binary label column, and the split rule for each class as
    # (train_divisor, test_divisor, test_multiplier).
    if dataset_name == spam_sms:
        df = pd.read_csv('/content/drive/MyDrive/data_for_colab/spam_sms.csv', encoding="ISO-8859-1")
        df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
        df.columns = ['IS_SPAM', 'DATA_COLUMN']
        df['IS_SPAM'] = (df['IS_SPAM'] == 'spam').astype(int)
        label_column = 'IS_SPAM'
        # First half of each class for training, last half for testing.
        negative_rule = positive_rule = (2, 2, 1)

    elif dataset_name == spam_emails:
        df = pd.read_csv('/content/drive/MyDrive/data_for_colab/spam_emails.csv', encoding="ISO-8859-1")
        df = df.drop(columns=['Unnamed: 0', 'label'])
        df.columns = ['DATA_COLUMN', 'IS_SPAM']
        # Flatten multi-line e-mails into a single line of text.
        df['DATA_COLUMN'] = df['DATA_COLUMN'].apply(lambda x: x.replace('\r\n', ' ').replace('\n', ' '))
        label_column = 'IS_SPAM'
        # First half of each class for training, last half for testing.
        negative_rule = positive_rule = (2, 2, 1)

    elif dataset_name == hotel_reviews:
        df = pd.read_csv('/content/drive/MyDrive/data_for_colab/tripadvisor_hotel_reviews.csv')
        # Reviews rated 3 are excluded; ratings >= 4 are labelled positive.
        # .copy() makes the column assignment below act on an independent
        # frame instead of a slice of the original one.
        df = df[df.Rating != 3].copy()
        df['is_positive'] = (df['Rating'] >= 4).astype(int)
        df = df.drop(columns=['Rating'])
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        label_column = 'LABEL_COLUMN'
        # Negative: first quarter for training, last three quarters for testing.
        negative_rule = (4, 4, 3)
        # Positive: first 1/20 for training, last 3/20 for testing.
        positive_rule = (20, 20, 3)

    elif dataset_name == movie_reviews:
        df = pd.read_csv('/content/drive/MyDrive/data_for_colab/IMDB Dataset.csv')
        df['is_positive'] = (df['sentiment'] == 'positive').astype(int)
        df = df.drop(columns=['sentiment'])
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        label_column = 'LABEL_COLUMN'
        # First 2.5% of each class for training, last 10% for testing.
        negative_rule = positive_rule = (40, 10, 1)

    else:
        raise ValueError('Wrong dataset name')

    negative_train, negative_test = _head_tail_split(df[df[label_column] == 0], *negative_rule)
    positive_train, positive_test = _head_tail_split(df[df[label_column] == 1], *positive_rule)

    texts = pd.concat([negative_train, positive_train, negative_test, positive_test])['DATA_COLUMN']
    # Truncate before building the list so that the full dataset is never
    # materialised as Python objects when only a prefix is needed.
    return list(texts.iloc[:max_items])

In [6]:
def get_vector_representations(dataset_in_correct_form):
    """Encode the given texts with the module-level ``model``.

    Args:
        dataset_in_correct_form: the texts to encode, e.g. the list returned
            by ``get_dataset_in_correct_form``.

    Returns:
        Whatever ``model.encode_sentences`` returns for ``combine_strategy="mean"``
        (presumably one vector per text, averaged over its tokens - confirm
        against the simpletransformers documentation).
    """
    return model.encode_sentences(dataset_in_correct_form, combine_strategy="mean")

In [7]:
# cosine_similarity(np.mean(get_vector_representations(get_dataset_in_correct_form(spam_emails)), axis = 0).reshape(1, -1), np.mean(get_vector_representations(get_dataset_in_correct_form(spam_sms)), axis = 0).reshape(1, -1))[0][0]

На 1/16 часть исследования (сравнение одной пары датасетов) требуется около 2 минут, если брать первые 100 записей из датасета. Если брать датасеты целиком, то на 1/16 часть исследования нужно более получаса, поэтому выполнение такого трудоёмкого кода было досрочно прервано.

In [8]:
# Build (or load) the table of pairwise dataset similarities: the cosine
# similarity between the mean embeddings of two datasets.
if need_to_conduct_research:
    # Loading and encoding a dataset is by far the most expensive step and does
    # not depend on the dataset it is compared with, so every dataset is
    # embedded exactly once instead of twice per pair.
    # NOTE(review): assumes model.encode_sentences is deterministic, so reusing
    # the embeddings yields the same similarities as re-encoding would.
    mean_embedding_by_dataset = {}
    for _i, cur_dataset in enumerate(datasets_names, start=1):
        print(_i, 'out of', len(datasets_names), ':', "Please, be patient! Working on embedding", cur_dataset)
        cur_vectors = get_vector_representations(get_dataset_in_correct_form(cur_dataset))
        mean_embedding_by_dataset[cur_dataset] = np.mean(cur_vectors, axis=0).reshape(1, -1)

    df_with_second_method_results = pd.DataFrame(columns=datasets_names)
    for cur_first_dataset in datasets_names:
        for cur_second_dataset in datasets_names:
            df_with_second_method_results.loc[cur_first_dataset, cur_second_dataset] = cosine_similarity(
                mean_embedding_by_dataset[cur_first_dataset],
                mean_embedding_by_dataset[cur_second_dataset],
            )[0][0]
    df_with_second_method_results.to_csv('/content/drive/MyDrive/data_for_colab/dataframes/second_approach/df_with_second_method_results.csv')
else:
    # Load the cached table; the 'Unnamed: 0' column holds the row labels.
    df_with_second_method_results = pd.read_csv(
        '/content/drive/MyDrive/data_for_colab/dataframes/second_approach/df_with_second_method_results.csv'
    ).set_index('Unnamed: 0')
    df_with_second_method_results.index.name = None

In [9]:
# Display the pairwise similarity table (rows and columns are dataset names).
df_with_second_method_results

Unnamed: 0,hotel reviews,movie reviews,spam sms,spam emails
hotel reviews,1.0,0.767404,0.592511,0.782914
movie reviews,0.767404,1.0,0.729809,0.862794
spam sms,0.592511,0.729809,1.0,0.720972
spam emails,0.782914,0.862794,0.720972,1.0


In [10]:
# Display the same pairwise similarity table again before the analysis below.
df_with_second_method_results

Unnamed: 0,hotel reviews,movie reviews,spam sms,spam emails
hotel reviews,1.0,0.767404,0.592511,0.782914
movie reviews,0.767404,1.0,0.729809,0.862794
spam sms,0.592511,0.729809,1.0,0.720972
spam emails,0.782914,0.862794,0.720972,1.0


# Исследуем эффективность метода для сходства датасетов в первом смысле

**Случай 1: рассматриваются все возможные пары датасетов, в том числе пары с совпадающими датасетами.**

In [11]:
# Lower-case names of the evaluated models; each name is the leading part of
# the row labels looked up in the experiment result tables.
model_names_lower_case_list = [
    'albert',
    'bert',
    'distilbert',
    'electra',
    'labse',
    'lambert',
    'mobilebert',
    'roberta',
    'talkheads_ggelu_bert',
    'tn_bert',
]

def get_single_f1_score_value_from_df_from_train_quality_folder(df_name: str):
    """Return the first ``f1_score`` value stored in a CSV of the ``train_quality`` folder.

    Args:
        df_name: file name without the ``.csv`` extension.
    """
    csv_path = '/content/drive/MyDrive/data_for_colab/dataframes/train_quality/' + df_name + '.csv'
    f1_values = pd.read_csv(csv_path)['f1_score'].values
    return f1_values[0]

def get_df_from_first_series_of_experiments(df_name: str):
    """Load a result table of the first series of experiments.

    Args:
        df_name: file name without the ``.csv`` extension.

    Returns:
        The table indexed by the values of its ``'Unnamed: 0'`` column, with
        the index name cleared.
    """
    csv_path = '/content/drive/MyDrive/data_for_colab/dataframes/first_series_of_experiments/' + df_name + '.csv'
    loaded_df = pd.read_csv(csv_path).set_index('Unnamed: 0')
    loaded_df.index.name = None
    return loaded_df

def get_df_from_second_series_of_experiments(df_name: str):
    """Load a result table of the second series of experiments.

    Args:
        df_name: file name without the ``.csv`` extension.

    Returns:
        The table indexed by the values of its ``'Unnamed: 0'`` column, with
        the index name cleared.
    """
    csv_path = '/content/drive/MyDrive/data_for_colab/dataframes/second_series_of_experiments/' + df_name + '.csv'
    loaded_df = pd.read_csv(csv_path).set_index('Unnamed: 0')
    loaded_df.index.name = None
    return loaded_df

def get_f1_score_values_from_df_from_first_series_of_experiments(df_name: str):
    """Return the ``f1_score`` column of a first-series table as a float array.

    Entries equal to a single space ``' '`` are skipped; every other entry is
    converted with ``float``.

    Args:
        df_name: file name without the ``.csv`` extension.

    Returns:
        A one-dimensional float ``numpy`` array (empty if nothing is left).
    """
    cur_df = pd.read_csv('/content/drive/MyDrive/data_for_colab/dataframes/first_series_of_experiments/' + df_name + '.csv')
    cur_df.set_index('Unnamed: 0', inplace=True)
    cur_df.index.names = [None]
    # Build the array in one step: np.append copies the whole array on every
    # call, which makes an append-in-a-loop quadratic in the number of rows.
    return np.array(
        [float(el) for el in cur_df.f1_score.values if el != ' '],
        dtype=float,
    )


In [12]:
# Case 1: correlate the dataset similarity (second method) with the change in
# F1 score observed when a model is trained on one dataset and tested on
# another. One data point is produced per ordered pair of datasets.
first_series_of_experiments_datasets = [hotel_reviews, movie_reviews]
second_series_of_experiments_datasets = [spam_sms, spam_emails]

# True keeps the pairs in which the train and the test dataset coincide
# (case 1); set it to False to keep only pairs of different datasets.
include_pairs_of_identical_datasets = True

list_of_mean_f1_score_absolute_difference_values = list()
list_of_median_f1_score_absolute_difference_values = list()
list_of_mean_f1_score_relative_difference_values = list()
list_of_median_f1_score_relative_difference_values = list()
list_of_method_results = list()

# --- Both datasets come from the same series of experiments -----------------
# The two series differ only in the loader and in the wording of the row labels
# (' and tested on ' in the first series, ' tested on ' in the second one).
within_series_descriptions = [
    (first_series_of_experiments_datasets, get_df_from_first_series_of_experiments, ' and tested on '),
    (second_series_of_experiments_datasets, get_df_from_second_series_of_experiments, ' tested on '),
]
for series_datasets, load_series_df, tested_on in within_series_descriptions:
    # The tables do not depend on the pair or on the model: read each once.
    absolute_difference_df = load_series_df('df_with_absolute_quality_difference')
    relative_difference_df = load_series_df('df_with_relative_quality_difference_in_percent')

    for first_dataset in series_datasets:
        for second_dataset in series_datasets:
            if first_dataset == second_dataset and not include_pairs_of_identical_datasets:
                continue

            # Quality differences of every model for this fixed pair of datasets.
            label_prefixes = [
                model_name + ' trained on ' + first_dataset + tested_on + second_dataset
                for model_name in model_names_lower_case_list
            ]
            absolute_differences = [
                float(absolute_difference_df.loc[prefix + ' dataset quality difference', 'f1_score'])
                for prefix in label_prefixes
            ]
            relative_differences = [
                float(relative_difference_df.loc[prefix + ' dataset difference in percent', 'f1_score'])
                for prefix in label_prefixes
            ]

            list_of_mean_f1_score_absolute_difference_values.append(np.average(absolute_differences))
            list_of_median_f1_score_absolute_difference_values.append(np.median(absolute_differences))
            list_of_mean_f1_score_relative_difference_values.append(np.average(relative_differences))
            list_of_median_f1_score_relative_difference_values.append(np.median(relative_differences))
            list_of_method_results.append(df_with_second_method_results.loc[first_dataset, second_dataset])

# --- The datasets come from different series (different tasks) --------------
# The first series classifies a review as positive or negative, the second
# series classifies content as spam / not spam. The quality is assumed to drop
# to zero on the other task, so the absolute difference is minus the F1 score
# on the training dataset and the relative difference is -100 percent.
first_series_train_quality_df = get_df_from_first_series_of_experiments('df_with_train_quality_all_models')
cross_series_descriptions = [
    # Part 1: trained on a first-series dataset, whose train quality is stored
    # in a single table.
    (
        first_series_of_experiments_datasets,
        second_series_of_experiments_datasets,
        lambda row_label: float(first_series_train_quality_df.loc[row_label, 'f1_score']),
    ),
    # Part 2: trained on a second-series dataset, whose train quality is stored
    # in a separate folder, one file per model and dataset.
    (
        second_series_of_experiments_datasets,
        first_series_of_experiments_datasets,
        get_single_f1_score_value_from_df_from_train_quality_folder,
    ),
]
for train_datasets, test_datasets, get_train_f1_score in cross_series_descriptions:
    for first_dataset in train_datasets:
        # The train quality does not depend on the test dataset: compute it once.
        absolute_differences = [
            -1 * get_train_f1_score(model_name + ' trained on ' + first_dataset + ' quality on train dataset')
            for model_name in model_names_lower_case_list
        ]
        mean_absolute_difference = np.average(absolute_differences)
        median_absolute_difference = np.median(absolute_differences)

        for second_dataset in test_datasets:
            list_of_mean_f1_score_absolute_difference_values.append(mean_absolute_difference)
            list_of_median_f1_score_absolute_difference_values.append(median_absolute_difference)
            list_of_mean_f1_score_relative_difference_values.append(-100)
            list_of_median_f1_score_relative_difference_values.append(-100)
            list_of_method_results.append(df_with_second_method_results.loc[first_dataset, second_dataset])

# --- Pearson correlation between the similarity and each quality measure ----
correlation_for_mean_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_absolute_difference_values
)[0, 1]
correlation_for_median_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_absolute_difference_values
)[0, 1]
correlation_for_mean_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_relative_difference_values
)[0, 1]
correlation_for_median_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_relative_difference_values
)[0, 1]
                

In [13]:
# Collect the four case-1 correlations into a one-column table whose row
# labels are the names of the corresponding variables.
df_first_sense_first_case_results = pd.DataFrame({'value': []})

_first_case_correlations = {
    'correlation_for_mean_absolute_quality_difference_and_second_method_result':
        correlation_for_mean_absolute_quality_difference_and_second_method_result,
    'correlation_for_median_absolute_quality_difference_and_second_method_result':
        correlation_for_median_absolute_quality_difference_and_second_method_result,
    'correlation_for_mean_relative_quality_difference_and_second_method_result':
        correlation_for_mean_relative_quality_difference_and_second_method_result,
    'correlation_for_median_relative_quality_difference_and_second_method_result':
        correlation_for_median_relative_quality_difference_and_second_method_result,
}
# Rows are added one at a time, in this order, by label-based enlargement.
for _row_label, _correlation_value in _first_case_correlations.items():
    df_first_sense_first_case_results.loc[_row_label] = _correlation_value

In [14]:
# Display the case-1 correlations between dataset similarity and quality change.
df_first_sense_first_case_results

Unnamed: 0,value
correlation_for_mean_absolute_quality_difference_and_second_method_result,0.719517
correlation_for_median_absolute_quality_difference_and_second_method_result,0.700215
correlation_for_mean_relative_quality_difference_and_second_method_result,0.702173
correlation_for_median_relative_quality_difference_and_second_method_result,0.689387


-----

**Случай 2: рассматриваются все возможные пары _различных_ датасетов.**


In [15]:
first_series_of_experiments_datasets = [hotel_reviews, movie_reviews]
second_series_of_experiments_datasets = [spam_sms, spam_emails]

correlation_for_mean_absolute_quality_difference_and_second_method_result = -1
correlation_for_median_absolute_quality_difference_and_second_method_result = -1
correlation_for_mean_relative_quality_difference_and_second_method_result = -1
correlation_for_median_relative_quality_difference_and_second_method_result = -1


list_of_mean_f1_score_absolute_difference_values = list()
list_of_median_f1_score_absolute_difference_values = list()
list_of_mean_f1_score_relative_difference_values = list()
list_of_median_f1_score_relative_difference_values = list()
list_of_method_results = list()

# first series of experiments
for first_dataset in first_series_of_experiments_datasets:
    for second_dataset in first_series_of_experiments_datasets:
        # if True: # можно выбрать, использовать это условие или условие ниже. Для второй серии экспериментов должно быть выбрано то же условие, что и здесь
        if first_dataset != second_dataset: # можно выбрать, использовать это условие или условие выше
            # Получим список с абсолютным изменение качества для фиксированной пары датасетов,
            # итерируемся по всем моделям.
            list_of_absolute_differences_for_fixed_pair_of_datasets = list()
            list_of_relative_differences_for_fixed_pair_of_datasets = list()
            for model_name in model_names_lower_case_list:
                # absolute quality difference
                cur_df = get_df_from_first_series_of_experiments('df_with_absolute_quality_difference')
                index_name = model_name + ' trained on ' + first_dataset + ' and tested on ' + second_dataset + ' dataset quality difference'
                difference = float(cur_df.loc[index_name, 'f1_score'])
                list_of_absolute_differences_for_fixed_pair_of_datasets.append(difference)

                # relative quality difference
                cur_df = get_df_from_first_series_of_experiments('df_with_relative_quality_difference_in_percent')
                index_name = model_name + ' trained on ' + first_dataset + ' and tested on ' + second_dataset + ' dataset difference in percent'
                difference = float(cur_df.loc[index_name, 'f1_score'])
                list_of_relative_differences_for_fixed_pair_of_datasets.append(difference)

            # absolute quality difference: mean result
            mean_f1_score_value = np.average(list_of_absolute_differences_for_fixed_pair_of_datasets)
            list_of_mean_f1_score_absolute_difference_values.append(mean_f1_score_value)
            # absolute quality difference: median result
            median_f1_score_value = np.median(list_of_absolute_differences_for_fixed_pair_of_datasets)
            list_of_median_f1_score_absolute_difference_values.append(median_f1_score_value)

            # relative quality difference: mean result
            mean_f1_score_value = np.average(list_of_relative_differences_for_fixed_pair_of_datasets)
            list_of_mean_f1_score_relative_difference_values.append(mean_f1_score_value)
            # relative quality difference: median result
            median_f1_score_value = np.median(list_of_relative_differences_for_fixed_pair_of_datasets)
            list_of_median_f1_score_relative_difference_values.append(median_f1_score_value)

            method_result = df_with_second_method_results.loc[first_dataset, second_dataset]
            list_of_method_results.append(method_result)


# second series of experiments
for first_dataset in second_series_of_experiments_datasets:
    for second_dataset in second_series_of_experiments_datasets:
        # if True: # можно выбрать, использовать это условие или условие ниже. Для певрой серии экспериментов должно быть выбрано то же условие, что и здесь
        if first_dataset != second_dataset: # можно выбрать, использовать это условие или условие выше
            # Получим список с абсолютным изменение качества для фиксированной пары датасетов,
            # итерируемся по всем моделям.
            list_of_absolute_differences_for_fixed_pair_of_datasets = list()
            list_of_relative_differences_for_fixed_pair_of_datasets = list()
            for model_name in model_names_lower_case_list:
                # absolute quality difference
                cur_df = get_df_from_second_series_of_experiments('df_with_absolute_quality_difference')
                index_name = model_name + ' trained on ' + first_dataset + ' tested on ' + second_dataset + ' dataset quality difference'
                difference = float(cur_df.loc[index_name, 'f1_score'])
                list_of_absolute_differences_for_fixed_pair_of_datasets.append(difference)

                # relative quality difference
                cur_df = get_df_from_second_series_of_experiments('df_with_relative_quality_difference_in_percent')
                index_name = model_name + ' trained on ' + first_dataset + ' tested on ' + second_dataset + ' dataset difference in percent'
                difference = float(cur_df.loc[index_name, 'f1_score'])
                list_of_relative_differences_for_fixed_pair_of_datasets.append(difference)

            # absolute quality difference: mean result
            mean_f1_score_value = np.average(list_of_absolute_differences_for_fixed_pair_of_datasets)
            list_of_mean_f1_score_absolute_difference_values.append(mean_f1_score_value)
            # absolute quality difference: median result
            median_f1_score_value = np.median(list_of_absolute_differences_for_fixed_pair_of_datasets)
            list_of_median_f1_score_absolute_difference_values.append(median_f1_score_value)

            # relative quality difference: mean result
            mean_f1_score_value = np.average(list_of_relative_differences_for_fixed_pair_of_datasets)
            list_of_mean_f1_score_relative_difference_values.append(mean_f1_score_value)
            # relative quality difference: median result
            median_f1_score_value = np.median(list_of_relative_differences_for_fixed_pair_of_datasets)
            list_of_median_f1_score_relative_difference_values.append(median_f1_score_value)

            method_result = df_with_second_method_results.loc[first_dataset, second_dataset]
            list_of_method_results.append(method_result)

# Пусть теперь датасеты взяты из разных серий экспериментов, решаются разные задачи:
# В датасетах из первой серии экспериментов отзыв классифицируется как позитивный или негативный.
# В датасетах из второй серии экспериментов содержимое классифицируется как спам / не спам. 

# Пункт 1: Здесь первый датасет из первой серии, второй датасет – из второй
for first_dataset in first_series_of_experiments_datasets:
    for second_dataset in second_series_of_experiments_datasets:
        list_of_absolute_differences_for_fixed_pair_of_datasets = list()
        for model_name in model_names_lower_case_list:
            # На первом датасете (он из ПЕРВОЙ серии экспериментов) происходит обучение.
            # Для первой серии экспериментов качество на обучающей выброке сохранено
            cur_df = get_df_from_first_series_of_experiments('df_with_train_quality_all_models')
            index_name = model_name + ' trained on ' + first_dataset + ' quality on train dataset'
            # Считаем, что качество падает до нуля
            difference = -1 * float(cur_df.loc[index_name, 'f1_score'])
            list_of_absolute_differences_for_fixed_pair_of_datasets.append(difference)

        # absolute quality difference: mean result
        mean_f1_score_value = np.average(list_of_absolute_differences_for_fixed_pair_of_datasets)
        list_of_mean_f1_score_absolute_difference_values.append(mean_f1_score_value)
        # absolute quality difference: median result
        median_f1_score_value = np.median(list_of_absolute_differences_for_fixed_pair_of_datasets)
        list_of_median_f1_score_absolute_difference_values.append(median_f1_score_value)

        # relative quality difference: mean result
        list_of_mean_f1_score_relative_difference_values.append(-100)
        # relative quality difference: median result
        list_of_median_f1_score_relative_difference_values.append(-100)
        method_result = df_with_second_method_results.loc[first_dataset, second_dataset]
        list_of_method_results.append(method_result)

# Пункт 2: Здесь первый датасет из второй серии, второй датасет – из первой серии
for first_dataset in second_series_of_experiments_datasets:
    for second_dataset in first_series_of_experiments_datasets:
        list_of_absolute_differences_for_fixed_pair_of_datasets = list()
        for model_name in model_names_lower_case_list:
            # На первом датасете (он из ВТОРОЙ серии экспериментов) происходит обучение.
            # Для этой серии экспериментов качество на обучающей выброке сохранено в отдельной папке
            # Также считаем, что качество падает до нуля
            difference = -1 * get_single_f1_score_value_from_df_from_train_quality_folder(
                model_name + ' trained on ' + first_dataset + ' quality on train dataset'
            )
            list_of_absolute_differences_for_fixed_pair_of_datasets.append(difference)
        # absolute quality difference: mean result
        mean_f1_score_value = np.average(list_of_absolute_differences_for_fixed_pair_of_datasets)
        list_of_mean_f1_score_absolute_difference_values.append(mean_f1_score_value)
        # absolute quality difference: median result
        median_f1_score_value = np.median(list_of_absolute_differences_for_fixed_pair_of_datasets)
        list_of_median_f1_score_absolute_difference_values.append(median_f1_score_value)
        # relative quality difference: mean result
        list_of_mean_f1_score_relative_difference_values.append(-100)
        # relative quality difference: median result
        list_of_median_f1_score_relative_difference_values.append(-100)
        method_result = df_with_second_method_results.loc[first_dataset, second_dataset]
        list_of_method_results.append(method_result)

# Correlation coefficient (off-diagonal entry of np.corrcoef) between the method
# results and each aggregate of the quality differences.

# absolute quality difference: mean, then median
correlation_for_mean_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_absolute_difference_values
)[0, 1]
correlation_for_median_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_absolute_difference_values
)[0, 1]

# relative quality difference: mean, then median
correlation_for_mean_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_relative_difference_values
)[0, 1]
correlation_for_median_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_relative_difference_values
)[0, 1]

In [16]:
# Summary table: one row per correlation coefficient, labelled with the coefficient's name
df_first_sense_second_case_results = pd.DataFrame({'value': []})

for row_label, row_value in (
    ('correlation_for_mean_absolute_quality_difference_and_second_method_result',
     correlation_for_mean_absolute_quality_difference_and_second_method_result),
    ('correlation_for_median_absolute_quality_difference_and_second_method_result',
     correlation_for_median_absolute_quality_difference_and_second_method_result),
    ('correlation_for_mean_relative_quality_difference_and_second_method_result',
     correlation_for_mean_relative_quality_difference_and_second_method_result),
    ('correlation_for_median_relative_quality_difference_and_second_method_result',
     correlation_for_median_relative_quality_difference_and_second_method_result),
):
    # Assigning to a new label appends a row to the dataframe
    df_first_sense_second_case_results.loc[row_label] = row_value

In [17]:
# Show the correlation table for the first sense, case 2, as the cell output
df_first_sense_second_case_results

Unnamed: 0,value
correlation_for_mean_absolute_quality_difference_and_second_method_result,0.137801
correlation_for_median_absolute_quality_difference_and_second_method_result,0.118253
correlation_for_mean_relative_quality_difference_and_second_method_result,0.072547
correlation_for_median_relative_quality_difference_and_second_method_result,0.08206


**Вывод:**

_Для случая 1:_

Коэффициенты корреляции получаются около 0.7, что находится на границе между средней и высокой корреляцией.


_Для случая 2:_

Коэффициенты корреляции получаются существенно меньше: от 0.07 до 0.14, что соответствует очень слабой корреляции.


**Подводя итог, получается стойкая средняя корреляция для случая 1, но для случая 2 корреляция получается очень низкая. Метод является слабо эффективным для оценки сходства датасетов в первом смысле.**

# Исследуем эффективность метода для сходства датасетов во втором смысле


In [18]:
def get_f1_score_values_from_df_from_additional_research(df_name: str):
    """Return the ``f1_score`` column of a saved additional-research table.

    Reads ``<df_name>.csv`` from the ``additional_research`` folder on the
    mounted Google Drive, uses its ``'Unnamed: 0'`` column as the index and
    returns the values of the ``f1_score`` column.
    """
    csv_path = '/content/drive/MyDrive/data_for_colab/dataframes/additional_research/' + df_name + '.csv'
    table = pd.read_csv(csv_path)
    table.set_index('Unnamed: 0', inplace=True)
    # Drop the index name left over from the CSV header
    table.index.names = [None]
    return table['f1_score'].values


**Случай 1: рассматриваются не все возможные пары датасетов, а половина из них: есть пара, соответствующая первой серии экспериментов, есть пара, соответствующая второй серии экспериментов, также есть пары, соответствующие переходам с датасета первой серии экспериментов на датасет второй серии экспериментов. Среди рассматриваемых пар выполнено, что если есть пара (А, В), то пары (В, А) нет среди рассматриваемых.**

In [19]:
def transform_name_of_dataset(dataset_name):
    """Prefix ``'sms'`` and ``'emails'`` with ``'spam '``; return any other name unchanged."""
    needs_spam_prefix = dataset_name in ('sms', 'emails')
    return 'spam ' + dataset_name if needs_spam_prefix else dataset_name

list_of_dataset_names_with_underscore = ['hotel_reviews', 'movie_reviews', 'spam_sms', 'spam_emails']
# Only one direction of every pair is considered here: if (A, B) is listed, (B, A) is not.
list_of_pairs_considered = ['hotel_reviews_to_movie_reviews', 'hotel_reviews_to_emails', 'hotel_reviews_to_sms',
                            'movie_reviews_to_emails', 'movie_reviews_to_sms', 'sms_to_emails']

# Placeholders; each one is overwritten below once the correlations are computed.
correlation_for_mean_absolute_quality_difference_and_second_method_result = -1
correlation_for_median_absolute_quality_difference_and_second_method_result = -1
correlation_for_mean_relative_quality_difference_and_second_method_result = -1
correlation_for_median_relative_quality_difference_and_second_method_result = -1

list_of_mean_f1_score_absolute_difference_values = []
list_of_median_f1_score_absolute_difference_values = []
list_of_mean_f1_score_relative_difference_values = []
list_of_median_f1_score_relative_difference_values = []
list_of_method_results = []

for pair_of_datasets in list_of_pairs_considered:
    # Absolute quality difference: load the table once, then take mean and median
    absolute_differences = get_f1_score_values_from_df_from_additional_research(
        'df_absolute_quality_difference_when_switching_from_' + pair_of_datasets
    )
    list_of_mean_f1_score_absolute_difference_values.append(absolute_differences.mean())
    list_of_median_f1_score_absolute_difference_values.append(np.median(absolute_differences))

    # Relative quality difference (in percent): load the table once, then take mean and median
    relative_differences = get_f1_score_values_from_df_from_additional_research(
        'df_relative_quality_difference_in_percent_when_switching_from_' + pair_of_datasets
    )
    list_of_mean_f1_score_relative_difference_values.append(relative_differences.mean())
    list_of_median_f1_score_relative_difference_values.append(np.median(relative_differences))

    # Turn e.g. 'hotel_reviews_to_sms' into the labels 'hotel reviews' and 'spam sms'
    # that are used to look up the method result.
    first_dataset_name, second_dataset_name = [
        transform_name_of_dataset(dataset_name.replace('_', ' '))
        for dataset_name in pair_of_datasets.split('_to_')
    ]
    list_of_method_results.append(
        df_with_second_method_results.loc[first_dataset_name, second_dataset_name]
    )


# Correlation coefficient between the method results and each aggregate of the quality differences
# absolute quality difference: mean, then median
correlation_for_mean_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_absolute_difference_values
)[0, 1]
correlation_for_median_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_absolute_difference_values
)[0, 1]
# relative quality difference: mean, then median
correlation_for_mean_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_relative_difference_values
)[0, 1]
correlation_for_median_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_relative_difference_values
)[0, 1]



In [20]:
# Summary table: one row per correlation coefficient, labelled with the coefficient's name
df_second_sense_first_case_results = pd.DataFrame({'value': []})

for row_label, row_value in (
    ('correlation_for_mean_absolute_quality_difference_and_second_method_result',
     correlation_for_mean_absolute_quality_difference_and_second_method_result),
    ('correlation_for_median_absolute_quality_difference_and_second_method_result',
     correlation_for_median_absolute_quality_difference_and_second_method_result),
    ('correlation_for_mean_relative_quality_difference_and_second_method_result',
     correlation_for_mean_relative_quality_difference_and_second_method_result),
    ('correlation_for_median_relative_quality_difference_and_second_method_result',
     correlation_for_median_relative_quality_difference_and_second_method_result),
):
    # Assigning to a new label appends a row to the dataframe
    df_second_sense_first_case_results.loc[row_label] = row_value

In [21]:
# Show the correlation table for the second sense, case 1, as the cell output
df_second_sense_first_case_results

Unnamed: 0,value
correlation_for_mean_absolute_quality_difference_and_second_method_result,0.039861
correlation_for_median_absolute_quality_difference_and_second_method_result,0.121839
correlation_for_mean_relative_quality_difference_and_second_method_result,0.070966
correlation_for_median_relative_quality_difference_and_second_method_result,0.149076


-----

**Случай 2: рассматриваются все возможные пары различных датасетов. Иными словами, к случаю 1 добавлен переход в обратную сторону.**

In [22]:
list_of_dataset_names_with_underscore = ['hotel_reviews', 'movie_reviews', 'spam_sms', 'spam_emails']

# Placeholders; each one is overwritten below once the correlations are computed.
correlation_for_mean_absolute_quality_difference_and_second_method_result = -1
correlation_for_median_absolute_quality_difference_and_second_method_result = -1
correlation_for_mean_relative_quality_difference_and_second_method_result = -1
correlation_for_median_relative_quality_difference_and_second_method_result = -1

list_of_mean_f1_score_absolute_difference_values = []
list_of_median_f1_score_absolute_difference_values = []
list_of_mean_f1_score_relative_difference_values = []
list_of_median_f1_score_relative_difference_values = []
list_of_method_results = []

# Every ordered pair of distinct datasets is considered, i.e. both (A, B) and (B, A).
for first_dataset_name in list_of_dataset_names_with_underscore:
    for second_dataset_name in list_of_dataset_names_with_underscore:
        if first_dataset_name == second_dataset_name:
            continue

        # The saved table names use 'sms' / 'emails' without the 'spam_' prefix
        pair_of_datasets = (first_dataset_name + '_to_' + second_dataset_name).replace('spam_', '')

        # Absolute quality difference: load the table once, then take mean and median
        absolute_differences = get_f1_score_values_from_df_from_additional_research(
            'df_absolute_quality_difference_when_switching_from_' + pair_of_datasets
        )
        list_of_mean_f1_score_absolute_difference_values.append(absolute_differences.mean())
        list_of_median_f1_score_absolute_difference_values.append(np.median(absolute_differences))

        # Relative quality difference (in percent): load the table once, then take mean and median
        relative_differences = get_f1_score_values_from_df_from_additional_research(
            'df_relative_quality_difference_in_percent_when_switching_from_' + pair_of_datasets
        )
        list_of_mean_f1_score_relative_difference_values.append(relative_differences.mean())
        list_of_median_f1_score_relative_difference_values.append(np.median(relative_differences))

        # The method results are indexed by names with spaces instead of underscores
        first_dataset_name_without_underscore = first_dataset_name.replace('_', ' ')
        second_dataset_name_without_underscore = second_dataset_name.replace('_', ' ')
        list_of_method_results.append(
            df_with_second_method_results.loc[
                first_dataset_name_without_underscore, second_dataset_name_without_underscore
            ]
        )

# Correlation coefficient between the method results and each aggregate of the quality differences
# absolute quality difference: mean, then median
correlation_for_mean_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_absolute_difference_values
)[0, 1]
correlation_for_median_absolute_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_absolute_difference_values
)[0, 1]
# relative quality difference: mean, then median
correlation_for_mean_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_mean_f1_score_relative_difference_values
)[0, 1]
correlation_for_median_relative_quality_difference_and_second_method_result = np.corrcoef(
    list_of_method_results, list_of_median_f1_score_relative_difference_values
)[0, 1]


In [23]:
# Summary table: one row per correlation coefficient, labelled with the coefficient's name
df_second_sense_second_case_results = pd.DataFrame({'value': []})

for row_label, row_value in (
    ('correlation_for_mean_absolute_quality_difference_and_second_method_result',
     correlation_for_mean_absolute_quality_difference_and_second_method_result),
    ('correlation_for_median_absolute_quality_difference_and_second_method_result',
     correlation_for_median_absolute_quality_difference_and_second_method_result),
    ('correlation_for_mean_relative_quality_difference_and_second_method_result',
     correlation_for_mean_relative_quality_difference_and_second_method_result),
    ('correlation_for_median_relative_quality_difference_and_second_method_result',
     correlation_for_median_relative_quality_difference_and_second_method_result),
):
    # Assigning to a new label appends a row to the dataframe
    df_second_sense_second_case_results.loc[row_label] = row_value

In [24]:
# Show the correlation table for the second sense, case 2, as the cell output
df_second_sense_second_case_results

Unnamed: 0,value
correlation_for_mean_absolute_quality_difference_and_second_method_result,-1.519744e-17
correlation_for_median_absolute_quality_difference_and_second_method_result,2.017388e-17
correlation_for_mean_relative_quality_difference_and_second_method_result,0.002364859
correlation_for_median_relative_quality_difference_and_second_method_result,0.01037911


**Вывод:**

_Для случая 1:_

Коэффициенты корреляции получаются от 0.04 до 0.15, что соответствует очень слабой корреляции.


_Для случая 2:_

Коэффициенты корреляции получаются менее 0.015, это говорит об отсутствии корреляции.


**Подводя итог, получается очень слабая корреляция для случая 1, для случая 2 корреляция получается чрезвычайно близкой к нулю. Метод является абсолютно неэффективным для оценки сходства датасетов во втором смысле.**

_Замечание:_

На самом деле, неудивительно, что этот метод совершенно не подходит для оценки сходства датасетов во втором смысле. Легко заметить, что метод симметричный: если методу в качестве аргументов передать на вход датасеты в другом порядке, то возвращаемое методом численное значение не изменится. С другой стороны, сходство датасетов во втором смысле симметричным **не** является. Датасеты в случае 2 специально подобраны таким образом, чтобы подтвердить или опровергнуть значимость описанного выше несоответствия. Видим, что получилось _подтвердить_ важность описанного выше несоответствия.

