# В этом ноутбуке представлена вторая гипотеза для оценки сходства датасетов, а также реализован метод сравнения датасетов в соответствии со второй гипотезой.

**Вторая гипотеза** для метода оценки сходства датасетов:

_Сходство датасетов может быть связано со схожестью их текстового содержания, которое отражается в близких средних значениях эмбеддингов._

-------------------

Поэтому **метод сравнения датасетов** следующий:

Сделать векторные представления каждого текста из датасета (например, с помощью BERT) и усреднить их. Для разных датасетов сравнить полученные усреднённые эмбеддинги, например, с помощью cosine_similarity. Чем больше полученное значение, тем более похожими являются усреднённые эмбеддинги и, по нашей гипотезе, тем больше сходство сравниваемых датасетов.

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
need_to_conduct_research = False  # If set to True, the time-consuming study is run and its results are saved to Google Drive.
# If set to False, the previously saved results of the study are loaded from Google Drive (MyDrive).
# Setting False is recommended.

In [None]:
# The extra dependencies are installed/imported and the encoder is created only
# when the study is actually re-run; otherwise `cosine_similarity` and `model`
# stay undefined in this notebook.
if need_to_conduct_research:
    from sklearn.metrics.pairwise import cosine_similarity
    # The next line is an IPython shell escape, so this cell only runs inside a notebook.
    !pip install simpletransformers
    from simpletransformers.language_representation import RepresentationModel

    # Sentence encoder built from the "bert-base-uncased" checkpoint.
    # NOTE(review): use_cuda=False presumably keeps inference on the CPU - confirm
    # against the simpletransformers documentation.
    model = RepresentationModel(
        model_type="bert",
        model_name="bert-base-uncased",
        use_cuda=False
    )

In [None]:
# Human-readable dataset identifiers. They are used both as the keys accepted by
# the dataset loader and as the row/column labels of the similarity matrix.
datasets_names = ['hotel reviews', 'movie reviews', 'spam sms', 'spam emails']

# Individual names, unpacked in the same order as the list above.
hotel_reviews, movie_reviews, spam_sms, spam_emails = datasets_names

In [None]:
def _take_head_and_tail(class_texts, train_divisor, test_divisor, test_multiplier):
    """Split the texts of one class into a train part (head) and a test part (tail).

    The train part is the first ``len // train_divisor`` rows; the test part is
    the last ``(len // test_divisor) * test_multiplier`` rows.
    """
    n_rows = class_texts.shape[0]
    train_part = class_texts.head(n_rows // train_divisor)
    test_part = class_texts.tail((n_rows // test_divisor) * test_multiplier)
    return train_part, test_part


def get_dataset_in_correct_form(dataset_name, max_items=100):
    """Load one of the known datasets and return its texts as a plain list.

    The texts are ordered as: negative train, positive train, negative test,
    positive test, and the list is then cut to the first ``max_items`` entries.

    Args:
        dataset_name: one of the module-level names ``spam_sms``,
            ``spam_emails``, ``hotel_reviews`` or ``movie_reviews``.
        max_items: maximum number of texts to return (default 100, because
            encoding whole datasets takes too long). Pass ``None`` to get
            every selected text.

    Returns:
        A list with the values of the text column.

    Raises:
        ValueError: if ``dataset_name`` is not one of the known names.

    NOTE(review): because the negative training texts come first, a small
    ``max_items`` can end up containing texts of that single class only.
    """
    data_dir = '/content/drive/MyDrive/data_for_colab/'

    # Every branch leaves `df` with a text column 'DATA_COLUMN' and a 0/1 label
    # column 'LABEL_COLUMN', and picks the split sizes for each class as
    # (train_divisor, test_divisor, test_multiplier).
    if dataset_name == spam_sms:
        df = pd.read_csv(data_dir + 'spam_sms.csv', encoding="ISO-8859-1")
        df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
        df.columns = ['LABEL_COLUMN', 'DATA_COLUMN']
        df['LABEL_COLUMN'] = (df['LABEL_COLUMN'] == 'spam').astype(int)
        # First half of each class for training, last half for testing.
        negative_split = positive_split = (2, 2, 1)

    elif dataset_name == spam_emails:
        df = pd.read_csv(data_dir + 'spam_emails.csv', encoding="ISO-8859-1")
        df = df.drop(columns=['Unnamed: 0', 'label'])
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        # Flatten multi-line e-mails into a single line.
        df['DATA_COLUMN'] = df['DATA_COLUMN'].apply(lambda x: x.replace('\r\n', ' ').replace('\n', ' '))
        # First half of each class for training, last half for testing.
        negative_split = positive_split = (2, 2, 1)

    elif dataset_name == hotel_reviews:
        df = pd.read_csv(data_dir + 'tripadvisor_hotel_reviews.csv')
        # Rating 3 is dropped; >= 4 counts as positive. Copy the filtered frame
        # so the column assignment below does not write into a slice.
        df = df[df.Rating != 3].copy()
        df['is_positive'] = (df['Rating'] >= 4).astype(int)
        df = df.drop(columns=['Rating'])
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        # Negatives: first 1/4 train, last 3/4 test; positives: first 1/20 train, last 3/20 test.
        negative_split = (4, 4, 3)
        positive_split = (20, 20, 3)

    elif dataset_name == movie_reviews:
        df = pd.read_csv(data_dir + 'IMDB Dataset.csv')
        df['is_positive'] = (df['sentiment'] == 'positive').astype(int)
        df = df.drop(columns=['sentiment'])
        df.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
        # Train: first 2.5% of each class; test: last 10% of each class.
        negative_split = positive_split = (40, 10, 1)

    else:
        raise ValueError('Wrong dataset name')

    texts = df['DATA_COLUMN']
    negative_train, negative_test = _take_head_and_tail(texts[df['LABEL_COLUMN'] == 0], *negative_split)
    positive_train, positive_test = _take_head_and_tail(texts[df['LABEL_COLUMN'] == 1], *positive_split)

    # No .squeeze() here: on a one-row selection it would return a scalar,
    # which pd.concat cannot handle.
    combined = pd.concat([negative_train, positive_train, negative_test, positive_test])
    # iloc[:None] keeps everything, so max_items=None returns the full selection.
    return list(combined.iloc[:max_items])

In [None]:
def get_vector_representations(dataset_in_correct_form):
    """Encode the given texts with the notebook-level ``model``.

    Args:
        dataset_in_correct_form: the texts to encode, as produced by
            ``get_dataset_in_correct_form``.

    Returns:
        Whatever ``model.encode_sentences`` returns for ``combine_strategy="mean"``
        (presumably one vector per text - callers average the result over axis 0).
    """
    return model.encode_sentences(dataset_in_correct_form, combine_strategy="mean")

In [None]:
# cosine_similarity(np.mean(get_vector_representations(get_dataset_in_correct_form(spam_emails)), axis = 0).reshape(1, -1), np.mean(get_vector_representations(get_dataset_in_correct_form(spam_sms)), axis = 0).reshape(1, -1))[0][0]

На 1/16 часть исследования (сравнение одной пары датасетов) требуется около 2 минут, если брать первые 100 записей каждого датасета. Если брать датасеты целиком, то на 1/16 часть исследования требуется более получаса, поэтому я завершил выполнение кода досрочно.

In [None]:
# Build (or load) the matrix of cosine similarities between the mean embeddings
# of every pair of datasets.
if need_to_conduct_research:
    # Loading and encoding a dataset is the expensive step, so do it exactly once
    # per dataset instead of twice for every one of the len(datasets_names) ** 2 pairs.
    mean_embeddings = dict()
    for _i, cur_dataset in enumerate(datasets_names, start=1):
        print(_i, 'out of', len(datasets_names), ':', "Please, be patient! Working on encoding", cur_dataset)
        cur_vectors = get_vector_representations(get_dataset_in_correct_form(cur_dataset))
        # cosine_similarity expects 2-D inputs, hence the (1, -1) reshape of the mean vector.
        mean_embeddings[cur_dataset] = np.mean(cur_vectors, axis=0).reshape(1, -1)

    df_with_second_method_results = pd.DataFrame(columns=[hotel_reviews, movie_reviews, spam_sms, spam_emails])
    for cur_first_dataset in datasets_names:
        for cur_second_dataset in datasets_names:
            df_with_second_method_results.loc[cur_first_dataset, cur_second_dataset] = cosine_similarity(
                mean_embeddings[cur_first_dataset],
                mean_embeddings[cur_second_dataset],
            )[0][0]
    df_with_second_method_results.to_csv('/content/drive/MyDrive/data_for_colab/dataframes/second_approach/df_with_second_method_results.csv')
else:
    # Reuse the matrix saved by an earlier run; the unnamed first CSV column holds the row labels.
    df_with_second_method_results = pd.read_csv('/content/drive/MyDrive/data_for_colab/dataframes/second_approach/df_with_second_method_results.csv')
    df_with_second_method_results.set_index('Unnamed: 0', inplace=True)
    df_with_second_method_results.index.names = [None]

In [None]:
df_with_second_method_results

Unnamed: 0,hotel reviews,movie reviews,spam sms,spam emails
hotel reviews,1.0,0.767404,0.592511,0.782914
movie reviews,0.767404,1.0,0.729809,0.862794
spam sms,0.592511,0.729809,1.0,0.720972
spam emails,0.782914,0.862794,0.720972,1.0


In [None]:
df_with_second_method_results

Unnamed: 0,hotel reviews,movie reviews,spam sms,spam emails
hotel reviews,1.0,0.767404,0.592511,0.782914
movie reviews,0.767404,1.0,0.729809,0.862794
spam sms,0.592511,0.729809,1.0,0.720972
spam emails,0.782914,0.862794,0.720972,1.0


# Исследуем эффективность метода для сходства датасетов во втором смысле


In [None]:
def get_f1_score_values_from_df_from_additional_research(
    df_name: str,
    base_dir: str = '/content/drive/MyDrive/data_for_colab/dataframes/additional_research/',
) -> np.ndarray:
    """Return the ``f1_score`` column of a saved "additional research" table.

    Args:
        df_name: file name of the table without the ``.csv`` extension.
        base_dir: directory that holds the CSV files; defaults to the Google
            Drive folder used by this notebook.

    Returns:
        The values of the ``f1_score`` column as a NumPy array, in file order.

    Raises:
        FileNotFoundError: if ``<base_dir>/<df_name>.csv`` does not exist.
        KeyError: if the table has no ``f1_score`` column.
    """
    import os

    cur_df = pd.read_csv(os.path.join(base_dir, df_name + '.csv'))
    # Only the column values are returned, so the row index needs no special handling.
    return cur_df['f1_score'].values


In [None]:
# Correlate the method result (cosine similarity of mean embeddings) with the
# change in F1 score observed when switching from one dataset to another.
list_of_dataset_names_with_underscore = ['hotel_reviews', 'movie_reviews', 'spam_sms', 'spam_emails']

# NOTE: these names say "first_method", but the values are computed from
# df_with_second_method_results. The names are kept because the following
# cells read them.
list_of_correlations_for_mean_absolute_quality_difference_and_first_method_result = list()
list_of_correlations_for_median_absolute_quality_difference_and_first_method_result = list()
list_of_correlations_for_mean_relative_quality_difference_and_first_method_result = list()
list_of_correlations_for_median_relative_quality_difference_and_first_method_result = list()

# One entry per ordered pair of distinct datasets, all lists in the same pair order.
list_of_mean_f1_score_absolute_difference_values = list()
list_of_median_f1_score_absolute_difference_values = list()
list_of_mean_f1_score_relative_difference_values = list()
list_of_median_f1_score_relative_difference_values = list()
list_of_method_results = list()

for first_dataset_name in list_of_dataset_names_with_underscore:
    for second_dataset_name in list_of_dataset_names_with_underscore:
        if first_dataset_name == second_dataset_name:
            continue

        # The saved tables use the short names "sms"/"emails" (without the "spam_" prefix).
        pair_of_datasets = (first_dataset_name + '_to_' + second_dataset_name).replace('spam_', '')

        # Absolute quality difference: read the table once, then take mean and median.
        absolute_f1_score_values = get_f1_score_values_from_df_from_additional_research(
            'df_absolute_quality_difference_when_switching_from_' + pair_of_datasets
        )
        list_of_mean_f1_score_absolute_difference_values.append(absolute_f1_score_values.mean())
        list_of_median_f1_score_absolute_difference_values.append(np.median(absolute_f1_score_values))

        # Relative quality difference (in percent): same treatment.
        relative_f1_score_values = get_f1_score_values_from_df_from_additional_research(
            'df_relative_quality_difference_in_percent_when_switching_from_' + pair_of_datasets
        )
        list_of_mean_f1_score_relative_difference_values.append(relative_f1_score_values.mean())
        list_of_median_f1_score_relative_difference_values.append(np.median(relative_f1_score_values))

        # The similarity matrix is labelled with the space-separated dataset names.
        method_result = df_with_second_method_results.loc[
            first_dataset_name.replace('_', ' '),
            second_dataset_name.replace('_', ' '),
        ]
        list_of_method_results.append(method_result)

# Pearson correlation between the method result and each quality-difference statistic.
# absolute quality difference: mean result
list_of_correlations_for_mean_absolute_quality_difference_and_first_method_result.append(
    np.corrcoef(list_of_method_results, list_of_mean_f1_score_absolute_difference_values)[0, 1]
)
# absolute quality difference: median result
list_of_correlations_for_median_absolute_quality_difference_and_first_method_result.append(
    np.corrcoef(list_of_method_results, list_of_median_f1_score_absolute_difference_values)[0, 1]
)
# relative quality difference: mean result
list_of_correlations_for_mean_relative_quality_difference_and_first_method_result.append(
    np.corrcoef(list_of_method_results, list_of_mean_f1_score_relative_difference_values)[0, 1]
)
# relative quality difference: median result
list_of_correlations_for_median_relative_quality_difference_and_first_method_result.append(
    np.corrcoef(list_of_method_results, list_of_median_f1_score_relative_difference_values)[0, 1]
)




In [None]:
list_of_correlations_for_mean_absolute_quality_difference_and_first_method_result

[-1.5197439593862942e-17]

In [None]:
list_of_correlations_for_median_absolute_quality_difference_and_first_method_result

[2.017388086523622e-17]

In [None]:
list_of_correlations_for_mean_relative_quality_difference_and_first_method_result

[0.0023648588529360352]

In [None]:
list_of_correlations_for_median_relative_quality_difference_and_first_method_result

[0.010379109681917582]

**Вывод:** метод не подходит для оценки сходства датасетов во втором смысле. Это неудивительно: было бы странно применять его для сходства датасетов во втором смысле, потому что этот метод симметричный (а сходство датасетов во втором смысле – не симметрично).



# Исследуем эффективность метода для сходства датасетов в первом смысле

Итерируемся по парам, элементы которых взяты из одной и той же серии экспериментов.
Действуем по аналогии с исследованием корреляции для сходства датасетов во втором смысле.