In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import os
from tqdm import tqdm

# Считываем файлы эмбеддингов (папка хранится локально)

Считаем эмбеддинги, полученные с помощью BERT, и добавим к ним целевую переменную — тональность текста.

In [2]:
# Local folder with the per-batch embedding dumps and the naming scheme of the
# files inside it: <file_template><batch number><file_ext>.
sample_folder = '/home/roman/Документы/AmazonRecomendationSystem/data/raw/tonality/TonalityBERTStates'
file_template = 'state_epoch_'
file_ext = '.csv'

# Count only the entries that follow the naming scheme, so that stray files
# or folders (checkpoints, temp files, ...) do not inflate the batch count.
batches = sum(
    name.startswith(file_template) and name.endswith(file_ext)
    for name in os.listdir(sample_folder)
)

In [3]:
# Read every batch file <file_template><i><file_ext> for i = 0 .. batches - 1.
# Passing the path (instead of an open() handle) lets pandas open and close
# each file itself, and the frames are collected in a list so that they are
# concatenated once rather than re-copied on every iteration.
batch_frames = []
for batch in tqdm(range(batches), leave=False):
    batch_path = os.path.join(sample_folder, file_template + str(batch) + file_ext)
    batch_frames.append(pd.read_csv(batch_path))

# ignore_index=True yields a fresh 0..n-1 row index across all batches.
res = pd.concat(batch_frames, axis=0, ignore_index=True)

# The first column of the files is discarded
# (presumably the row index saved together with the CSV - TODO confirm).
res = res.drop(columns=res.columns[0])

# All remaining columns but the last are named as embedding features;
# the last one is treated as the target.
res.columns = [f'embeded_feature_{i}' for i in range(res.shape[1] - 1)] + ['target']

                                                                                

In [4]:
# Summary of the assembled frame: number of rows, columns, dtypes and memory usage.
res.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Columns: 769 entries, embeded_feature_0 to target
dtypes: float64(768), int64(1)
memory usage: 40.6 MB


In [5]:
# Preview the first rows of the assembled frame (embedding features + target).
res.head()

Unnamed: 0,embeded_feature_0,embeded_feature_1,embeded_feature_2,embeded_feature_3,embeded_feature_4,embeded_feature_5,embeded_feature_6,embeded_feature_7,embeded_feature_8,embeded_feature_9,...,embeded_feature_759,embeded_feature_760,embeded_feature_761,embeded_feature_762,embeded_feature_763,embeded_feature_764,embeded_feature_765,embeded_feature_766,embeded_feature_767,target
0,-0.215934,-0.140289,0.008311,-0.101173,-0.098799,-0.092581,0.187413,0.308779,-0.079129,-0.144936,...,-0.291156,-0.136476,-0.077851,0.071078,0.063628,0.084097,-0.136949,0.5867,0.201127,1
1,-0.172627,-0.144762,0.002234,0.030628,0.120575,-0.088869,0.206804,0.255742,-0.114185,-0.007489,...,-0.115842,0.114645,0.047418,0.18532,-0.116279,-0.066597,-0.174425,0.213865,0.371975,0
2,-0.050633,0.072039,-0.029597,-0.140354,-0.080607,-0.103232,0.426805,0.245678,0.046981,-0.148649,...,-0.249636,-0.037668,-0.227981,-0.072469,0.078502,-0.011736,-0.071489,0.718524,0.262255,0
3,-0.147498,0.064452,-0.003698,-0.219294,0.060844,-0.064445,0.004213,0.256716,-0.175174,-0.163916,...,-0.013068,-0.029533,-0.143953,-0.0533,-0.050167,0.021765,-0.078147,0.423837,0.231882,1
4,-0.065416,0.063764,0.02759,-0.120538,-0.069055,-0.089219,0.151275,0.308792,-0.28177,0.076271,...,-0.263472,0.030695,-0.024583,0.068632,0.141143,-0.075043,-0.022626,0.4793,0.391161,1


In [6]:
# (rows, columns) of the assembled frame.
res.shape

(6920, 769)

# Проверим корректность добавления целевой переменной 

In [7]:
# Original SST2 training split, fetched straight from GitHub: a tab-separated
# file without a header row, so its columns get the integer labels 0, 1, ...
sst2_train_url = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
orig_data = pd.read_csv(sst2_train_url, sep='\t', header=None)

In [8]:
# Row index of the original data set, shown for comparison with res.index.
orig_data.index

RangeIndex(start=0, stop=6920, step=1)

In [9]:
# Row index of the assembled frame; it has to match orig_data.index, because
# the column-wise concat of the two target columns pairs rows by index label.
res.index

RangeIndex(start=0, stop=6920, step=1)

In [10]:
# Put the target stored with the embeddings next to the label column
# (column 1) of the original data; rows are paired by index label.
diff = pd.concat(
    [res['target'], orig_data[1]],
    axis=1,
    keys=['target_res', 'target_orig'],
)

In [11]:
# Mean squared difference between the two target columns; 0.0 means that
# every row carries the same label in both sources.
target_mismatch = ((diff['target_orig'] - diff['target_res']) ** 2).mean()
print('Расхождение значений целевой переменной: ', target_mismatch)

Расхождение значений целевой переменной:  0.0


# Проверим корректность эмбеддингов

In [12]:
# Reference embeddings stored in a separate local file; they are compared
# with the first rows of `res` further down.
emb_check_path = '/home/roman/Документы/AmazonRecomendationSystem/data/raw/tonality/emb.csv'
test = pd.read_csv(emb_check_path)
test.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,0,-0.215934,-0.140289,0.008311,-0.101173,-0.0988,-0.092581,0.187413,0.308779,-0.079129,...,0.007209,-0.291156,-0.136476,-0.077851,0.071078,0.063628,0.084097,-0.136948,0.5867,0.201127
1,1,-0.172627,-0.144762,0.002234,0.030628,0.120575,-0.088869,0.206804,0.255742,-0.114185,...,-0.075452,-0.115842,0.114645,0.047417,0.18532,-0.116279,-0.066597,-0.174425,0.213864,0.371975
2,2,-0.050634,0.07204,-0.029597,-0.140355,-0.080607,-0.103232,0.426805,0.245678,0.046982,...,-0.211884,-0.249635,-0.037668,-0.227981,-0.072469,0.078503,-0.011735,-0.07149,0.718524,0.262255
3,3,-0.147498,0.064452,-0.003699,-0.219294,0.060845,-0.064445,0.004213,0.256717,-0.175174,...,0.273039,-0.013068,-0.029533,-0.143953,-0.0533,-0.050167,0.021766,-0.078147,0.423838,0.231882
4,4,-0.065416,0.063764,0.02759,-0.120538,-0.069055,-0.089218,0.151275,0.308792,-0.28177,...,-0.042348,-0.263472,0.030695,-0.024583,0.068632,0.141143,-0.075043,-0.022626,0.4793,0.391161


In [13]:
# Remove the leftover index column that read_csv labelled 'Unnamed: 0'.
# Dropping by name with errors='ignore' keeps the cell idempotent: running it
# again is a no-op instead of silently removing a real embedding column,
# which is what a positional in-place drop of the first column would do.
test = test.drop(columns=['Unnamed: 0'], errors='ignore')
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.215934,-0.140289,0.008311,-0.101173,-0.0988,-0.092581,0.187413,0.308779,-0.079129,-0.144936,...,0.007209,-0.291156,-0.136476,-0.077851,0.071078,0.063628,0.084097,-0.136948,0.5867,0.201127
1,-0.172627,-0.144762,0.002234,0.030628,0.120575,-0.088869,0.206804,0.255742,-0.114185,-0.007488,...,-0.075452,-0.115842,0.114645,0.047417,0.18532,-0.116279,-0.066597,-0.174425,0.213864,0.371975
2,-0.050634,0.07204,-0.029597,-0.140355,-0.080607,-0.103232,0.426805,0.245678,0.046982,-0.148649,...,-0.211884,-0.249635,-0.037668,-0.227981,-0.072469,0.078503,-0.011735,-0.07149,0.718524,0.262255
3,-0.147498,0.064452,-0.003699,-0.219294,0.060845,-0.064445,0.004213,0.256717,-0.175174,-0.163916,...,0.273039,-0.013068,-0.029533,-0.143953,-0.0533,-0.050167,0.021766,-0.078147,0.423838,0.231882
4,-0.065416,0.063764,0.02759,-0.120538,-0.069055,-0.089218,0.151275,0.308792,-0.28177,0.076272,...,-0.042348,-0.263472,0.030695,-0.024583,0.068632,0.141143,-0.075043,-0.022626,0.4793,0.391161


In [14]:
# Name the reference columns embeded_feature_0 .. embeded_feature_<n-1>, the
# same scheme used for `res`, so that both frames can be aligned by label.
feature_count = len(test.columns)
test.columns = ['embeded_feature_' + str(position) for position in range(feature_count)]

In [15]:
# Preview the reference embeddings after the columns have been renamed.
test.head()

Unnamed: 0,embeded_feature_0,embeded_feature_1,embeded_feature_2,embeded_feature_3,embeded_feature_4,embeded_feature_5,embeded_feature_6,embeded_feature_7,embeded_feature_8,embeded_feature_9,...,embeded_feature_758,embeded_feature_759,embeded_feature_760,embeded_feature_761,embeded_feature_762,embeded_feature_763,embeded_feature_764,embeded_feature_765,embeded_feature_766,embeded_feature_767
0,-0.215934,-0.140289,0.008311,-0.101173,-0.0988,-0.092581,0.187413,0.308779,-0.079129,-0.144936,...,0.007209,-0.291156,-0.136476,-0.077851,0.071078,0.063628,0.084097,-0.136948,0.5867,0.201127
1,-0.172627,-0.144762,0.002234,0.030628,0.120575,-0.088869,0.206804,0.255742,-0.114185,-0.007488,...,-0.075452,-0.115842,0.114645,0.047417,0.18532,-0.116279,-0.066597,-0.174425,0.213864,0.371975
2,-0.050634,0.07204,-0.029597,-0.140355,-0.080607,-0.103232,0.426805,0.245678,0.046982,-0.148649,...,-0.211884,-0.249635,-0.037668,-0.227981,-0.072469,0.078503,-0.011735,-0.07149,0.718524,0.262255
3,-0.147498,0.064452,-0.003699,-0.219294,0.060845,-0.064445,0.004213,0.256717,-0.175174,-0.163916,...,0.273039,-0.013068,-0.029533,-0.143953,-0.0533,-0.050167,0.021766,-0.078147,0.423838,0.231882
4,-0.065416,0.063764,0.02759,-0.120538,-0.069055,-0.089218,0.151275,0.308792,-0.28177,0.076272,...,-0.042348,-0.263472,0.030695,-0.024583,0.068632,0.141143,-0.075043,-0.022626,0.4793,0.391161


In [16]:
# (rows, columns) of the reference embeddings.
test.shape

(40, 768)

In [17]:
# Take as many leading rows of `res` as there are reference rows in `test`.
# The size is derived from `test` rather than hard-coded, so that the
# row-by-row comparison keeps working if the reference file changes size.
testing = res.iloc[:len(test)]
testing.head()

Unnamed: 0,embeded_feature_0,embeded_feature_1,embeded_feature_2,embeded_feature_3,embeded_feature_4,embeded_feature_5,embeded_feature_6,embeded_feature_7,embeded_feature_8,embeded_feature_9,...,embeded_feature_759,embeded_feature_760,embeded_feature_761,embeded_feature_762,embeded_feature_763,embeded_feature_764,embeded_feature_765,embeded_feature_766,embeded_feature_767,target
0,-0.215934,-0.140289,0.008311,-0.101173,-0.098799,-0.092581,0.187413,0.308779,-0.079129,-0.144936,...,-0.291156,-0.136476,-0.077851,0.071078,0.063628,0.084097,-0.136949,0.5867,0.201127,1
1,-0.172627,-0.144762,0.002234,0.030628,0.120575,-0.088869,0.206804,0.255742,-0.114185,-0.007489,...,-0.115842,0.114645,0.047418,0.18532,-0.116279,-0.066597,-0.174425,0.213865,0.371975,0
2,-0.050633,0.072039,-0.029597,-0.140354,-0.080607,-0.103232,0.426805,0.245678,0.046981,-0.148649,...,-0.249636,-0.037668,-0.227981,-0.072469,0.078502,-0.011736,-0.071489,0.718524,0.262255,0
3,-0.147498,0.064452,-0.003698,-0.219294,0.060844,-0.064445,0.004213,0.256716,-0.175174,-0.163916,...,-0.013068,-0.029533,-0.143953,-0.0533,-0.050167,0.021765,-0.078147,0.423837,0.231882,1
4,-0.065416,0.063764,0.02759,-0.120538,-0.069055,-0.089219,0.151275,0.308792,-0.28177,0.076271,...,-0.263472,0.030695,-0.024583,0.068632,0.141143,-0.075043,-0.022626,0.4793,0.391161,1


In [18]:
# Mean absolute difference per row between the reference embeddings and the
# matching row of `testing`. The two rows are aligned by column label; labels
# present on one side only (such as `target`) become NaN, which mean() skips.
metric = {
    row_label: np.abs(test.loc[row_label] - testing.loc[row_label]).mean()
    for row_label in test.index
}

metric

{0: 1.968155078124382e-07,
 1: 1.991475651038757e-07,
 2: 2.782410807280143e-07,
 3: 1.8084865885472725e-07,
 4: 1.753096744791253e-07,
 5: 1.9017660156345278e-07,
 6: 1.6367721354128047e-07,
 7: 2.656096119807651e-07,
 8: 2.0064005208303907e-07,
 9: 2.3445679427131576e-07,
 10: 2.1766091145928924e-07,
 11: 1.8978177083318003e-07,
 12: 2.0173010156354565e-07,
 13: 1.3989861979262063e-07,
 14: 1.86152395832323e-07,
 15: 1.9217643229168704e-07,
 16: 1.898078125025991e-07,
 17: 2.2121014322719543e-07,
 18: 1.6441763020734844e-07,
 19: 2.0916940885484948e-07,
 20: 1.4411191406265076e-07,
 21: 1.3929111979135184e-07,
 22: 1.6733608724053262e-07,
 23: 2.2947171875091608e-07,
 24: 3.0096394531280067e-07,
 25: 2.3383072916703508e-07,
 26: 1.8156324218857172e-07,
 27: 1.0412395833176513e-07,
 28: 1.931941796875958e-07,
 29: 1.3256033723918834e-07,
 30: 1.9256651041698395e-07,
 31: 1.3366812499891718e-07,
 32: 2.1264374999993759e-07,
 33: 2.0413945963355216e-07,
 34: 1.7444153645832813e-07,
 35:

# Сохраним датасет

In [19]:
# Write the assembled data set to the processed-data folder.
# NOTE: with to_csv defaults the row index is written as the first column,
# so readers of this file have to account for that extra column.
save_folder = '/home/roman/Документы/AmazonRecomendationSystem/data/processed'
output_path = os.path.join(save_folder, 'tonality_embeded_dataset.csv')
res.to_csv(output_path)