# Загрузка данных

In [1]:
import re
import numpy as np
import pandas as pd

import torch
from transformers import BertTokenizer, BertModel
from googletrans import Translator

In [2]:
train_data = pd.read_csv('data/train_issues.csv')
test_data = pd.read_csv('data/test_issues.csv')

train_comments = pd.read_csv('data/train_comments.csv')
test_comments = pd.read_csv('data/test_comments.csv')

employees = pd.read_csv('data/employees.csv')

# Корректировка файла employees

In [3]:
# Пропуски в данных

display(employees.isna().sum())

id                              0
active                          0
full_name                       0
position                      150
hiring_type                    83
payment_type                  122
salary_calculation_type       310
english_level                 327
passport                        0
is_nda_signed                   0
is_labor_contract_signed        0
is_added_to_internal_chats      0
is_added_one_to_one             0
dtype: int64

In [4]:
# Список должностей

display(employees['position'].value_counts())

Web-разработчик                                 84
Менеджер проектов                               21
Разработчик мобильных приложений                13
Системный администратор (DevOps инженер)         6
DevOps инженер                                   5
разработчик мобильных приложений                 5
Копирайтер                                       5
Инженер-тестировщик                              4
Инженер тестировщик (QA engineer)                4
Web дизайнер                                     3
Сорсер                                           3
Delivery Manager                                 2
Ассистент HR-менеджера                           2
Системный администратор                          2
 Системный администратор                         1
графический дизайнер                             1
Ассистент менеджера проектов                     1
Рекрутер                                         1
ML-инженер                                       1
IT-рекрутер                    

In [5]:
# Заполнение пропусков и переименование должностей

employees['position'] = employees['position'].fillna('Missing')
employees['position'] = employees['position'].apply(lambda x: x.lower().replace('-', ' ').strip())
employees['position'] = employees['position'].apply(lambda x: x.replace('ведущий', ''))
employees['position'] = employees['position'].apply(lambda x: x.replace('(qa engineer)', ''))

mask = employees['position'].apply(lambda x: 'devops инженер' in x)
employees.loc[mask, 'position'] = 'devops engineer'

mask = employees['position'].apply(lambda x: x in ['it рекрутер', 'менеджер по персоналу (hr менеджер)', 'рекрутер', 
                                                   'специалист отдела по управлению персоналом', 'ассистент hr менеджера',
                                                   'сорсер'])
employees.loc[mask, 'position'] = 'hr'

mask = employees['position'].apply(lambda x: 'бухгалтер' in x)
employees.loc[mask, 'position'] = 'бухгалтер'

employees['position'] = employees['position'].apply(lambda x: x.strip())

In [6]:
# Скорректированный список должностей

display(employees['position'].value_counts())

missing                                         150
web разработчик                                  86
менеджер проектов                                22
разработчик мобильных приложений                 18
devops engineer                                  13
hr                                                9
инженер тестировщик                               8
копирайтер                                        5
web дизайнер                                      4
системный администратор                           3
бухгалтер                                         2
delivery manager                                  2
офис менеджер                                     2
seo специалист                                    1
консультант                                       1
ml инженер                                        1
графический дизайнер                              1
руководитель отдела продаж                        1
менеджер по тендерам                              1
специалист п

In [7]:
# Переименование колонки id
employees.rename(columns={'id': 'emp_id'}, inplace=True)

# Объединение данных 
* data: test + train + employees
* comments: test + train

In [8]:
# Описание задач (data)

test_data['overall_worklogs'] = -1
data = pd.concat([train_data, test_data])

data = pd.merge(data, employees[['emp_id', 'position']], left_on='assignee_id', right_on='emp_id', how='left')
data = pd.merge(data, employees[['emp_id', 'position']], left_on='creator_id', right_on='emp_id', how='left')
data.drop(columns=['emp_id_x', 'emp_id_y'], inplace=True)
display(data)

Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs,position_x,position_y
0,819952,2019-10-01 05:57:18.000,SM-10678,"UI тесты по заказу ""Добро КейДжи""",5,93,93,1800,разработчик мобильных приложений,разработчик мобильных приложений
1,819949,2019-10-01 05:59:45.000,SM-10679,"UI тесты раздела ""Профиль""",5,93,93,7200,разработчик мобильных приложений,разработчик мобильных приложений
2,819947,2019-10-01 06:00:38.000,SM-10680,"UI тесты раздела ""Личный счет""",5,93,93,14400,разработчик мобильных приложений,разработчик мобильных приложений
3,819943,2019-10-01 06:02:49.000,SM-10682,"UI тесты раздела ""Новости""",5,93,93,900,разработчик мобильных приложений,разработчик мобильных приложений
4,819941,2019-10-01 06:03:26.000,SM-10683,"UI тесты раздела ""Зоны скидок и доплат""",5,93,93,900,разработчик мобильных приложений,разработчик мобильных приложений
...,...,...,...,...,...,...,...,...,...,...
10654,702545,2020-09-24 11:21:57.000,ADBKRK-1392,Order to fix: audit result & save root cause,48,1,1,-1,missing,missing
10655,702528,2020-09-28 14:09:35.000,ADBKRK-1402,Lỗi EPC không tải trang được,48,1,1,-1,missing,missing
10656,702499,2020-10-06 13:51:25.000,ADBKRK-1422,Line red cannot calling out from system: Audit...,48,1,1,-1,missing,missing
10657,702376,2020-11-05 15:40:21.000,ADBKRK-1493,ipphone problem: push & audit result,48,1,1,-1,missing,missing


In [9]:
# Комментарии

comments = pd.concat([train_comments, test_comments])

# Поиск и создание признаков для русского и вьетнамского языка в описании задач и комментариях 

In [10]:
%%time
data['summary'] = data['summary'].apply(lambda x: x.lower().strip())
comments['text'] = comments['text'].apply(lambda x: x.lower().strip())

CPU times: total: 15.6 ms
Wall time: 17.1 ms


In [11]:
# RU
data['ru'] = data['summary'].apply(lambda x: bool(re.search('[а-яА-Я]', x))).astype(int)
comments['ru'] = comments['text'].apply(lambda x: bool(re.search('[а-яА-Я]', x))).astype(int)
print(f"Русский язык, задачи: {data['ru'].sum()}, комментарии: {comments['ru'].sum()}")

# VI
vi_letters = '[ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễếệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỷỹ]'
data['vi'] = data['summary'].apply(lambda x: bool(re.search(vi_letters, x))).astype(int)
comments['vi'] = comments['text'].apply(lambda x: bool(re.search(vi_letters, x))).astype(int)
print(f"Вьетнамский язык, задачи: {data['vi'].sum()}, комментарии: {comments['vi'].sum()}")

Русский язык, задачи: 1531, комментарии: 2646
Вьетнамский язык, задачи: 138, комментарии: 1272


In [12]:
# Примеры задачь на вьетнамском языке

comments[comments['vi'] == 1]

Unnamed: 0,comment_id,text,issue_id,author_id,ru,vi
323,116949,when user access emailmarketing.edumall.co.th ...,679380,1,0,1
337,116483,cái này anh [~namdd3] hỗ trợ bạn [~yann] cấp t...,679391,1,0,1
338,116485,"phần category thì bọn a làm xong rồi e, trang ...",679391,1,0,1
339,116486,[~namdd3] cám ơn anh.\r\n\r\nđể em review lại ...,679391,1,0,1
346,116178,"hi [~jakkaphan], i saw the attached video, i t...",679401,1,0,1
...,...,...,...,...,...,...
670,121277,"loading được rồi, em kiểm tra lại xem được ko,...",702528,1,0,1
672,121281,[~minhnh3@topica.edu.vn] em vào [http://epc.to...,702528,1,0,1
673,121292,[~minhnh3@topica.edu.vn]: close cái này nhá?,702528,1,0,1
674,121293,[~hungcm] vâng xong r nhé ạ,702528,1,0,1


# Перевод на английский
* С русского и вьетнамского

In [13]:
translator = Translator()

In [14]:
def fix_text(data: pd.Series) -> pd.Series:
    data = data.apply(lambda x: x.replace('комуникац', 'коммуникац')) 
    data = data.apply(lambda x: x.replace('(', ' ').replace(')', ' ')) 
    
    # Удаление лишних пробелов
    data = data.apply(lambda x: re.sub(r"\s+", " ", x)) 
    
    return data

data['summary'] = fix_text(data['summary'])
comments['text'] = fix_text(comments['text'])

In [15]:
def translate_text(data, feature):    
    # Перевод с русского на английский
    mask = data['ru'] == 1
    data.loc[mask, feature] = data.loc[mask, feature].apply(lambda x: translator.translate(x, src='ru', dest='en').text)
    
    # Перевод с вьетнамского на английский
    mask = data['vi'] == 1
    data.loc[mask, feature] = data.loc[mask, feature].apply(lambda x: translator.translate(x, src='vi', dest='en').text)
    
    return data

In [16]:
# Оригинальный текст summary

data.head(3)

Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs,position_x,position_y,ru,vi
0,819952,2019-10-01 05:57:18.000,SM-10678,"ui тесты по заказу ""добро кейджи""",5,93,93,1800,разработчик мобильных приложений,разработчик мобильных приложений,1,0
1,819949,2019-10-01 05:59:45.000,SM-10679,"ui тесты раздела ""профиль""",5,93,93,7200,разработчик мобильных приложений,разработчик мобильных приложений,1,0
2,819947,2019-10-01 06:00:38.000,SM-10680,"ui тесты раздела ""личный счет""",5,93,93,14400,разработчик мобильных приложений,разработчик мобильных приложений,1,0


In [17]:
%%time
data = translate_text(data, feature='summary')

CPU times: total: 8.77 s
Wall time: 9min 55s


In [18]:
%%time
comments = translate_text(comments, feature='text')

CPU times: total: 23.5 s
Wall time: 23min 8s


In [19]:
# Переведенный текст summary

data.head(3)

Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs,position_x,position_y,ru,vi
0,819952,2019-10-01 05:57:18.000,SM-10678,"ui tests ordered by ""dobro cages""",5,93,93,1800,разработчик мобильных приложений,разработчик мобильных приложений,1,0
1,819949,2019-10-01 05:59:45.000,SM-10679,"ui tests section ""profile""",5,93,93,7200,разработчик мобильных приложений,разработчик мобильных приложений,1,0
2,819947,2019-10-01 06:00:38.000,SM-10680,"ui tests of the ""personal account"" section",5,93,93,14400,разработчик мобильных приложений,разработчик мобильных приложений,1,0


# Создание эмбедингов с помощью Bert

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, 
                                  )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
def text_embeding(text, model):
    hidden_states = calculate_hidden_states(text, model)
    embedings = process_hidden_states(hidden_states)  
    
    return list(embedings.numpy())


def calculate_hidden_states(text, model):
    marked_text = "[CLS] " + text + " [SEP]"
    
    # Tokenize our sentence with the BERT tokenizer.
    tokenized_text = tokenizer.tokenize(marked_text)
    
    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    # Mark each tokens as belonging to sentence "1".    
    segments_ids = [1] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])    
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]        
        
    return hidden_states


def process_hidden_states(hidden_states):
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    
    return sentence_embedding

## Эмбединг описания задач

In [22]:
%%time
data_embedings = data['summary'].apply(lambda x: text_embeding(x, model))
data_embedings_df = pd.DataFrame(data_embedings.to_list())
data_embedings_df

CPU times: total: 43min 31s
Wall time: 5min 27s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.073938,-0.355422,0.103924,-0.063405,0.407973,-0.118627,-0.154836,0.207292,-0.046897,0.030168,...,-0.197142,-0.045443,0.214377,0.103848,0.273329,-0.063785,-0.100240,-0.207664,-0.224855,0.085569
1,0.120545,-0.521504,0.007783,-0.238645,0.348107,-0.282540,0.187783,0.434630,-0.097194,-0.145882,...,0.099572,0.149774,0.050808,-0.225226,0.056088,0.180238,-0.202323,-0.314437,-0.306621,-0.110278
2,-0.172043,-0.390544,0.028921,0.397299,0.904702,-0.658381,0.046352,0.252826,-0.097388,0.139259,...,0.187737,-0.131439,0.074518,-0.076777,-0.013071,0.321474,0.021959,-0.278360,-0.125246,0.137917
3,-0.271000,-0.573537,-0.023668,0.282383,0.429638,-0.444236,0.026686,0.491147,0.008693,-0.037338,...,0.099188,0.030792,0.115210,-0.031079,0.153760,0.260613,-0.174081,-0.028444,0.166309,-0.266101
4,-0.218957,-0.627530,0.367746,0.496310,1.117281,-0.247721,-0.612836,0.137161,-0.538337,0.172384,...,-0.006587,-0.255048,0.057476,0.089882,0.129729,-0.003735,-0.098668,-0.279206,-0.153117,-0.028102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10654,0.074469,-0.476831,0.158147,-0.141570,0.670702,-0.059137,0.407620,0.239959,-0.328748,-0.030946,...,0.404762,-0.225927,-0.669586,0.022252,0.050076,0.229833,-0.019424,-0.614235,-0.092262,0.238725
10655,-0.547409,-0.539908,0.187006,0.519988,0.309712,-0.128026,-0.018537,0.304902,-0.061045,0.165297,...,0.445225,-0.258338,-0.433738,-0.135312,0.497669,0.483062,-0.273962,-0.751106,-0.018447,0.121448
10656,-0.320597,-0.411899,0.518579,-0.026532,0.488122,-0.228063,0.737409,0.502583,-0.046508,0.161989,...,0.463657,-0.072728,-0.117153,0.179303,-0.134538,0.463469,-0.089737,-0.878708,-0.001422,0.282825
10657,0.147980,-0.608953,0.443672,0.073062,0.374617,-0.069110,0.182975,0.192601,-0.347163,0.174396,...,0.377399,-0.265475,-0.187053,-0.127718,0.047771,0.593430,-0.201662,-0.422017,-0.036343,-0.169581


In [23]:
data_embedings_df.index = data.index
data = pd.concat([data, data_embedings_df], axis=1)
data

Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs,position_x,position_y,...,758,759,760,761,762,763,764,765,766,767
0,819952,2019-10-01 05:57:18.000,SM-10678,"ui tests ordered by ""dobro cages""",5,93,93,1800,разработчик мобильных приложений,разработчик мобильных приложений,...,-0.197142,-0.045443,0.214377,0.103848,0.273329,-0.063785,-0.100240,-0.207664,-0.224855,0.085569
1,819949,2019-10-01 05:59:45.000,SM-10679,"ui tests section ""profile""",5,93,93,7200,разработчик мобильных приложений,разработчик мобильных приложений,...,0.099572,0.149774,0.050808,-0.225226,0.056088,0.180238,-0.202323,-0.314437,-0.306621,-0.110278
2,819947,2019-10-01 06:00:38.000,SM-10680,"ui tests of the ""personal account"" section",5,93,93,14400,разработчик мобильных приложений,разработчик мобильных приложений,...,0.187737,-0.131439,0.074518,-0.076777,-0.013071,0.321474,0.021959,-0.278360,-0.125246,0.137917
3,819943,2019-10-01 06:02:49.000,SM-10682,"ui tests of the ""news"" section",5,93,93,900,разработчик мобильных приложений,разработчик мобильных приложений,...,0.099188,0.030792,0.115210,-0.031079,0.153760,0.260613,-0.174081,-0.028444,0.166309,-0.266101
4,819941,2019-10-01 06:03:26.000,SM-10683,"ui tests of the section ""zones of discounts an...",5,93,93,900,разработчик мобильных приложений,разработчик мобильных приложений,...,-0.006587,-0.255048,0.057476,0.089882,0.129729,-0.003735,-0.098668,-0.279206,-0.153117,-0.028102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10654,702545,2020-09-24 11:21:57.000,ADBKRK-1392,order to fix: audit result & save root cause,48,1,1,-1,missing,missing,...,0.404762,-0.225927,-0.669586,0.022252,0.050076,0.229833,-0.019424,-0.614235,-0.092262,0.238725
10655,702528,2020-09-28 14:09:35.000,ADBKRK-1402,epc error failed to load the page,48,1,1,-1,missing,missing,...,0.445225,-0.258338,-0.433738,-0.135312,0.497669,0.483062,-0.273962,-0.751106,-0.018447,0.121448
10656,702499,2020-10-06 13:51:25.000,ADBKRK-1422,line red cannot calling out from system: audit...,48,1,1,-1,missing,missing,...,0.463657,-0.072728,-0.117153,0.179303,-0.134538,0.463469,-0.089737,-0.878708,-0.001422,0.282825
10657,702376,2020-11-05 15:40:21.000,ADBKRK-1493,ipphone problem: push & audit result,48,1,1,-1,missing,missing,...,0.377399,-0.265475,-0.187053,-0.127718,0.047771,0.593430,-0.201662,-0.422017,-0.036343,-0.169581


In [24]:
data.to_csv('data/data_embeded.csv')

## Эмбединг комментариев

In [25]:
%%time
comments_embedings = comments['text'].apply(lambda x: text_embeding(x[:500], model))
comments_embedings_df = pd.DataFrame(comments_embedings.to_list())
comments_embedings_df

CPU times: total: 3h 26min 26s
Wall time: 25min 50s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.396655,-0.282154,0.712104,-0.042653,0.529196,-0.274449,0.343901,0.648183,-0.432211,-0.265524,...,-0.538489,-0.141691,-0.276242,-0.391345,-0.163680,0.153540,-0.327705,-0.656016,0.120112,0.208754
1,0.036283,-0.127175,0.747807,-0.409866,0.031667,-0.225562,0.179292,0.761894,-0.326114,-0.175090,...,-0.398943,-0.104033,-0.164406,-0.509012,0.326641,0.561171,-0.129044,-0.584427,0.023221,0.070148
2,-0.153393,0.155878,0.519984,-0.180548,0.161860,-0.320932,0.453828,0.584608,-0.026886,-0.202321,...,-0.063549,0.005674,-0.078658,-0.320341,0.043778,0.417975,-0.173646,-0.650891,-0.041007,0.299308
3,-0.044670,0.114046,0.469342,-0.120671,0.505078,-0.273717,0.192082,0.449015,0.138089,-0.261299,...,-0.191973,-0.192396,-0.324207,-0.248907,-0.246940,0.343616,0.123952,-0.442819,0.156971,0.303488
4,0.191356,0.019998,0.310036,0.163185,0.005745,-0.287968,0.306304,0.390006,-0.222937,-0.232054,...,-0.317493,0.062973,-0.114507,-0.108379,-0.035568,0.059363,-0.145516,-0.593454,0.326139,0.044931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15261,-0.426910,-0.250322,0.718972,-0.271551,0.415670,-0.072001,0.234505,0.625516,-0.230891,-0.176025,...,-0.208457,-0.018836,-0.128387,-0.198288,0.224424,0.388852,-0.212744,-0.526419,-0.086822,0.318272
15262,-0.197907,-0.138590,0.629786,-0.370935,0.456970,-0.360944,0.075869,0.660784,-0.122566,-0.017496,...,-0.260309,-0.053860,-0.081117,-0.226760,0.321590,0.236594,-0.348806,-0.461338,-0.006393,0.255275
15263,-0.121795,0.086627,0.496322,-0.168169,0.471133,-0.262740,0.094094,0.370133,-0.066984,-0.069720,...,-0.155059,-0.224260,-0.001483,-0.229035,0.026603,0.255100,-0.109990,-0.370941,-0.029547,0.504808
15264,-0.309543,-0.189824,0.258524,0.105877,0.453699,-0.211555,-0.046718,0.425338,0.350240,0.308291,...,-0.210358,-0.151050,-0.062896,-0.167673,0.455893,0.207558,0.025101,-0.352384,0.059062,0.777574


In [26]:
comments_embedings_df.index = comments.index
comments = pd.concat([comments, comments_embedings_df], axis=1)
comments

Unnamed: 0,comment_id,text,issue_id,author_id,ru,vi,0,1,2,3,...,758,759,760,761,762,763,764,765,766,767
0,11779,[https://www.youtube.com/watch?v=tuhodtsvony|h...,669666,1,0,0,0.396655,-0.282154,0.712104,-0.042653,...,-0.538489,-0.141691,-0.276242,-0.391345,-0.163680,0.153540,-0.327705,-0.656016,0.120112,0.208754
1,10601,ok [~accountid:557058:3f7ab89a-8969-4547-90df-...,669670,1,0,0,0.036283,-0.127175,0.747807,-0.409866,...,-0.398943,-0.104033,-0.164406,-0.509012,0.326641,0.561171,-0.129044,-0.584427,0.023221,0.070148
2,76101,i encountered a problem with access to `/users...,670930,2,0,0,-0.153393,0.155878,0.519984,-0.180548,...,-0.063549,0.005674,-0.078658,-0.320341,0.043778,0.417975,-0.173646,-0.650891,-0.041007,0.299308
3,76102,i have learned that `users/:id/emails` endpoin...,670930,2,0,0,-0.044670,0.114046,0.469342,-0.120671,...,-0.191973,-0.192396,-0.324207,-0.248907,-0.246940,0.343616,0.123952,-0.442819,0.156971,0.303488
4,76213,we have decided with andrew to set member's em...,670930,2,0,0,0.191356,0.019998,0.310036,0.163185,...,-0.317493,0.062973,-0.114507,-0.108379,-0.035568,0.059363,-0.145516,-0.593454,0.326139,0.044931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1343,20577,[~accountid:6040a1fd19772401136823d9] made the...,2175459,191,1,0,-0.426910,-0.250322,0.718972,-0.271551,...,-0.208457,-0.018836,-0.128387,-0.198288,0.224424,0.388852,-0.212744,-0.526419,-0.086822,0.318272
1344,20744,deployed web-api and ome. there is a camera. b...,2176673,191,1,0,-0.197907,-0.138590,0.629786,-0.370935,...,-0.260309,-0.053860,-0.081117,-0.226760,0.321590,0.236594,-0.348806,-0.461338,-0.006393,0.255275
1345,20833,asked for help in an issue on github - [https:...,2176673,191,1,0,-0.121795,0.086627,0.496322,-0.168169,...,-0.155059,-0.224260,-0.001483,-0.229035,0.026603,0.255100,-0.109990,-0.370941,-0.029547,0.504808
1346,20874,The solution was to add port 3333 and replace ...,2176673,191,1,0,-0.309543,-0.189824,0.258524,0.105877,...,-0.210358,-0.151050,-0.062896,-0.167673,0.455893,0.207558,0.025101,-0.352384,0.059062,0.777574


In [27]:
# Агрегация комментариев по id задачи

features = [x for x in range(768)]
comments_agregated = comments.groupby('issue_id')[features].mean()
comments_agregated = comments_agregated.add_prefix('comments_')
comments_agregated

Unnamed: 0_level_0,comments_0,comments_1,comments_2,comments_3,comments_4,comments_5,comments_6,comments_7,comments_8,comments_9,...,comments_758,comments_759,comments_760,comments_761,comments_762,comments_763,comments_764,comments_765,comments_766,comments_767
issue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
669666,0.396655,-0.282154,0.712104,-0.042653,0.529196,-0.274449,0.343901,0.648183,-0.432211,-0.265524,...,-0.538489,-0.141691,-0.276242,-0.391345,-0.163680,0.153540,-0.327705,-0.656016,0.120112,0.208754
669670,0.036283,-0.127175,0.747807,-0.409866,0.031667,-0.225562,0.179292,0.761894,-0.326114,-0.175090,...,-0.398943,-0.104033,-0.164406,-0.509012,0.326641,0.561171,-0.129044,-0.584427,0.023221,0.070148
670929,0.185812,0.032089,-0.248100,0.027522,0.051841,0.190816,-0.521051,0.161664,0.012779,-0.458040,...,-0.094491,0.412604,-0.275388,-0.413010,-0.181765,0.032218,0.247779,-0.696906,0.040477,0.383392
670930,-0.002236,0.096640,0.433121,-0.046012,0.224228,-0.294206,0.317405,0.474543,-0.037245,-0.231891,...,-0.191005,-0.041250,-0.172458,-0.225876,-0.079576,0.273651,-0.065070,-0.562388,0.147367,0.215909
670934,0.128188,-0.202561,0.126652,-0.060927,0.079241,-0.281715,0.104839,0.508973,-0.050612,-0.101930,...,0.266654,-0.040954,-0.338479,-0.320714,0.069757,0.533549,0.008523,-0.490271,0.041929,0.406949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170252,-0.208578,-0.088907,0.542778,-0.240785,0.557583,-0.332332,0.295462,0.507588,-0.123970,-0.143424,...,-0.024134,-0.156992,-0.271184,-0.325431,0.153662,0.186081,-0.180502,-0.539517,-0.222620,0.343583
2171729,0.159291,-0.075901,0.300606,-0.075243,0.122563,-0.354529,0.034360,0.317944,-0.006488,-0.080193,...,-0.033922,-0.212943,-0.003118,-0.488605,-0.008323,0.188660,-0.270492,-0.339637,0.081137,0.114951
2175459,-0.225956,-0.241085,0.534479,-0.114024,0.286082,-0.079408,-0.018004,0.465963,-0.015045,-0.199724,...,-0.341780,-0.079619,-0.160311,-0.109652,0.138434,0.432819,-0.196008,-0.364389,0.033811,0.301920
2176348,-0.245295,-0.357312,0.242625,0.562741,0.032323,0.011807,0.126989,0.060826,-0.043198,0.387437,...,-0.076964,-0.700861,0.381071,0.032896,-0.106469,-0.061756,0.143541,-0.572775,0.077878,0.433672


In [28]:
comments_agregated.to_csv('data/comments_embeded.csv')