# **Текст**

In [1]:
!pip install -q transformers sentencepiece sentence-transformers catboost

[0m

In [2]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, default_data_collator, DebertaV2Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
from tqdm.notebook import tqdm
import torch 
from sentence_transformers import SentenceTransformer

In [3]:
# Read the five CSV inputs under ../input/collector, in the same order as before:
# train comments, train issues, the sample solution, then test comments and test issues.
(
    df_comment_train,
    df_issues_train,
    sample_solution,
    df_comment_test,
    df_issues_test,
) = map(
    pd.read_csv,
    (
        "../input/collector/train_comments.csv",
        "../input/collector/train_issues.csv",
        "../input/collector/sample_solution.csv",
        "../input/collector/test_comments.csv",
        "../input/collector/test_issues.csv",
    ),
)

In [4]:
def collect_comment_text(comments):
    """Concatenate the comment texts of every issue into one string per issue.

    Parameters
    ----------
    comments : pandas.DataFrame
        Needs the columns ``issue_id`` and ``text``. Texts are joined in the
        row order of this frame, so sort it beforehand if order matters.

    Returns
    -------
    pandas.DataFrame
        One row per issue that has at least one non-missing comment, with the
        columns ``id`` (the former ``issue_id``) and
        ``total_sum_comments_by_issues`` (the joined text).
    """
    return (
        comments.dropna(subset=['text'])
        .groupby('issue_id')['text']
        # Join with a space: summing the strings glued the last word of one
        # comment onto the first word of the next.
        .agg(lambda texts: ' '.join(map(str, texts)))
        .rename('total_sum_comments_by_issues')
        .rename_axis('id')
        .reset_index()
    )


def add_comment_text(issues, comment_text):
    """Left-join the per-issue comment text onto ``issues`` and build the model input.

    Parameters
    ----------
    issues : pandas.DataFrame
        Needs the columns ``id`` and ``summary``.
    comment_text : pandas.DataFrame
        Output of ``collect_comment_text``.

    Returns
    -------
    pandas.DataFrame
        ``issues`` plus ``total_sum_comments_by_issues`` and
        ``summary_total_sum_comm`` (summary, a space, then the comment text).
    """
    merged = issues.merge(comment_text, on='id', how='left')
    # Issues without comments come out of the left merge as NaN; fill with an
    # empty string *after* the merge so the text does not end in a literal "nan".
    merged['total_sum_comments_by_issues'] = merged['total_sum_comments_by_issues'].fillna('')
    # Same for a missing summary, which astype(str) would turn into "nan".
    summary = merged['summary'].fillna('').astype(str)
    merged['summary_total_sum_comm'] = summary + ' ' + merged['total_sum_comments_by_issues'].astype(str)
    return merged


# Train and test go through the same helpers so both texts are built identically.
df_comment_train = df_comment_train.sort_values(by=['comment_id'])
comment_text = collect_comment_text(df_comment_train)
train = add_comment_text(df_issues_train, comment_text)

df_comment_test = df_comment_test.sort_values(by=['comment_id'])
comment_text = collect_comment_text(df_comment_test)
test = add_comment_text(df_issues_test, comment_text)

In [6]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dimension 1, weighting each by its mask value.

    Parameters
    ----------
    model_output : sequence
        Its first element is taken as the token embeddings
        (presumably shaped (batch, seq_len, hidden) -- confirm against the caller).
        They are detached and moved to the CPU before pooling.
    attention_mask : torch.Tensor
        Mask with one entry per token; it is expanded over the last embedding
        dimension and must live on the CPU like the detached embeddings.

    Returns
    -------
    torch.Tensor
        The mask-weighted sum of the embeddings divided by the mask total.
    """
    tokens = model_output[0].detach().cpu()
    weights = attention_mask.unsqueeze(-1).expand(tokens.size()).float()
    weighted_total = (tokens * weights).sum(dim=1)
    # Clamp so a fully masked row divides by 1e-9 rather than by zero.
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total


def make_features_transformers(df, model_name, df_model, col, max_len, batch_size=16):
    """Embed every row of ``df[col]`` with a Hugging Face encoder and mean-pool the tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    model_name : str
        Checkpoint name passed to ``AutoTokenizer`` / ``AutoModel``.
    df_model : str
        Short model label used as the prefix of the feature column names.
    col : str
        Name of the text column in ``df``.
    max_len : int
        Texts are truncated to this many tokens.
    batch_size : int, default 16
        Number of texts encoded per forward pass.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (default RangeIndex) with the columns
        ``{df_model}_{col}_feature_{i}``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    # Fall back to the CPU instead of failing outright when no GPU is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    sentences = df[col].astype(str).tolist()
    text_features = []
    with tqdm(total=len(sentences)) as progress:
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            # Pad only to the longest text in the batch; the attention mask
            # passed below keeps the padding out of the computation.
            encoded_input = tokenizer(batch, padding=True, truncation=True,
                                      max_length=max_len, return_tensors='pt')
            with torch.no_grad():
                # Without attention_mask the model treats padding as real
                # tokens, which leaks padding into every token embedding.
                model_output = model(
                    input_ids=encoded_input['input_ids'].to(device),
                    attention_mask=encoded_input['attention_mask'].to(device),
                )
            # mean_pooling moves the embeddings to the CPU itself, so it gets
            # the CPU copy of the mask.
            pooled = mean_pooling(model_output, encoded_input['attention_mask'])
            text_features.extend(pooled.numpy())
            progress.update(len(batch))

    # With no rows there is no embedding to measure, so take the width from the config.
    n_dims = len(text_features[0]) if text_features else model.config.hidden_size
    columns = [f'{df_model}_{col}_feature_{i}' for i in range(n_dims)]
    return pd.DataFrame(text_features, columns=columns)

In [8]:
# Encoder checkpoints to embed the train text with, each paired with its token limit.
models = [
    ('cointegrated/LaBSE-en-ru', 512),
    ('microsoft/mdeberta-v3-base', 512),
    ('vicgalle/xlm-roberta-large-xnli-anli', 512),
    ('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
]
# NOTE: `train` is extended in place, so re-running this cell without rebuilding
# `train` makes join() fail on the already-present feature columns.
for m in models:
    checkpoint, token_limit = m
    print(m)
    # The part after the "/" becomes the column-name prefix.
    short_name = checkpoint.split('/')[1]
    embedded = make_features_transformers(
        train, checkpoint, short_name, 'summary_total_sum_comm', token_limit
    )
    train = train.join(embedded)
    # Written inside the loop, so the file always holds the models finished so far.
    train.to_csv('transformers_text_features.csv', index=False)

('cointegrated/LaBSE-en-ru', 512)


Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/9589 [00:00<?, ?it/s]

('microsoft/mdeberta-v3-base', 512)


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/534M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/9589 [00:00<?, ?it/s]

('vicgalle/xlm-roberta-large-xnli-anli', 512)


Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some weights of the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli were not used when initializing XLMRobertaModel: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predict

  0%|          | 0/9589 [00:00<?, ?it/s]

('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512)


Downloading:   0%|          | 0.00/463 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

Some weights of the model checkpoint at MoritzLaurer/mDeBERTa-v3-base-mnli-xnli were not used when initializing DebertaV2Model: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/9589 [00:00<?, ?it/s]

In [11]:
# Encoder checkpoints to embed the test text with, each paired with its token limit.
models = [
    ('cointegrated/LaBSE-en-ru', 512),
    ('microsoft/mdeberta-v3-base', 512),
    ('vicgalle/xlm-roberta-large-xnli-anli', 512),
    ('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
]
# NOTE: `test` is extended in place, so re-running this cell without rebuilding
# `test` makes join() fail on the already-present feature columns.
for m in models:
    checkpoint, token_limit = m
    print(m)
    # The part after the "/" becomes the column-name prefix.
    short_name = checkpoint.split('/')[1]
    embedded = make_features_transformers(
        test, checkpoint, short_name, 'summary_total_sum_comm', token_limit
    )
    test = test.join(embedded)
    # Written inside the loop, so the file always holds the models finished so far.
    test.to_csv('test_transformers_text_features.csv', index=False)

('cointegrated/LaBSE-en-ru', 512)


Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1070 [00:00<?, ?it/s]

('microsoft/mdeberta-v3-base', 512)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT 

  0%|          | 0/1070 [00:00<?, ?it/s]

('vicgalle/xlm-roberta-large-xnli-anli', 512)


Some weights of the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli were not used when initializing XLMRobertaModel: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predict

  0%|          | 0/1070 [00:00<?, ?it/s]

('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512)


Some weights of the model checkpoint at MoritzLaurer/mDeBERTa-v3-base-mnli-xnli were not used when initializing DebertaV2Model: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1070 [00:00<?, ?it/s]