In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, counting only unmasked positions.

    Args:
        model_output: Indexable model output whose first element holds the token
            embeddings (presumably shaped (batch, seq_len, hidden) - confirm with caller).
        attention_mask: Tensor with one entry per token; it is broadcast over the
            last dimension of the embeddings and used as a 0/1 weight.

    Returns:
        Tensor of weighted sums over dimension 1 divided by the per-sequence mask
        total. The divisor is clamped to at least 1e-9, so a fully masked sequence
        yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask to the embedding shape so it can be multiplied element-wise.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(1)
    token_counts = weights.sum(1).clamp(min=1e-9)
    return masked_total / token_counts

In [None]:
from typing import List
import random
import glob
from nltk import tokenize, download
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the tokenizer and encoder for the "ai-forever/sbert_large_mt_nlu_ru" checkpoint
# from the Hugging Face model repository.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
# Tokenize the sentences as one padded batch, truncating each to at most 24 tokens.
# NOTE(review): `sentences` is not assigned by any cell above this one; the only visible
# assignments are in the seed-loop cell further down, so this cell raises NameError on a
# fresh kernel unless that cell has been run first.
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')
# Compute token embeddings with gradient tracking disabled (inference only).
with torch.no_grad():
    model_output = model(**encoded_input)
# Collapse token embeddings into one vector per sentence via attention-mask-aware mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
def find_authorwise_acc(predictions,y_test,authors):
    """Print, per author, the share of all misclassifications that belong to that author.

    Despite the name, the printed values are error shares rather than accuracies:
    for each author, (misclassified samples whose true label is that author) divided
    by (total misclassified samples). The shares sum to 1 whenever there is at least
    one error.

    Args:
        predictions: Sequence of predicted labels.
        y_test: Sequence of true labels, aligned position-by-position with `predictions`.
        authors: Iterable of every label that can occur in `y_test`.

    Raises:
        KeyError: If a misclassified sample's true label is missing from `authors`.

    Returns:
        None. The resulting ``{author: share}`` dict is printed.
    """
    error_counts = {author: 0 for author in authors}
    total_errors = 0
    # Pair items positionally instead of indexing, so label-indexed containers
    # (e.g. a pandas Series) work as well as plain lists.
    for predicted, actual in zip(predictions, y_test):
        if predicted != actual:
            total_errors += 1
            error_counts[actual] += 1

    if total_errors == 0:
        # Perfect predictions: report zero shares rather than dividing by zero.
        authorwise_acc = {author: 0.0 for author in error_counts}
    else:
        authorwise_acc = {author: count / total_errors
                          for author, count in error_counts.items()}

    print(authorwise_acc)

In [None]:
from sklearn.metrics import accuracy_score

0.8848484848484849
{'Gorbachevskiy': 0.5789473684210527, 'Borisov': 0.42105263157894735}


In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

def plot_confusion_matrix(cm, classes: List[str],
                          normalize: bool = False,
                          title: str = 'Матрица ошибок',
                          cmap = plt.cm.Greens):
    """Draw a confusion matrix as an annotated seaborn heatmap on the current figure.

    Args:
        cm: Square confusion matrix (array-like) with one row and one column per
            entry of `classes`. The y axis is labelled as the correct answer and
            the x axis as the model's answer.
        classes: Class names used as both row and column labels.
        normalize: When True, each row is divided by its row sum so cells show
            per-row fractions instead of raw counts. Rows that sum to zero are
            left as zeros.
        title: Figure title.
        cmap: Matplotlib colormap passed to the heatmap.
    """
    if normalize:
        counts = np.asarray(cm, dtype=float)
        row_totals = counts.sum(axis=1, keepdims=True)
        # `where` skips rows without samples so they stay 0 instead of becoming NaN.
        cm = np.divide(counts, row_totals,
                       out=np.zeros_like(counts), where=row_totals != 0)

    df_cm = pd.DataFrame(cm, index=classes, columns=classes)
    # Note: sns.set changes seaborn's global font scale, not just this figure's.
    sns.set(font_scale=1.2)
    sns.heatmap(df_cm, annot=True, cmap=cmap)
    plt.title(title, fontsize=14)
    plt.ylabel('Правильный ответ', fontsize=14)
    plt.xlabel('Ответ модели', fontsize=12)

In [None]:
#Loop

In [None]:
def split_text(filepath: str, min_char: int = 5) -> List[str]:
    """Read a UTF-8 text file and return its sentences of at least `min_char` characters.

    Newlines are replaced with spaces before the text is handed to NLTK's
    `sent_tokenize`, so line wrapping in the file does not affect sentence
    boundaries.

    Args:
        filepath: Path of the text file to read.
        min_char: Minimum sentence length, in characters, for a sentence to be kept.

    Returns:
        The retained sentences, in document order.
    """
    with open(filepath, 'r', encoding='utf8') as handle:
        flat_text = handle.read().replace('\n', ' ')

    return [candidate
            for candidate in tokenize.sent_tokenize(flat_text)
            if len(candidate) >= min_char]

In [None]:
from typing import List
import random

import glob
from nltk import tokenize, download
import numpy as np
import pandas as pd

# Fetch the NLTK 'punkt' sentence-tokenizer data that tokenize.sent_tokenize relies on.
# NOTE(review): newer NLTK releases look up the 'punkt_tab' resource instead - confirm
# which one the installed version needs.
download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import warnings
# Silences every warning for the rest of the session, including scikit-learn's
# convergence and fit-failure warnings, so problems in the grid search will not be
# reported through warnings.
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Seed sweep: for each seed, draw a class-balanced sample of Borisov and
# Gorbachevskiy sentences, fit a logistic regression on sentence embeddings,
# and report how the classifier attributes the sentences of the unknown text.
# ---------------------------------------------------------------------------

# The encoder does not depend on the seed, so it is loaded once rather than per run.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")


def embed_sentences(texts):
    """Return one mean-pooled embedding per text as a NumPy array.

    The texts are tokenized as a single padded batch (truncated to 24 tokens),
    encoded in one forward pass with gradients disabled, and pooled with
    `mean_pooling` using the attention mask.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=24, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded)
    return mean_pooling(output, encoded['attention_mask']).numpy()


# --- Known-author corpora (identical for every seed) ------------------------
borisov = [sentence
           for path in glob.glob('borisov_full.txt')
           for sentence in split_text(path)]
gorbachevskiy = [sentence
                 for path in glob.glob('gorbachevskiy_full.txt')
                 for sentence in split_text(path)]
if not borisov or not gorbachevskiy:
    # glob returns nothing for a missing file; fail here with a clear message
    # instead of an opaque unpacking error further down.
    raise ValueError("No sentences loaded from 'borisov_full.txt' and/or 'gorbachevskiy_full.txt'")

text_dict = { 'Borisov': borisov, 'Gorbachevskiy': gorbachevskiy}
# Size of the smaller corpus: both authors are subsampled to this many sentences.
max_len = min([len(borisov), len(gorbachevskiy)])
names = [borisov, gorbachevskiy]

# --- Unknown text (identical for every seed) --------------------------------
# These names are what the cell leaves bound to the unknown-text data when it
# finishes, so they are kept for any later cell that reads them.
traindf = pd.read_csv('unknown_august.csv')
authorsdupl = traindf['author'].to_list()
authors = set(authorsdupl)
text = traindf['text'].to_list()

# All text per label, concatenated without a separator.
dictionaryauthors = {author: "" for author in authors}
for author, fragment in zip(authorsdupl, text):
    dictionaryauthors[author] += fragment

sentences = traindf['text'].to_list()
sentence_embeddings = embed_sentences(sentences)

# With a fixed random_state this split is deterministic: it shuffles the unknown
# sentences and keeps 99% of them (X_test / y_test) for prediction.
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, authorsdupl, test_size=0.99, random_state=1337)

# --- Sweep -------------------------------------------------------------------
sumsum = 0  # running sum of the per-run Gorbachevskiy share

for run_no, rndsd in enumerate(range(81, 100), start=1):
    np.random.seed(rndsd)
    random.seed(rndsd)

    # Balanced sample: max_len sentences per author, drawn without replacement.
    combined = []
    for corpus in names:
        combined += list(np.random.choice(corpus, max_len, replace = False))
    labels = ['Borisov'] * max_len + ['Gorbachevskiy'] * max_len
    zipped = list(zip(combined, labels))
    random.shuffle(zipped)
    combined, labels = zip(*zipped)

    out_data = pd.DataFrame()
    out_data['text'] = combined
    out_data['author'] = labels
    # The sample is still written out as an artifact; it is no longer read back,
    # since the in-memory values are the same data.
    out_data.to_csv('full_data_train_september_cardinal_3.csv', index=False)

    # np.random.choice yields numpy strings; hand plain str objects to the tokenizer.
    train_embeddings = embed_sentences([str(sentence) for sentence in combined])
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        train_embeddings, list(labels), test_size=0.2, random_state=1337)

    # The default 'lbfgs' solver rejects the L1 penalty, so the L1 candidates are
    # paired with 'liblinear'; the L2 candidates keep the default solver.
    param_grid = [
        {"solver": ["lbfgs"], "penalty": ["l2"], "C": np.logspace(-3, 3, 7)},
        {"solver": ["liblinear"], "penalty": ["l1"], "C": np.logspace(-3, 3, 7)},
    ]

    base_estimator = LogisticRegression()
    sh = GridSearchCV(base_estimator, param_grid, cv=10).fit(X_fit, y_fit)

    bs = sh.best_score_
    best_est = sh.best_estimator_

    predictions = best_est.predict(X_test)
    print(list(predictions[0:10]))
    print(y_test[:10])

    share_borisov = sum(predictions == 'Borisov')/len(X_test)
    share_gorbachevskiy = sum(predictions == 'Gorbachevskiy')/len(X_test)
    sumsum += share_gorbachevskiy

    # Avg is the mean over the runs completed so far (run_no), not over the seed value.
    print(f"i = {rndsd} lenB = {len(borisov)} lenG = {len(gorbachevskiy)} score = {bs}  Borisov: {share_borisov} Gorbachevskiy: {share_gorbachevskiy} Avg = {sumsum/run_no}")


tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

['Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Borisov', 'Gorbachevskiy', 'Gorbachevskiy', 'Borisov', 'Gorbachevskiy', 'Borisov']
['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']
i = 81 lenB = 411 lenG = 2620 score = 0.8842890442890443  Borisov: 0.4598459845984598 Gorbachevskiy: 0.5401540154015402 Avg = 0.006587244090262685
['Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Borisov', 'Gorbachevskiy', 'Gorbachevskiy']
['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']
i = 82 lenB = 411 lenG = 2620 score = 0.8842890442890441  Borisov: 0.341034103410341 Gorbachevskiy: 0.658965896589659 Avg = 0.014447227855315652
['Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Gorbachevskiy', 'Borisov', 'Gorbachevskiy', 'Borisov']
['Unknown