# NeuroToken: Capturing Author Style with LSTM

Source code for COM3025 - Vasily Shcherbinin

## Preparation for Training 
### Importing all necessary libraries, defining paths to files

In [13]:
import re
import operator
import random
import random
import json
import sys
from collections import defaultdict

import nltk
import pronouncing
from pyphonetics import Soundex
from transliterate import translit
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Bidirectional
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np

SONNETS = "./sonnets.txt"
SONNET_CLEAN = "./sonnets_clean.txt"
TRUMP = "./speeches.txt"
TRUMP4 = "./trump4.txt"

WORD_RE = re.compile(r'[a-zA-Z]+')

Using TensorFlow backend.


### Initialising Soundex library

In [14]:
soundex = Soundex()

### Defining helper methods to simplify loading and saving data

In [15]:
def load(*args):
    for varname in args:
        with open("{}.txt".format(varname), "r") as f:
            yield json.loads(f.read())
            
def save(**kwargs):
    for varname, data in kwargs.items():
        with open("{}.txt".format(varname), "w") as f:
            f.write(json.dumps(data))

## Examples of using the Pronouncing library to tokenise words
### Tokenisation by part of speech, location of stressed syllable and Soundex value of a word

In [16]:
def partOfSpeech_stress_soundex(word):
    if len(word) > 1:
        sdx = soundex.phonetics(word)
    else:
        sdx = word

    pronunciation_list = pronouncing.phones_for_word(word)
    if (len(pronunciation_list) == 0):
        syllable_count = 3
        stress = 2
    else:
        pronouncing.stresses(pronunciation_list[0])

        syllable_count = pronouncing.syllable_count(pronunciation_list[0])
        print("Number of syllables: " + str(syllable_count))

        if (syllable_count == 1):
            stress_syllable = 1
        else:
            stress_syllable = re.search("1", pronouncing.stresses(pronunciation_list[0])).start() + 1
        print("Stressed syllable:" + str(stress_syllable))

        rel_stress = stress_syllable / syllable_count
        print("Relative stress: " + str(rel_stress))

        if rel_stress <= 0.5:
            stress = 0
        elif rel_stress <= 0.8:
            stress = 1
        else:
            stress = 2

    text = nltk.word_tokenize(word)

    _, partOfSpeech = nltk.pos_tag(text)[0]

    return "{}{}{}".format(partOfSpeech, stress, sdx)


print(partOfSpeech_stress_soundex("why"))

Number of syllables: 1
Stressed syllable:1
Relative stress: 1.0
WRB2W000


### Tokenisation by part of speech and Soundex value

In [17]:
def partOfSpeech_soundex(word):
    try:
        if len(word) > 1:
            sdx = soundex.phonetics(word)
        else:
            sdx = word
    except:
        return ""

    text = nltk.word_tokenize(word)

    _, partOfSpeech = nltk.pos_tag(text)[0]

    return "{}{}".format(partOfSpeech, sdx)


print(partOfSpeech_soundex("Writing"))

VBGW635


### Tokenisation by syllable count and Soundex value

In [18]:
def syllable_soundex(word):
    try:
        if len(word) > 1:
            sdx = soundex.phonetics(word)
        else:
            sdx = word
    except:
        return ""

    syllable_count = 0
    pronunciation_list = pronouncing.phones_for_word(word)
    if (len(pronunciation_list) == 0):
        print("No English Word Found")
    else:
        syllable_count = pronouncing.syllable_count(pronunciation_list[0])

    return "{}{}".format(syllable_count, sdx)


print(syllable_soundex("the"))

1T000


### Tokenisation by part of speech, syllable count and Soundex value

In [19]:
def partOfSpeech_syllable_soundex(word):
    try:
        if len(word) > 1:
            sdx = soundex.phonetics(word)
        else:
            sdx = word
    except:
        return ""

    syllable_count = 0
    pronunciation_list = pronouncing.phones_for_word(word)
    if (len(pronunciation_list) == 0):
        print("No English Word Found")
    else:
        syllable_count = pronouncing.syllable_count(pronunciation_list[0])

    text = nltk.word_tokenize(word)

    _, partOfSpeech = nltk.pos_tag(text)[0]

    return "{}{}{}".format(partOfSpeech, syllable_count, sdx)

print(partOfSpeech_syllable_soundex("walls"))

NNS1W420


### Tokenisation by location of stressed syllable and Soundex value

In [20]:
def stress_soundex(word):
    if word == "/":
        return "/"

    if len(word) > 1:
        sdx = soundex.phonetics(word)
    else:
        sdx = word

    pronunciation_list = pronouncing.phones_for_word(word)
    if (len(pronunciation_list) == 0):
        print("No English Word Found")
    else:
        pronouncing.stresses(pronunciation_list[0])

        syllable_count = pronouncing.syllable_count(pronunciation_list[0])
        print("Number of syllables: " + str(syllable_count))

        if (syllable_count == 1):
            stress_syllable = 1
        else:
            stress_syllable = re.search("1", pronouncing.stresses(pronunciation_list[0])).start() + 1
            
        print("Accented syllable: " + str(stress_syllable))

    rel_stress = stress_syllable / syllable_count
    print("Relative accent: " + str(rel_stress))

    if rel_stress <= 0.5:
        stress = 0
    elif rel_stress <= 0.8:
        stress = 1
    else:
        stress = 2

    return "{}{}".format(stress, sdx)


print(stress_soundex("amazing"))

Number of syllables: 3
Accented syllable: 2
Relative accent: 0.6666666666666666
1A525


### Using Pronouncing library to find rhyming words - potentially useful when generating poetry

In [21]:
def is_rhyme(word, rhymed_word):
    if (rhymed_word in pronouncing.rhymes(word)):
        return True
    else:
        return False

print(is_rhyme("fun", "sun"))
print(is_rhyme("fall", "wall"))
print(is_rhyme("billy", "silly"))
print(is_rhyme("orange", "apple"))

True
True
True
False


## Define the Levenshtein Distance to find similar words
### Levenshtein Distance defines the minimal number of letter substitutions required in order to convert one word/sentence into another

In [22]:
def levenshtein_distance(word1, word2):
    if len(word1) < len(word2):
        return levenshtein_distance(word2, word1)

    if len(word2) == 0:
        return len(word1)

    previous_row = list(range(len(word2) + 1))

    for i, char1 in enumerate(word1):
        current_row = [i + 1]

        for j, char2 in enumerate(word2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (char1 != char2)

            current_row.append(min(insertions, deletions, substitutions))

        previous_row = current_row
    return previous_row[-1]

print(levenshtein_distance("hello", "hello"))
print(levenshtein_distance("hello", "hillo"))
print(levenshtein_distance("hello", "hubba"))

0
1
4


## Clean the files to be used - remove whitespace, redundant lines, punctuation and words unrecognised by Pronouncing library (i.e. non-dictionary words)

In [23]:
filename = SONNETS
def remove_empty_lines(filename):
    """Overwrite the file, removing empty lines and lines that contain only whitespace."""
    with open(filename) as in_file, open(filename, 'r+') as out_file:
        out_file.writelines(line for line in in_file if line.strip())
        out_file.truncate()
        
remove_empty_lines(SONNETS)

with open(SONNETS, encoding="utf8") as f:
    g = open("./sonnets_clean.txt","w+")
    for line in f:
        cleared_line = " ".join(WORD_RE.findall(line))
        wordList = re.sub("[^\w]", " ",  cleared_line).split()
        for f in wordList:
            pronunciation_list = pronouncing.phones_for_word(f)
            if(len(pronunciation_list) != 0):
                g.write(f.lower()+" ")

In [24]:
with open(TRUMP, encoding="utf8") as f:
    g = open(TRUMP4,"w+")
    for line in f:
        cleared_line = " ".join(WORD_RE.findall(line))
        wordList = re.sub("[^\w]", " ",  cleared_line).split()
        for f in wordList:
            pronunciation_list = pronouncing.phones_for_word(f)
            if(len(pronunciation_list) != 0):
                g.write(f.lower()+" ")

## Generate Corpuses
### For the Text whose style we would like to learn, generate several different style corpuses - any of those can then be used for training the LSTM

In [26]:
style_corpus = []
with open(TRUMP4, encoding="utf8") as f:
    for line in f:
        cleaned_line = " ".join(WORD_RE.findall(line))
        if cleaned_line:
            style_corpus += [w for w in cleaned_line.lower().split(" ") if w and not w.isnumeric()]
#             style_corpus.append("/")

print(style_corpus[:100])

style_corpus_soundex = {}
style_corpus_soundex_list = []
for w in style_corpus:
    pronunciation_list = pronouncing.phones_for_word(w)
    if (len(pronunciation_list) == 0):
        w = ""
    else:
        sdx = partOfSpeech_soundex(w)
        style_corpus_soundex_list.append(sdx)
        style_corpus_soundex[sdx] = w

print(style_corpus_soundex_list[:100])

style_corpus_syllables = {}
style_corpus_syllables_list = []
for w in style_corpus:
    sdx = syllable_soundex(w)
    style_corpus_syllables_list.append(sdx)
    style_corpus_syllables[sdx] = w

print(style_corpus_syllables_list[:100])

style_corpus_partOfSpeech_syllables = {}
style_corpus_partOfSpeech_syllables_list = []
for w in style_corpus:
    sdx = partOfSpeech_syllable_soundex(w)
    style_corpus_partOfSpeech_syllables_list.append(sdx)
    style_corpus_partOfSpeech_syllables[sdx] = w

print(style_corpus_partOfSpeech_syllables_list[:100])

save(
    style_corpus=style_corpus,
    style_corpus_soundex=style_corpus_soundex,
    style_corpus_soundex_list=style_corpus_soundex_list,
    style_corpus_syllables=style_corpus_syllables,
    style_corpus_syllables_list=style_corpus_syllables_list,
    style_corpus_partOfSpeech_syllables=style_corpus_partOfSpeech_syllables,
    style_corpus_partOfSpeech_syllables_list=style_corpus_partOfSpeech_syllables_list
)

['speech', 'thank', 'you', 'so', 'much', 'that', 's', 'so', 'nice', 't', 'he', 'a', 'great', 'guy', 'he', 't', 'get', 'a', 'fair', 'press', 'he', 't', 'get', 'it', 'it', 's', 'just', 'not', 'fair', 'and', 'i', 'have', 'to', 'tell', 'you', 'i', 'm', 'here', 'and', 'very', 'strongly', 'here', 'because', 'i', 'have', 'great', 'respect', 'for', 'steve', 'king', 'and', 'have', 'great', 'respect', 'likewise', 'for', 'citizens', 'united', 'david', 'and', 'everybody', 'and', 'tremendous', 'for', 'the', 'tea', 'party', 'also', 'also', 'the', 'people', 'of', 'iowa', 'they', 'have', 'something', 'in', 'common', 'hard', 'working', 'people', 'they', 'want', 'to', 'work', 'they', 'want', 'to', 'make', 'the', 'country', 'great', 'i', 'love', 'the', 'people', 'of', 'iowa', 'so', 'that']
['NNS120', 'NNT520', 'PRPY000', 'RBS000', 'JJM200', 'INT300', 'NNs', 'RBS000', 'JJN200', 'NNt', 'PRPH000', 'DTa', 'JJG630', 'NNG000', 'PRPH000', 'NNt', 'VBG300', 'DTa', 'NNF600', 'NNP620', 'PRPH000', 'NNt', 'VBG300', '

### For the other text, generate similar corpuses as above to be used as a dictionary when converting LSTM-generated tokens (result) into plaintext

In [29]:
dictionary_content_corpus = []
with open(TRUMP4, encoding="utf8") as f:
    for line in f:
        cleared_line = " ".join(WORD_RE.findall(line))
        if cleared_line:
            dictionary_content_corpus += [w for w in cleared_line.lower().split(" ") if w and len(w) > 1 and not w.isnumeric()]

dictionary_content_corpus = list(set(dictionary_content_corpus))
print(dictionary_content_corpus[:100])
dictionary_content_corpus_soundex = {partOfSpeech_soundex(w): w for w in dictionary_content_corpus if w}
print(list(dictionary_content_corpus_soundex.keys())[:100])
dictionary_content_corpus_syllables = {syllable_soundex(w): w for w in dictionary_content_corpus if w}
print(list(dictionary_content_corpus_syllables.keys())[:100])
dictionary_content_corpus_partOfSpeech_syllables = {partOfSpeech_syllable_soundex(w): w for w in dictionary_content_corpus if w}
print(list(dictionary_content_corpus_partOfSpeech_syllables.keys())[:100])

save(
    dictionary_content_corpus=dictionary_content_corpus, 
    dictionary_content_corpus_soundex=dictionary_content_corpus_soundex,
    dictionary_content_corpus_syllables=dictionary_content_corpus_syllables,
    dictionary_content_corpus_partOfSpeech_syllables=dictionary_content_corpus_partOfSpeech_syllables
)

['confused', 'quickly', 'boats', 'throws', 'reference', 'stephanopoulos', 'fundamental', 'aggression', 'dropped', 'newspaper', 'manager', 'unacceptable', 'collar', 'promise', 'brad', 'breakfast', 'restrict', 'christmas', 'lieutenant', 'whose', 'jokingly', 'saving', 'face', 'against', 'bridges', 'blowup', 'disgusting', 'make', 'calling', 'airlines', 'everything', 'theme', 'facilities', 'contribute', 'prayer', 'noble', 'burning', 'tubes', 'join', 'discussing', 'pander', 'obama', 'devaluations', 'congratulations', 'dwight', 'general', 'pipe', 'situation', 'states', 'negotiating', 'sees', 'mile', 'funded', 'fly', 'longer', 'preparation', 'help', 'fields', 'lens', 'speaker', 'bunch', 'nutshell', 'amendment', 'enrichment', 'shipped', 'silicon', 'refer', 'exorbitant', 'plateau', 'mexican', 'producer', 'afterwards', 'authorizes', 'formed', 'ambulances', 'doctors', 'women', 'stanford', 'blah', 'deadly', 'building', 'loaded', 'dismantle', 'evening', 'keeps', 'publishers', 'towns', 'controlled', 

## Train the model using LSTM with style corpus as input
### Choose between LSTM and Bidirectional LSTM - simply comment out as needed

In [30]:
text = " ".join(style_corpus_partOfSpeech_syllables_list)
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

print('Building model...')
# Initialize Sequential Model
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
# Add the output layer that is a softmax of the number of characters
model.add(Dense(len(chars), activation='softmax')) 
# Optimization through RMSprop
optimizer_new = RMSprop() 
model.compile(loss='categorical_crossentropy', optimizer=optimizer_new) 
    
# print('Build model...')
# model = Sequential()
# model.add(Bidirectional(LSTM(128, input_shape=(maxlen, len(chars)))))
# model.add(Dense(len(chars)))
# model.add(Activation('softmax'))
# optimizer_new = RMSprop() 
# model.compile(loss='categorical_crossentropy', optimizer=optimizer_new)

def sample(predictions, temperature=1.0):
    # helper function to sample an index from a probability array
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions) / temperature
    exp_predictions = np.exp(predictions)
    predictions = exp_predictions / np.sum(exp_predictions)
    probas = np.random.multinomial(1, predictions, 1)
    return np.argmax(probas)

# train the model, output generated text after each iteration
for iteration in range(1, 2):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y,
              batch_size=128,
              epochs=150)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = '' 
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(300):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            predictions = model.predict(x, verbose=0)[0]
            next_index = sample(predictions, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

total chars: 57
nb sequences: 460369
Vectorization...
Building model...
Instructions for updating:
Colocations handled automatically by placer.

--------------------------------------------------
Iteration 1
Instructions for updating:
Use tf.cast instead.
Epoch 1/150
 23424/460369 [>.............................] - ETA: 2:19 - loss: 2.5332

KeyboardInterrupt: 

### Save the model for later use

In [182]:
model.save('TrumpOnlyBiDir-128.h5')

### Load model and see model summary

In [41]:
model = load_model("models\\TrumpOnly-128.h5")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_14 (LSTM)               (None, 128)               95232     
_________________________________________________________________
dense_14 (Dense)             (None, 57)                7353      
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________


## Helper method to randomly generate tokens from model -> allows to enter seed if such is known

In [34]:
model = load_model("TrumpOnly-128.h5")
def generate_rnn(count):
    generated = ""
    start_index = random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index: start_index + maxlen]
    generated += sentence

    for i in range(count):
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.

        predictions = model.predict(x, verbose=0)[0]
#         next_index = sample(predictions, diversity)
        next_index = sample(predictions, 1) #-> comment above, uncomment this if loading existing model
        next_char = indices_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

content = style_corpus_partOfSpeech_syllables
generated = generate_rnn(2000).split(" ")

# seed = "RB2A450 WDT1W200 CD1T600 NN1T430 RB1N000 RB2A400 VBP1A600 VBN1V500 NNS1T000 NN15000 MD1S400 NN1S500 PRP1I300 RB1S000 PRP1S000 NN2F620 CC1A530 NN1M500 PRP$1Y600 NN1F653 NN1S300 NN2D263 IN2W350 PRP$1H200 NNS1G620 IN1W300 NNS1W632 NNS1M500 IN1W300 NNS2F462 VBN1M500 VBN1S500 MD1S400 NN2F224 NN1S360 NN1S450 NN1C400 NN1B630 IN1I500"
# generated = seed.split(" ")

print(generated)



['N1S353', 'RB1T600', 'IN1W300', 'NN3B314', 'NN3E215', 'JJ1L230', 'NN1M530', 'IN1T300', 'NN1s', 'RB1S500', 'NN1i', 'MD1W400', 'RB2E160', 'NN1T400', 'PRP1Y000', 'RB1S000', 'NN1L200', 'NN3F463', 'RB1S000', 'NN1i', 'VBN1C130', '2VSaB6M25', 'NN1C520', 'CC1A530', 'VB2R400', 'NN1T651', 'IN1I100', 'NN1i', 'NN1D500', 'NN1t', 'VB1D000', 'DT1a', 'NN1C630', 'IN1O100', 'PRP1I300', 'NN1i', 'VBD1W200', 'IN1L200', 'DT1T000', 'JJ1F600', 'NN1H300', 'DT1T000', 'NN3E423', 'IN1T300', 'NN1i', 'VB1L300', 'NN1i', 'VBD1H300', 'JJ1G630', 'JJ3N264', 'NN4D526', 'CC1A530', 'NN1i', 'NN1D500', 'NN1t', 'NN1W530', 'TO1T000', 'VB1M200', 'CD2T645', 'PRP1W000', 'VB1H100', 'TO1T000', 'NN1P300', 'NN5O125', 'PRP1W000', 'NN1V000', 'NNS3P535', 'PRP$2O600', 'NNS2P420', 'PRP1T000', 'VB1H100', 'IN1T300', 'NN3C535', 'TO1T000', 'VB1K100', 'PRP1I300', 'IN2B220', 'IN1I100', 'RB2V600', 'RB1S000', 'RB1S000', 'NN1i', 'NN1R200', 'PRP1I300', 'RB1U100', 'PRP1H000', 'NN1s', 'VBN2S123', 'TO1T000', 'VB1B000', 'RB1S000', 'RB3C630', 'VBG2D420

### From dictionary, using Levenshtein Distance find closest token to the token generated by model and convert to text  

In [35]:
for term in generated:
    guessed_words = {}
    for idx, word in content.items():
        lev_dist = levenshtein_distance(idx, term)
            
        if len(word) > 1 and lev_dist <= 1:
            guessed_words[word] = lev_dist

    if not guessed_words:
        for idx, word in style_corpus_soundex.items():
            lev_dist = levenshtein_distance(idx, term)
            if lev_dist <= 1:
                guessed_words[word] = lev_dist
    
    print(sorted(guessed_words, key=guessed_words.get)[0] if guessed_words else "", end=" ")

stand there with beautiful equipment last month that s soon i will ever tell you so look florida so i caused  change and really trump if i don t do a crowd of it i was like the free hate the election that i let i had great natural democracy and i don t want to make trillion we have to put organization we ve continents our police they have that continue to keep it because if very so so i rush it up he s supposed to be so certainly doing that so and so these i don t thank anything like it because we be negotiate for the building i backed foreign people why t we put up it had citizens but woman not it has done unfairly wonderful infrastructure is so nasty they re going to make our country great again i m a pennsylvania these people and they regulation a twitter but not something spoke waldorf failed it go to other places that it s going to make our country so i mean it s going to win they don t bit him way  it because we have a nice accident way our allies the one oil my vets you re not w

## Validate results using Multinomial Naive Bayes
### Import training and testing sets, create a word count using CountVectorizer and Term Frequency Inverse Document Frequency, then train the Naive Bayes Classifier

In [36]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [37]:
import sklearn.datasets as skd
categories = ['trump', 'shakespeare']
data_train = skd.load_files('C:\\Users\\Vas-DELL\\Documents\\NeuroToken\\data\\train',categories=categories,encoding='ISO-8859-1')
data_test = skd.load_files('C:\\Users\\Vas-DELL\\Documents\\NeuroToken\\data\\test',categories=categories,encoding='ISO-8859-1')

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
x_train_tf = count_vect.fit_transform(data_train.data)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_tf)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(x_train_tfidf,data_train.target)

In [38]:
x_test_tf = count_vect.transform(data_test.data)
x_test_tf.shape
x_test_tfidf = tfidf_transformer.transform(x_test_tf)

predicted = clf.predict(x_test_tfidf)

(42, 7975)

### Create Confusion Matrix to demonstrate Results

In [40]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
print("Accuracy : ", accuracy_score(data_test.target,predicted))
print(metrics.classification_report(data_test.target,predicted,target_names=data_test.target_names))

metrics.confusion_matrix(data_test.target,predicted)

Accuracy :  1.0
              precision    recall  f1-score   support

 shakespeare       1.00      1.00      1.00        21
       trump       1.00      1.00      1.00        21

   micro avg       1.00      1.00      1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42



array([[21,  0],
       [ 0, 21]], dtype=int64)