# Language Identification

Language identification of similar languages using an ensemble of recurrent neural networks. Implementation based on the paper ["LIDE: Language Identification from Text
Documents"](https://arxiv.org/pdf/1701.03682.pdf).

## Imports

In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

## Make workspace

In [None]:
# Make directories if they don't exist
# exist_ok=True keeps the cell safe to re-run without a separate existence check
os.makedirs('datasets/DSLCC-v2.0', exist_ok=True)
os.makedirs('models', exist_ok=True)

## Download dataset

We use the [DSLCC v2.0](https://github.com/alvations/bayesmax/tree/master/bayesmax/data/DSLCC-v2.0) dataset from the [DSL Shared Task 2015](http://ttg.uni-saarland.de/lt4vardial2015/dsl.html)

In [None]:
# DSLCC v2.0
if not os.path.exists('datasets/DSLCC-v2.0/train.txt'):
    !cd datasets/DSLCC-v2.0 && curl -o train.txt https://raw.githubusercontent.com/alvations/bayesmax/master/bayesmax/data/DSLCC-v2.0/train-dev/train.txt
if not os.path.exists('datasets/DSLCC-v2.0/devel.txt'):
    !cd datasets/DSLCC-v2.0 && curl -o devel.txt https://raw.githubusercontent.com/alvations/bayesmax/master/bayesmax/data/DSLCC-v2.0/train-dev/devel.txt
if not os.path.exists('datasets/DSLCC-v2.0/test.txt'):
    !cd datasets/DSLCC-v2.0 && curl -o test.txt https://raw.githubusercontent.com/alvations/bayesmax/master/bayesmax/data/DSLCC-v2.0/test/test.txt

## Data

The corpus contains 20,000 instances per language (18,000 training + 2,000 development). Each instance is an excerpt extracted from journalistic texts containing 20 to 100 tokens and tagged with the country of origin of the text. A list of languages and the corresponding codes is shown in the following table:

<table>
    <tr>
        <th>Group Name</th>
        <th>Language Name</th>
        <th>Language Code</th>
    </tr>
    <tr>
        <td rowspan=2>South Eastern Slavic</td>
        <td>Bulgarian</td>
        <td>bg</td>
    </tr>
    <tr>
        <td>Macedonian</td>
        <td>mk</td>
    </tr>
    <tr>
        <td rowspan=3>South Western Slavic</td>
        <td>Bosnian</td>
        <td>bs</td>
    </tr>
    <tr>
        <td>Croatian</td>
        <td>hr</td>
    </tr>
    <tr>
        <td>Serbian</td>
        <td>sr</td>
    </tr>
    <tr>
        <td rowspan=2>West-Slavic</td>
        <td>Czech</td>
        <td>cz</td>
    </tr>
    <tr>
        <td>Slovak</td>
        <td>sk</td>
    </tr>
    <tr>
        <td rowspan=2>Ibero-Romance (Spanish)</td>
        <td>Peninsular Spanish</td>
        <td>es-ES</td>
    </tr>
    <tr>
        <td>Argentinian Spanish</td>
        <td>es-AR</td>
    </tr>
    <tr>
        <td rowspan=2>Ibero-Romance (Portuguese)</td>
        <td>Brazilian Portuguese</td>
        <td>pt-BR</td>
    </tr>
    <tr>
        <td>European Portuguese</td>
        <td>pt-PT</td>
    </tr>
    <tr>
        <td rowspan=2>Austronesian</td>
        <td>Indonesian</td>
        <td>id</td>
    </tr>
    <tr>
        <td>Malay</td>
        <td>my</td>
    </tr>
    <tr>
        <td>Other</td>
        <td>Various Languages</td>
        <td>xx</td>
    </tr>
</table>

In [None]:
import csv

# The files are plain tab-separated text (sentence, language code) with no
# quoting convention. Quote handling is therefore switched off: with the default
# setting a double quote at the start of a sentence opens a quoted field, and an
# unbalanced one makes the parser swallow the following rows into that field.
_read_kwargs = dict(sep='\t', names=['sentence', 'language'], quoting=csv.QUOTE_NONE)

train = pd.read_csv('datasets/DSLCC-v2.0/train.txt', **_read_kwargs)
validation = pd.read_csv('datasets/DSLCC-v2.0/devel.txt', **_read_kwargs)
test = pd.read_csv('datasets/DSLCC-v2.0/test.txt', **_read_kwargs)

In [None]:
print(f'Training set size:   {len(train)}')
print(f'Validation set size: {len(validation)}')
print(f'Test set size:       {len(test)}')

In [None]:
# Print number of instances per label
print(train['language'].value_counts())

In [None]:
train[train['language'] == 'xx'].head()

In [None]:
print(train.head())

In [None]:
CLASS_UNKNOWN = 'xx'
CLASSES = ['bg', 'mk', 'bs', 'hr', 'sr', 'cz', 'sk', 'es-ES', 'es-AR', 'pt-BR', 'pt-PT', 'id', 'my', CLASS_UNKNOWN]
CLASS_NAMES = [
    'Bulgarian', 'Macedonian', 'Bosnian', 'Croatian', 'Serbian', 'Czech', 'Slovak',
    'Peninsular Spanish', 'Argentinian Spanish', 'Brazilian Portuguese', 'European Portuguese',
    'Indonesian', 'Malay', 'Other'
]
NUM_CLASSES = len(CLASSES)

In [None]:
NUM_CLASSES

In [None]:
# Change all other language codes to xx
def mark_unknown_languages(data, classes=None, unknown=None):
    """Replace, in place, every label outside the known classes with the unknown code.

    Args:
        data: DataFrame with a 'language' column; it is modified in place.
        classes: Collection of accepted language codes. Defaults to CLASSES.
        unknown: Code written over every other label. Defaults to CLASS_UNKNOWN.

    Returns:
        None.
    """
    known = CLASSES if classes is None else classes
    fallback = CLASS_UNKNOWN if unknown is None else unknown
    # Assign through .loc on the frame itself: an inplace method called on the
    # data['language'] selection does not reliably write back to `data`
    # (it is a no-op when pandas copy-on-write is active).
    data.loc[~data['language'].isin(known), 'language'] = fallback
mark_unknown_languages(train)
mark_unknown_languages(validation)
mark_unknown_languages(test)

## Preprocessing

### Common

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.utils import to_categorical

In [None]:
X_train = train['sentence']
y_train = train['language']
X_validation = validation['sentence']
y_validation = validation['language']

In [None]:
print(X_train.head())
print(y_train.head())

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Use OneHotEncoder for target variable
# This is better than get_dummies because here we specify all classes
# so all possible classes will have a column and the order will be specified
# If the language code is unknown, an error is thrown
# NOTE(review): the encoder is fitted on CLASSES without an explicit `categories`
# argument, so the column order is presumably the encoder's own (sorted) order
# rather than the order of CLASSES - confirm before mapping columns back to labels
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` -
# confirm against the installed version before upgrading
target_encoder = OneHotEncoder(sparse=False, dtype=np.int32)
target_encoder.fit(np.array(CLASSES).reshape(-1, 1))

In [None]:
# Build a Keras tokenizer and fit its vocabulary on the given texts
def get_tokenizer(data, num_words=None):
    """Return a Tokenizer fitted on `data`.

    Args:
        data: Texts passed to `fit_on_texts`.
        num_words: Forwarded to the Tokenizer as its `num_words` setting.

    Returns:
        The fitted Tokenizer, configured with the '<OOV>' out-of-vocabulary token.
    """
    # Characters stripped before splitting; includes typographic quotes and the en dash
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“„”–'
    fitted = Tokenizer(num_words=num_words, filters=punctuation, oov_token='<OOV>')
    fitted.fit_on_texts(data)
    return fitted

### Word unigrams

In [None]:
tokenizer_w1 = get_tokenizer(X_train)

In [None]:
# Tokenizer vocabulary size
len(tokenizer_w1.word_index)

In [None]:
# Display number of occurrences of n-th most common word
n = 15_000
sorted(tokenizer_w1.word_counts.items(), key=lambda w: w[1], reverse=True)[n]

In [None]:
# Count words that occur more than n times
n = 50
len([x for x in tokenizer_w1.word_counts.items() if x[1] > n])

In [None]:
NUM_UNIQUE_WORDS = 10_000

In [None]:
tokenizer_w1 = get_tokenizer(X_train, NUM_UNIQUE_WORDS)
X_train = tokenizer_w1.texts_to_sequences(X_train)

In [None]:
# Find max length of train sequences
max([len(x) for x in X_train])

In [None]:
# Find number of instances longer than n tokens
n = 50
len([x for x in X_train if len(x) > n])

In [None]:
MAX_WORD_TOKENS = 50

In [None]:
X_train = pad_sequences(X_train, padding='post', maxlen=MAX_WORD_TOKENS)

### Character n-grams

In [None]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from nltk.util import ngrams
from itertools import chain
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Vocabulary cap per character n-gram size: the value for n is passed as
# `num_words` to the tokenizer built by get_char_ngram_tokenizer.
# None means no cap is passed for that n.
NGRAMS_MAX_WORDS = {
    2: None,
    3: 20000,
    4: None,
    5: None
}

def sentence_to_char_ngram(sentence, n):
    """Turn a sentence into a space-separated string of character n-grams.

    Punctuation is replaced by spaces, the text is split into words with
    `text_to_word_sequence` (called with its defaults), and every word of at
    least `n` characters contributes all of its consecutive n-character slices.
    Shorter words are dropped.

    Args:
        sentence: Input text.
        n: Size of the character n-grams.

    Returns:
        The n-grams joined by single spaces, in order of appearance.
    """
    separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“„”–'
    cleaned = ''.join(' ' if ch in separators else ch for ch in sentence)

    pieces = []
    for word in text_to_word_sequence(cleaned):
        if len(word) < n:
            continue
        pieces.extend(''.join(gram) for gram in ngrams(word, n))
    return ' '.join(pieces)

def transform_to_char_ngrams(X, n):
    """Apply `sentence_to_char_ngram` to every sentence in `X`, with a progress bar.

    Args:
        X: Object providing `copy()` and `progress_apply()`; the original is not modified.
        n: Size of the character n-grams.

    Returns:
        The result of `progress_apply`, one n-gram string per sentence.
    """
    working_copy = X.copy()
    print(f'{n} - gramming')

    def to_ngrams(sentence):
        return sentence_to_char_ngram(sentence, n)

    return working_copy.progress_apply(to_ngrams)

def get_char_ngram_tokenizer(X, n):
    """Fit a tokenizer on n-gram texts, using the vocabulary cap configured for `n`.

    Args:
        X: Texts of space-separated character n-grams.
        n: N-gram size; must be a key of NGRAMS_MAX_WORDS.

    Returns:
        The tokenizer returned by `get_tokenizer`.
    """
    vocabulary_cap = NGRAMS_MAX_WORDS[n]
    return get_tokenizer(X, num_words=vocabulary_cap)

## Model

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import InputLayer, LSTM, GRU, Dropout, Dense
from tensorflow.keras.utils import Sequence

In [None]:
EPOCHS = 10
BATCH_SIZE = 32

In [None]:
def get_model(input_shape, recurrent_layer_size, recurrent_dropout_rate=0.0, dropout_rate=0.0, use_lstm=False):
    """Build and compile a single-recurrent-layer classifier.

    The network is: input -> GRU or LSTM -> dropout -> softmax over NUM_CLASSES.

    Args:
        input_shape: Shape of one input sample, passed to the InputLayer.
        recurrent_layer_size: Number of units in the recurrent layer.
        recurrent_dropout_rate: `recurrent_dropout` of the recurrent layer.
        dropout_rate: Rate of the dropout layer applied to the recurrent output.
        use_lstm: Use an LSTM layer when True, otherwise a GRU layer.

    Returns:
        The model compiled with the Adam optimizer, categorical cross-entropy
        loss and the accuracy metric.
    """
    # The unit count must come from the `recurrent_layer_size` parameter; the
    # previous body referenced an undefined name here.
    if use_lstm:
        recurrent = LSTM(recurrent_layer_size, recurrent_dropout=recurrent_dropout_rate, name='lstm')
    else:
        recurrent = GRU(recurrent_layer_size, recurrent_dropout=recurrent_dropout_rate, name='gru')

    model = Sequential()
    model.add(InputLayer(input_shape=input_shape))
    model.add(recurrent)
    model.add(Dropout(rate=dropout_rate, name='dropout'))
    model.add(Dense(NUM_CLASSES, activation='softmax', name='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Draw training and validation accuracy curves for a training run
def plot_history(history):
    """Plot accuracy per epoch on the current matplotlib figure.

    Args:
        history: Object with an `epoch` list and a `history` dict holding the
            'accuracy' and 'val_accuracy' lists.
    """
    # A synthetic point (x = -1, accuracy 0.0) is prepended so both curves start at zero
    x_values = [-1] + history.epoch
    for metric in ('accuracy', 'val_accuracy'):
        plt.plot(x_values, [0.0] + history.history[metric])

    plt.legend(['Training accuracy', 'Validation accuracy'])
    plt.xlabel('epochs')
    plt.ylabel('accuracy')

    # Tick positions start at -1 but are labelled from 0
    epoch_count = len(history.epoch)
    plt.xticks(np.arange(-1, epoch_count), np.arange(epoch_count + 1))

In [None]:
# Save trained model, compilation and training data, history plot and tokenizer
def save_rnn(model, batch_size, history, tokenizer, model_name=None):
    """Persist a trained model with its training summary, history plot and tokenizer.

    Everything is written under `models/<model_name>/`: the saved model itself,
    `training.txt`, `history.png` and `tokenizer.json`. The extra files are
    written into the path given to `model.save`, so that call is assumed to
    create a directory.

    Args:
        model: Trained model whose layer 0 is the recurrent layer and layer 1
            the dropout layer.
        batch_size: Batch size used for training (recorded only).
        history: Training history with `epoch` and `history` attributes.
        tokenizer: Tokenizer providing `to_json()`.
        model_name: Directory name under `models/`. When omitted, a name is
            generated from the hyperparameters and the current time.

    Returns:
        The model name that was used.
    """
    recurrent_layer = model.get_layer(index=0)
    recurrent_type = recurrent_layer.name
    recurrent_units = recurrent_layer.units
    recurrent_dropout_rate = recurrent_layer.recurrent_dropout

    dropout_layer = model.get_layer(index=1)
    dropout_rate = dropout_layer.rate

    epochs = len(history.epoch)

    if not model_name:
        model_name = f'model_{epochs}_{batch_size}_{recurrent_type}_{recurrent_units}_{int(100*recurrent_dropout_rate)}_{int(100*dropout_rate)}_{time.strftime("%Y%m%d_%H%M%S")}'
    model_path = f'models/{model_name}'

    model.save(model_path)
    # Explicit encoding: the model summary may contain non-ASCII characters
    # depending on the Keras version, which a platform default codec can reject.
    with open(f'{model_path}/training.txt', 'w', encoding='utf-8') as f:
        f.write(f'EPOCHS:            \t {epochs}\n')
        f.write(f'BATCH SIZE:        \t {batch_size}\n')
        f.write(f'RECURRENT LAYER:   \t {recurrent_type}\n')
        f.write(f'RECURRENT UNITS:   \t {recurrent_units}\n')
        f.write(f'RECURRENT DROPOUT: \t {recurrent_dropout_rate}\n')
        f.write(f'OUTPUT DROPOUT:    \t {dropout_rate}\n')
        model.summary(print_fn=lambda x: f.write(x + '\n'))
        f.write(f'ACCURACY:     \t {history.history["accuracy"]}\n')
        f.write(f'VAL ACCURACY: \t {history.history["val_accuracy"]}\n')

    # Start a fresh figure: plot_history draws on the current figure, so without
    # this, repeated calls within one cell would stack the curves of earlier
    # models into every later history.png.
    plt.figure()
    plot_history(history)
    plt.title(model_name)
    plt.savefig(f'{model_path}/history.png')

    with open(f'{model_path}/tokenizer.json', 'w') as f:
        f.write(tokenizer.to_json())

    return model_name

In [None]:
def load_rnn(model_name):
    """Load a saved model and its tokenizer from `models/<model_name>`.

    Args:
        model_name: Directory name under `models/`.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    model_dir = f'models/{model_name}'
    # The tokenizer file is read first, so a missing file fails before the model is loaded
    with open(f'{model_dir}/tokenizer.json', 'r') as handle:
        tokenizer_json = handle.read()
    tokenizer = tokenizer_from_json(tokenizer_json)
    return load_model(model_dir), tokenizer

In [None]:
class DataGenerator(Sequence):
    """Keras Sequence yielding one-hot encoded (inputs, labels) batches.

    Inputs are expanded to one-hot vectors per batch with `to_categorical`,
    so the full one-hot tensor is never held at once.

    Args:
        input_sequences: Indexable collection of integer token-id sequences.
        vocabulary_size: Width of the one-hot encoding of each token id.
        labels: Language codes, one per input sequence.
        batch_size: Number of samples per batch.
        shuffle: When True, the sample order is reshuffled at construction and
            after every epoch.
    """

    def __init__(self, input_sequences, vocabulary_size, labels, batch_size=32, shuffle=True):
        super().__init__()
        self.input_sequences = input_sequences
        self.vocabulary_size = vocabulary_size
        # Labels are one-hot encoded once, up front, with the module-level target_encoder
        self.labels = target_encoder.transform(np.asarray(labels).reshape(-1, 1))
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Build the initial sample order so __getitem__ works before the first epoch ends
        self.on_epoch_end()

    # Number of batches per epoch
    def __len__(self):
        return int(np.ceil(len(self.input_sequences) / self.batch_size))

    # Generate one batch
    def __getitem__(self, index):
        start = index * self.batch_size
        # Read positions through self.indexes so the shuffled order is honoured;
        # previously contiguous positions were used and `shuffle` had no effect.
        # Slicing clamps at the end, so the last batch may be shorter.
        batch_indexes = self.indexes[start:start + self.batch_size]
        X = to_categorical(
            [self.input_sequences[i] for i in batch_indexes],
            num_classes=self.vocabulary_size
        )
        y = np.asarray([self.labels[i] for i in batch_indexes])
        return X, y

    # Update indexes for next epoch
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.input_sequences))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [None]:
class PredictGenerator(Sequence):
    """Keras Sequence yielding one-hot encoded input batches, in order, without labels.

    Args:
        input_sequences: Indexable collection of integer token-id sequences.
        vocabulary_size: Width of the one-hot encoding of each token id.
        batch_size: Number of samples per batch.
    """

    def __init__(self, input_sequences, vocabulary_size, batch_size=32):
        self.batch_size = batch_size
        self.vocabulary_size = vocabulary_size
        self.input_sequences = input_sequences

    # Number of batches needed to cover every sequence
    def __len__(self):
        total = len(self.input_sequences)
        return int(np.ceil(total / self.batch_size))

    # Build the batch at position `index`
    def __getitem__(self, index):
        first = index * self.batch_size
        # The final batch stops at the end of the data and may be shorter
        last = min(first + self.batch_size, len(self.input_sequences))
        rows = [self.input_sequences[position] for position in range(first, last)]
        return to_categorical(rows, num_classes=self.vocabulary_size)

In [None]:
# Padded sequence length per n-gram size
max_lengths = {2: 150, 3: 150}
def prepare_n_gram_model(n, Xt, Xv, yt, yv):
    """Prepare data, tokenizer, generators and an untrained model for character n-grams.

    Both sets are converted to n-gram text, tokenized with a tokenizer fitted
    on the training set only, and padded to `max_lengths[n]`.

    Args:
        n: N-gram size; must be a key of `max_lengths` and NGRAMS_MAX_WORDS.
        Xt: Training sentences.
        Xv: Validation sentences.
        yt: Training labels.
        yv: Validation labels.

    Returns:
        Dict with keys "model", "tokenizer", "X_train", "X_val", "y_train",
        "y_val", "train_gen" and "val_gen".
    """
    print(f'{n}-gramming train set...')
    train_text = transform_to_char_ngrams(Xt, n)
    print(f'{n}-gramming validation set...')
    val_text = transform_to_char_ngrams(Xv, n)

    print('Tokenizing train set...')
    n_tokenizer = get_char_ngram_tokenizer(train_text, n)

    print('Applying tokenizer to the train set...')
    train_ids = n_tokenizer.texts_to_sequences(train_text)
    print('Applying tokenizer to the validation set')
    val_ids = n_tokenizer.texts_to_sequences(val_text)

    print('Preprocessing train set...')
    X_ngram_train = pad_sequences(train_ids, maxlen=max_lengths[n], padding='post')
    print('Preprocessing validation set...')
    X_ngram_val = pad_sequences(val_ids, maxlen=max_lengths[n], padding='post')

    # One-hot width: the full vocabulary (+1) when uncapped, otherwise the cap itself
    if NGRAMS_MAX_WORDS[n] is None:
        n_classes = len(n_tokenizer.word_index) + 1
    else:
        n_classes = n_tokenizer.num_words

    n_model = get_model((X_ngram_train.shape[1], n_classes), recurrent_layer_size=768, dropout_rate=0.35)

    return {
        "model": n_model,
        "tokenizer": n_tokenizer,
        "X_train": X_ngram_train,
        "X_val": X_ngram_val,
        "y_train": yt,
        "y_val": yv,
        "train_gen": DataGenerator(X_ngram_train, n_classes, yt, BATCH_SIZE),
        "val_gen": DataGenerator(X_ngram_val, n_classes, yv, BATCH_SIZE),
    }

## Hyperparameter search

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def grid_search_model(devel_X_processor=lambda x: x, random_state=None):
    """Grid-search recurrent layer size and dropout rate on the development set.

    The development sentences are split 75/25 (stratified by language). A
    tokenizer is fitted on the larger part, and one model is trained and saved
    per (recurrent_layer_size, dropout_rate) pair. The final-epoch training and
    validation accuracies are written to `grid_search_acc.csv` and
    `grid_search_val_acc.csv`.

    Args:
        devel_X_processor: Callable applied to a copy of X_validation before
            the split; the default leaves the sentences unchanged.
        random_state: Seed forwarded to `train_test_split`. The default (None)
            keeps the split unseeded; pass an int for a reproducible split.
    """
    X = devel_X_processor(X_validation.copy())

    devel_train_X, devel_test_X, devel_train_Y, devel_test_Y = train_test_split(
        X, y_validation, train_size=0.75, stratify=y_validation, random_state=random_state
    )

    # The vocabulary is fitted on the training part only
    tokenizer = get_tokenizer(devel_train_X, 10_000)

    devel_train_X = tokenizer.texts_to_sequences(devel_train_X)
    devel_test_X = tokenizer.texts_to_sequences(devel_test_X)

    devel_train_X = pad_sequences(devel_train_X, padding='post', maxlen=50)
    devel_test_X = pad_sequences(devel_test_X, padding='post', maxlen=50)

    recurrent_layer_sizes = [768, 1024, 1280]
    dropout_rates = [0.2, 0.25, 0.35, 0.4, 0.45]

    # Rows follow recurrent_layer_sizes, columns follow dropout_rates
    results_acc = np.zeros((len(recurrent_layer_sizes), len(dropout_rates)))
    results_val_acc = np.zeros((len(recurrent_layer_sizes), len(dropout_rates)))

    for i, recurrent_layer_size in enumerate(recurrent_layer_sizes):
        for j, dropout_rate in enumerate(dropout_rates):
            print('Training network with params:')
            print(f' - recurrent_layer_size = {recurrent_layer_size}')
            print(f' - dropout_rate      = {dropout_rate}')

            devel_train_generator = DataGenerator(devel_train_X, tokenizer.num_words, devel_train_Y, batch_size=BATCH_SIZE)
            devel_test_generator = DataGenerator(devel_test_X, tokenizer.num_words, devel_test_Y, batch_size=BATCH_SIZE)

            model = get_model((devel_train_X.shape[1], tokenizer.num_words), recurrent_layer_size, 0.0, dropout_rate)
            history = model.fit(
                devel_train_generator,
                validation_data=devel_test_generator,
                epochs=EPOCHS
            )
            # No model_name is given, so save_rnn generates one; its return value is not needed here
            save_rnn(model, BATCH_SIZE, history, tokenizer)

            # Only the final epoch is recorded in the result grids
            results_acc[i][j] = history.history["accuracy"][-1]
            results_val_acc[i][j] = history.history["val_accuracy"][-1]
            print(f'Results for {recurrent_layer_size}, {dropout_rate} ({i}, {j}):')
            print(f'accuracy:     {history.history["accuracy"]}')
            print(f'val_accuracy: {history.history["val_accuracy"]}')

    grid_search_acc = pd.DataFrame(results_acc, index=recurrent_layer_sizes, columns=dropout_rates)
    grid_search_val_acc = pd.DataFrame(results_val_acc, index=recurrent_layer_sizes, columns=dropout_rates)
    grid_search_acc.to_csv('grid_search_acc.csv')
    grid_search_val_acc.to_csv('grid_search_val_acc.csv')

In [None]:
grid_search_model()

In [None]:
recurrent_layer_sizes = [768, 1024, 1280]
dropout_rates = [0.2, 0.25, 0.35, 0.4, 0.45]

In [None]:
grid_search_val_acc = np.asarray([
    [0.8206967115402222, 0.8084016442298889, 0.8452953100204468, 0.8305288553237915, 0.8359806537628174],
    [0.7321428656578064, 0.8373969793319702, 0.8104395866394043, 0.7388392686843872, 0.838083803653717 ],
    [0.8328210115432739, 0.8314549326896667, 0.8317964673042297, 0.7439903616905212, 0.8360655903816223]
])

In [None]:
plt.imshow(grid_search_val_acc, cmap='viridis_r')
plt.colorbar()
plt.yticks(np.arange(len(recurrent_layer_sizes)), recurrent_layer_sizes)
plt.xticks(np.arange(len(dropout_rates)), dropout_rates)
plt.show()

In [None]:
# Pick the grid cell with the highest validation accuracy.
# argmax returns a flat index; rows are layer sizes and columns are dropout rates.
best_index = np.argmax(grid_search_val_acc)
_row, _col = divmod(best_index, len(dropout_rates))
best_recurrent_layer_size = recurrent_layer_sizes[_row]
best_dropout = dropout_rates[_col]
print(best_index, best_recurrent_layer_size, best_dropout, sep='\n')

## Ensemble

In [None]:
import pickle
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import load_model

In [None]:
# Split the raw sentences rather than X_train: X_train was overwritten in the
# preprocessing section with padded token-id sequences, while the tokenizers and
# n-gram transforms used below expect text.
X_train_model, X_val_model, y_train_model, y_val_model = train_test_split(
    train['sentence'], train['language'], train_size=0.9, stratify=train['language'], random_state=7
)

### Word unigram

In [None]:
MODEL_W1_NAME = 'model_w1_final'

In [None]:
tokenizer_w1 = get_tokenizer(X_train_model, 10_000)

X_train_w1 = tokenizer_w1.texts_to_sequences(X_train_model)
X_val_w1 = tokenizer_w1.texts_to_sequences(X_val_model)

X_train_w1 = pad_sequences(X_train_w1, padding='post', maxlen=50)
X_val_w1 = pad_sequences(X_val_w1, padding='post', maxlen=50)

In [None]:
# train the model
model_w1 = get_model((X_train_w1.shape[1], tokenizer_w1.num_words), best_recurrent_layer_size, 0.0, best_dropout)
history_w1 = model_w1.fit(
    DataGenerator(X_train_w1, tokenizer_w1.num_words, y_train_model, batch_size=BATCH_SIZE),
    validation_data=DataGenerator(X_val_w1, tokenizer_w1.num_words, y_val_model, batch_size=BATCH_SIZE),
    epochs=2
)

In [None]:
save_rnn(model_w1, BATCH_SIZE, history_w1, tokenizer_w1, model_name=MODEL_W1_NAME)

In [None]:
# load model_w1
# model_w1, tokenizer_w1 = load_rnn(MODEL_W1_NAME)
# model_w1.summary()

In [None]:
output_w1 = model_w1.predict(PredictGenerator(X_val_w1, tokenizer_w1.num_words, batch_size=BATCH_SIZE))

### Character 2-grams

In [None]:
MODEL_C2_NAME = 'c2_model'

In [None]:
c2_model_bundle = prepare_n_gram_model(2, X_train_model, X_val_model, y_train_model, y_val_model)

In [None]:
c2_model = c2_model_bundle["model"]
c2_train_gen = c2_model_bundle["train_gen"]
c2_val_gen = c2_model_bundle["val_gen"]
c2_tokenizer = c2_model_bundle["tokenizer"]

In [None]:
c2_history = c2_model.fit(c2_train_gen, validation_data=c2_val_gen, epochs=3)

In [None]:
save_rnn(c2_model, BATCH_SIZE, c2_history, c2_tokenizer, model_name=MODEL_C2_NAME)

In [None]:
c2_tokenizer = c2_model_bundle['tokenizer']
X_val_c2 = c2_model_bundle['X_val']
# The one-hot width must equal the width the model was built with. The 2-gram
# tokenizer has no vocabulary cap, so num_words is None and the width has to be
# derived from the vocabulary, exactly as prepare_n_gram_model does.
c2_vocabulary_size = (
    len(c2_tokenizer.word_index) + 1 if c2_tokenizer.num_words is None else c2_tokenizer.num_words
)
output_c2 = c2_model.predict(PredictGenerator(X_val_c2, c2_vocabulary_size, batch_size=BATCH_SIZE))

### Character 3-grams

In [None]:
MODEL_C3_NAME = 'c3_model'

In [None]:
c3_model_bundle = prepare_n_gram_model(3, X_train_model, X_val_model, y_train_model, y_val_model)

In [None]:
c3_model = c3_model_bundle["model"]
c3_train_gen = c3_model_bundle["train_gen"]
c3_val_gen = c3_model_bundle["val_gen"]
c3_tokenizer = c3_model_bundle["tokenizer"]

In [None]:
c3_history = c3_model.fit(c3_train_gen, validation_data=c3_val_gen, epochs=3)

In [None]:
save_rnn(c3_model, BATCH_SIZE, c3_history, c3_tokenizer, model_name=MODEL_C3_NAME)

In [None]:
c3_tokenizer = c3_model_bundle['tokenizer']
X_val_c3 = c3_model_bundle['X_val']
# Derive the one-hot width the same way prepare_n_gram_model does, so prediction
# keeps matching the model input even if the 3-gram vocabulary cap is set to None.
c3_vocabulary_size = (
    len(c3_tokenizer.word_index) + 1 if c3_tokenizer.num_words is None else c3_tokenizer.num_words
)
output_c3 = c3_model.predict(PredictGenerator(X_val_c3, c3_vocabulary_size, batch_size=BATCH_SIZE))

### Logistic regression

In [None]:
ENSEMBLE_NAME = 'ensemble_final'

In [None]:
# combine outputs
# TODO add other models
# X_ensemble = pd.DataFrame(np.hstack((output_w1, output_c2, output_c3)))
X_ensemble = pd.DataFrame(output_w1)
y_ensemble = pd.Series(y_val_model)

In [None]:
# print(len(X_ensemble))
# print(len(y_ensemble))

In [None]:
# example_index = 3
# print(X_ensemble.iloc[example_index])
# print(y_ensemble.iloc[example_index])

In [None]:
# print(y_ensemble.value_counts())

In [None]:
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS)

ensemble = LogisticRegression()
scores = cross_val_score(ensemble, X_ensemble, y_ensemble, scoring='accuracy', cv=kf)
print(f'All scores: {scores}')
print(f'Average score: {np.mean(scores)}')

In [None]:
ensemble = LogisticRegression()
ensemble.fit(X_ensemble, y_ensemble)

In [None]:
# Save ensemble model
# makedirs with exist_ok=True tolerates re-runs and also creates `models/` if it is missing
os.makedirs(f'models/{ENSEMBLE_NAME}', exist_ok=True)
with open(f'models/{ENSEMBLE_NAME}/model.pkl', 'wb') as f:
    pickle.dump(ensemble, f)
# Human-readable copy of the fitted coefficients, for inspection only
with open(f'models/{ENSEMBLE_NAME}/weights.txt', 'w') as f:
    f.write(f'coef:      {ensemble.coef_}\n')
    f.write(f'intercept: {ensemble.intercept_}')

In [None]:
# Load ensemble model
# with open('models/ensemble_final/model.pkl', 'rb') as f:
#     ensemble = pickle.load(f)

In [None]:
y_pred = ensemble.predict(X_ensemble)

In [None]:
# Plot confusion matrix
cm = confusion_matrix(y_ensemble, y_pred, labels=CLASSES)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=CLASSES)
fig, ax = plt.subplots(figsize=(14,10))
disp.plot(ax=ax, cmap=plt.cm.Blues)

In [None]:
# Plot normalized confusion matrix
cm = confusion_matrix(y_ensemble, y_pred, labels=CLASSES)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm = pd.DataFrame(cm, index=CLASSES, columns=CLASSES)
plt.figure(figsize=(14,10))
sns.heatmap(cm, annot=True)
plt.show()

## Testing