In [1]:
from google.colab import drive
drive.mount("/content/drive")

ModuleNotFoundError: No module named 'google.colab'

In [2]:
import os
os.chdir('drive/My Drive/lab_1')

In [3]:
import tensorflow as tf

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from utils import train_dev_test, plot_train_acc, plot_train_loss, classifier_out

In [6]:
train = pd.read_csv('data/train_non_lemmatized.csv')
test = pd.read_csv('data/test_non_lemmatized.csv')
train_features = pd.read_csv('preproc_files/train_features_16.csv')
test_features = pd.read_csv('preproc_files/test_features_16.csv')

In [7]:
y = train['target']
X_f = normalize(train_features)
X_test_f = normalize(test_features)

In [8]:
# Сколько максимум слов из словаря нам юзать
dict_size = 20000
# Ограничимся 300 словами
max_words = 300
# Пусть размерность эмбеддинга будет 200
embed_dim = 200

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
tokenizer = Tokenizer(num_words=dict_size, lower=True)

In [11]:
tokenizer.fit_on_texts(train['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 352514 unique tokens.


In [12]:
X = tokenizer.texts_to_sequences(train['text'].values)
X = pad_sequences(X, maxlen=max_words)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (48000, 300)


In [13]:
X_train_e, X_val_e, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state=42)

In [14]:
X_train_f, X_val_f, _, _ = train_test_split(X_f, y, test_size = 0.1, random_state=42)

#### Pretrained block

In [15]:
EMBEDDING_FILE =  'embeddings/glove.twitter.27B.200d.txt'

In [16]:
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    if file == 'embeddings/wiki-news-300d-1M.vec':
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index

In [17]:
embeddings_index = load_embed(EMBEDDING_FILE)

In [18]:
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(dict_size, len(word_index))+1
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= dict_size: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.



#### Model block

In [19]:
from tensorflow.keras import Model, regularizers, optimizers
from tensorflow.keras.layers import Input, Embedding, Bidirectional, SpatialDropout1D, LSTM, Dense, BatchNormalization, Dropout, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [20]:
def multimodal_lstm(max_words=max_words, misc_input_length=16,
                    dict_size=nb_words, embed_dim=200, num_classes=3):
    #Define inputs
    emb_input = Input(shape=(max_words,), name='post_body_input')
    misc_input = Input(shape=(misc_input_length), name='misc_features_input')

    # Embedding branch
    x_e = Embedding(input_dim=dict_size,
                    output_dim=embed_dim,
                    weights=[embedding_matrix],
                    input_length=max_words,  # X.shape[1]
                    trainable=False,
                    name='post_body_embedding')(emb_input)
    x_e = SpatialDropout1D(0.5)(x_e)
    x_e = Bidirectional(LSTM(units=64,
                             return_sequences=True))(x_e)
    x_e = Bidirectional(LSTM(units=64))(x_e)
    x_e = Model(inputs=emb_input, outputs=x_e)

    combined = Concatenate()([x_e.output, misc_input])

    z = Dense(num_classes, activation='softmax')(combined)

    model = Model(inputs=[x_e.input, misc_input], outputs=z)

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [21]:
earlystop = EarlyStopping(monitor='val_loss', patience=20, min_delta=0.0001)
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience = 2, verbose=1,factor=0.4, min_lr=0.0000001)

### K-fold

In [22]:
import gc
import pickle
from tensorflow.keras import backend as K
from sklearn.model_selection import KFold

In [23]:
X_test = tokenizer.texts_to_sequences(test['text'].values)
X_test = pad_sequences(X_test, maxlen=max_words)

In [24]:
batch_size = 256

epochs = 100
gc.collect()
K.clear_session()


num_folds = 10 #number of folds

predict = np.zeros((X_test.shape[0],3))

Здесь гугл колаб меня выкинул на девятом шаге, так что я пытался завершить локально. Если убрать следующую ячейку и if-else в цикле, то всё получается нормально. 

In [26]:
with open('results/predict_7_69.pickle', 'rb') as p:
    predict = pickle.load(p)
p.close()

In [27]:
kf = KFold(n_splits=num_folds, shuffle=True, random_state=69)  # nice!

In [28]:
for num, (train_index, test_index) in enumerate(kf.split(X)):
    if num < 8:
        pass
    
    else:
            
        print(f' Fold {num} '.center(20, '-'))

        kfold_y_train, kfold_y_valid = y[train_index], y[test_index]
        kfold_X_train = X[train_index]
        kfold_X_features = X_f[train_index]
        kfold_X_valid = X[test_index]
        kfold_X_valid_features = X_f[test_index] 
        
        gc.collect()
        K.clear_session()
        
        model = multimodal_lstm()

        mc = ModelCheckpoint(f"checkpoints/kfold_{num}_lstm.h5", monitor='val_loss', mode='auto', save_best_only=True)

        model.fit([kfold_X_train, kfold_X_features], kfold_y_train,
                batch_size=batch_size,
                validation_data=([kfold_X_valid, kfold_X_valid_features], kfold_y_valid),
                epochs=epochs,
                callbacks=[mc, earlystop, lr_reduction])
        
        gc.collect()

        model.load_weights(f"checkpoints/kfold_{num}_lstm.h5")

        predict += model.predict([X_test, X_test_f], batch_size=batch_size, verbose=1) / num_folds

        with open(f'results/predict_{num}.pickle', 'wb') as p:
            pickle.dump(predict, p)
        p.close()

print('Done!')


------ Fold 8 ------
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 43200 samples, validate on 4800 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
 2048/43200 [>.............................] - ETA: 4:49 - loss: 1.0865 - acc: 0.3931

KeyboardInterrupt: 

In [None]:
predictions = [np.argmax(p) for p in predictions]

In [None]:
from utils import classifier_out

In [None]:
classifier_out(predictions, '18.combined_lstm_small_glove_noval')

Лучшие резы были получены при train-test-split на 420 random state. Я пробовал посылать 69, 420 и среднее между 69 и 420, и лучшие резы были у 420.