In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Activation, Dropout
from tensorflow.keras.layers import LSTM, GRU, BatchNormalization, Attention
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LambdaCallback, EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [None]:
# Must be 2.1.0
# Display the installed TensorFlow version so it can be compared with the version noted above.
tf.__version__

In [None]:
import random
import sys
import io
import re

In [None]:
import pandas as pd
import numpy as np
#import seaborn as sns
import json
import matplotlib.pyplot as plt
import emoji
%matplotlib inline

In [None]:
# Display the installed NumPy version.
np.__version__

In [None]:
from tensorflow.python.client import device_lib

In [None]:
# CUDA stopped working for some reason, so training runs on the CPU.
# Cap both TensorFlow thread pools at 2 threads to avoid CPU over-use.
tf.config.threading.set_intra_op_parallelism_threads(2)
tf.config.threading.set_inter_op_parallelism_threads(2)

In [None]:
# Print every device TensorFlow reports for this machine.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Display the physical devices TensorFlow reports for the 'GPU' device type.
tf.config.experimental.list_physical_devices('GPU')

# Preprocessing routine

In [None]:
# Load the Telegram export; point this at your own CSV file.
df = pd.read_csv('telegram_data_20200505.csv')
print(len(df))

# Keep only the rows that carry text (stickers and other non-text messages have none).
df = df[df['text'].notna()]
print(len(df))

In [None]:
# TODO : Check why this conversion doesn't work
# df['text'] = df['text'].str.replace('[0-9]{6}', '######')
# df['text'] = df['text'].str.replace('[0-9]{5}', '#####')
# df['text'] = df['text'].str.replace('[0-9]{4}', '####')
# df['text'] = df['text'].str.replace('[0-9]{3}', '###')

# Encoding end of sequence to stop prediction.
# The marker is only appended where it is missing, so that re-running this cell
# does not produce "... <EOS> <EOS>".
EOS_SUFFIX = " <EOS>"
needs_eos = ~df['text'].str.endswith(EOS_SUFFIX, na=False)
df.loc[needs_eos, 'text'] = df.loc[needs_eos, 'text'] + EOS_SUFFIX

In [None]:
# Strip the dict-like entity markup ({'type': ..., 'text': ...}) found in the
# exported text down to plain text.
# Every pattern is a raw string passed with an explicit regex=True: since
# pandas 2.0 Series.str.replace defaults to literal matching, under which these
# patterns would silently match nothing.

# Formatting hashtags
df['text'] = df['text'].str.replace(r"\{'type': 'hashtag', 'text': '", '', regex=True)
df['text'] = df['text'].str.replace(r"'\}", '', regex=True)
# Formatting urls
# NOTE(review): `reg` is not used by any code visible in this notebook.
reg = re.compile(r"\{'type': 'link', 'text': '.*'\}")
# NOTE(review): the greedy `.*` drops everything after the link up to the end of
# the line, which can include the trailing " <EOS>" marker - confirm this is intended.
df['text'] = df['text'].str.replace(r"\{'type': 'link', 'text': '.*", '', regex=True)
# Formatting mentions
df['text'] = df['text'].str.replace(r"\{'type': 'mention_name', 'text': '", '@', regex=True)
df['text'] = df['text'].str.replace(r", 'user_id': .*\}", '', regex=True)

df['text'] = df['text'].str.replace(r"\{'type': 'mention', 'text': '", '', regex=True)
df['text'] = df['text'].str.replace(r"'\}", '', regex=True)

In [None]:
# Discard every message that still contains a '{' after the replacements above.
df = df.loc[~df['text'].str.contains('{')]
df.shape[0]

In [None]:
# Converting everything that remains stored as list by telegram to plain string.
# The brackets are removed as literal characters (regex=False), so the result
# does not depend on the pandas version's default for `regex`.
df['text'] = df['text'].str.replace('[', '', regex=False)
df['text'] = df['text'].str.replace(']', '', regex=False)

In [None]:
# TODO : rework this to handle punctuation properly
# for punct in ".,!?'":
#     df['text'] = df['text'].str.replace(punct, ' {}'.format(punct))

In [None]:
# Keep only the messages made of more than one whitespace-separated word.
df = df[df['text'].str.split().str.len() > 1]
len(df)

In [None]:
# Characters the tokenizer strips from the text before splitting it into words.
filter_before = (
    '!"$%&()*+,-./:;=?@'   # ASCII punctuation, first half
    '[\\]^_`{|}~'          # ASCII punctuation, second half
    '\t\n'                 # tab and newline
    '«»’'                  # typographic quotes / apostrophe
    "'"                    # plain apostrophe
)
# The filter below does not remove punctuation
# filter_after = '$%&"*+-/=?@[\\]()^_`{|}~\t\n«»’'

# Tokenizer

We fit a tokenizer on the complete, uncleaned vocabulary (because I like a bot that makes typos).
The end of each sequence is encoded as \<EOS\> so that prediction knows where to stop.

In [None]:
# Word-level, lower-casing tokenizer with no vocabulary cap; words it has not
# seen are mapped to the '<UNK>' token.
tokenizer = Tokenizer(
    num_words=None,
    filters=filter_before,
    lower=True,
    split=' ',
    char_level=False,
    oov_token='<UNK>',
    document_count=0,
)

In [None]:
# Build the vocabulary from every message, each coerced to str first.
tokenizer.fit_on_texts([str(message) for message in df['text']])

In [None]:
# Turn every message into its list of word indices and keep it next to the text.
sequences = tokenizer.texts_to_sequences([str(message) for message in df['text']])
df['sequences'] = sequences

In [None]:
# TODO : test removing unique word

# count_thres = 1
# low_count_words = [w for w,c in tokenizer.word_counts.items() if c <= count_thres]
# removed = []
# for w in low_count_words:
#     removed.append(w) 
#     del tokenizer.word_index[w]
#     del tokenizer.word_docs[w]
#     del tokenizer.word_counts[w]
# print("removed : ", len(removed))

Note: Keras does not handle variable-length sequences very well, so we pad the tokenized sequences with 0s to a fixed length.
Don't forget to set the Embedding parameter mask_zero=True so that the network ignores the padded values.

In [None]:
# Pad (or truncate) every sequence to 20 tokens.
padded_sequences = pad_sequences(sequences, maxlen=20)
# Input: each padded row shifted right by one position, with a 0 in front.
X = np.array([np.array([0] + list(row[:-1])) for row in padded_sequences])
# Target: the padded row itself, so each input position is paired with the
# token that follows it.
y = np.array(list(padded_sequences))

In [None]:
# Vocabulary size shared by the Embedding input and the softmax output.
# Keras Tokenizer indices start at 1 (0 is reserved and doubles as the padding
# value here), hence the +1. Defining it in this cell keeps the notebook
# runnable from a fresh kernel.
num_words = len(tokenizer.word_index) + 1

model = Sequential()
# mask_zero=True makes the downstream layers ignore the 0-padded positions.
model.add(Embedding(num_words, 100, input_length=None, mask_zero=True))
# return_sequences=True: one output per time step (a next-word prediction at every position).
model.add(LSTM(30, return_sequences=True))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.1))
# One probability per vocabulary index at every time step.
model.add(Dense(num_words, activation='softmax'))

optimizer = Adam(learning_rate=0.01)

In [None]:
# Targets are integer word indices, hence the sparse variant of cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
)

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
def sample(a, temperature=500):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the resulting distribution.
    Large temperatures (such as the default 500) flatten the distribution
    towards uniform; small temperatures concentrate it on the most likely
    entries.

    Args:
        a: array-like of probabilities.
        temperature: non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: if ``temperature`` is 0.
    """
    if temperature == 0:
        raise ValueError("temperature must be non-zero")
    preds = np.asarray(a).astype('float64')
    # log(0) is -inf, which maps back to a probability of exactly 0 below, so
    # the divide-by-zero warning is silenced on purpose.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so that the largest logit is 0 before exponentiating. Without this,
    # a small temperature makes every exp() underflow to 0 and the
    # normalisation below turns into 0/0 (NaN probabilities).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
# NOTE(review): `A` is not referenced anywhere in the visible notebook.
A = 0

def generate(seed, length, n):
    """Extend `seed` word by word using the module-level `model` and `tokenizer`.

    At each step the model's prediction for the last position is taken and the
    next word is drawn uniformly among the `n` most likely vocabulary entries.
    Generation stops after `length` words or as soon as the end-of-sequence
    token ('<eos>') is drawn.

    Args:
        seed: text the generation starts from.
        length: maximum number of words to append.
        n: number of top candidates to draw from (clamped to the range 1..150).

    Returns:
        The seed followed by the generated words, with the space in front of
        the punctuation marks ``.,!?()"`` removed.
    """
    print("SEED: ",seed)
    # At least one candidate is needed: with n == 0 the slice [-0:] below would
    # select the whole vocabulary instead of nothing.
    n = max(1, min(n, 150))
    seed_tk = tokenizer.texts_to_sequences([seed])
    if not seed_tk[0]:
        # The seed holds no token at all (e.g. empty string): nothing to feed the model.
        return seed
    for _ in range(length):
        y_pred = model.predict(np.array(seed_tk))
        next_words_proba = y_pred[0][-1]
        # Index 0 is the padding value and has no entry in tokenizer.index_word,
        # so it is excluded from the candidates (hence the [1:] / +1 shift).
        best_n_next = next_words_proba[1:].argsort()[-n:] + 1

        #print("BEST: ", tokenizer.index_word[best_n_next[0]])
        #print("BEST n: ", [tokenizer.index_word[best_n_next[i]] for i in range(n)])
        next_word = np.random.choice(best_n_next)
        y_text = tokenizer.index_word[next_word]
        if y_text == '<eos>':
            # Stop generating, but still run the punctuation clean-up below.
            break
        seed = seed + ' ' + y_text
        seed_tk = tokenizer.texts_to_sequences([seed])
    else:
        # Only reached when `length` words were generated without drawing '<eos>'.
        print(seed)
    for punct in '.,!?()"':
        seed = seed.replace(' {}'.format(punct), punct)
    return seed

def on_epoch_end(epoch, _, seed_len=5):
    """Epoch-end hook: print two sample generations from the current model.

    One generation is seeded with the first words of a randomly chosen message
    from `df['text']`, the other with the fixed baseline seed "Je suis".

    The seed is drawn from `df` because the `word_splits` and `maxlen` names
    used previously are not defined anywhere in the visible notebook.

    Args:
        epoch: index of the epoch that just finished.
        _: second positional argument passed by the callback (unused).
        seed_len: maximum number of words taken from the sampled message.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    sentence = "Je suis"  # fallback if no usable message can be drawn
    if len(df):
        message = str(df['text'].iloc[random.randrange(len(df))])
        # Drop the end-of-sequence marker so that the seed does not already contain it.
        seed_words = [word for word in message.split() if word != '<EOS>']
        sentence = ' '.join(seed_words[:seed_len]) or sentence

    print('----- Generating with seed: "' + sentence + '"')

    print('----- Generated: "' + generate(sentence, n=5, length=20) + '"')
    print('----- Generated baseline : "' + generate("Je suis", n=5, length=20) + '"')

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)


In [None]:
# Write the model to 'model.hdf5' whenever the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    'model.hdf5',
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Multiply the learning rate by 0.2 after one epoch without improvement of the
# training loss, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    mode='min',
    factor=0.2,
    patience=1,
    min_lr=0.000001,
    verbose=1,
)
# Stop after two epochs without an improvement of at least 0.001 and restore
# the best weights.
early_stopping = EarlyStopping(
    monitor='loss',
    mode='auto',
    min_delta=0.001,
    patience=2,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [checkpoint, reduce_lr, early_stopping, print_callback]

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Note: the model is exercised after every epoch (see the print callback), so
# TensorFlow's function retracing may emit a huge number of warnings.
# Training takes approximately an hour (on 2 threads).
# The targets get a trailing axis of size 1: (samples, steps) -> (samples, steps, 1).
model.fit(
    X_train,
    np.expand_dims(y_train, axis=-1),
    validation_data=(X_test, np.expand_dims(y_test, axis=-1)),
    callbacks=callbacks,
    epochs=20,
    batch_size=512,
)

In [None]:
# Training vs. validation loss per epoch.
plt.plot(model.history.history['val_loss'], label='val')
plt.plot(model.history.history['loss'], label='train')
plt.xlabel('epoch')
plt.ylabel('loss')
# Without legend() the labels passed to plot() above are never displayed.
plt.legend();

# Read and Write routines

Quick routine to save, and to make sure everything has been saved properly

In [None]:
import pickle

# Persist the fitted tokenizer next to the model checkpoint.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the model from the 'model.hdf5' file to check that it can be read back.
from tensorflow.keras.models import load_model
loaded = load_model("model.hdf5")

In [None]:
# Read the tokenizer back to check that it was saved properly.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)