In [1]:
# Mount Google Drive at /content/drive (uses the google.colab helper module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_German

/content/drive/MyDrive/ML_German


In [6]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
import xgboost as xgb
from sklearn import linear_model
import tensorflow as tf
from string import punctuation
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.layers as layers
import keras
from tensorflow.keras.layers import GlobalAveragePooling2D, Reshape, Dense, Permute, multiply
import tensorflow.keras.backend as K

In [7]:
# Characters kept by filter_texts: space, ASCII lowercase letters, and the
# German letters ö, ä, ü, ß. Kept as a list so existing users of the name
# still see the same type.
allowed_charachters = list(' abcdefghijklmnopqrstuvwxyz' + 'öäüß')

def filter_texts(texts):
  """Lower-case each text and drop every character not in `allowed_charachters`.

  Args:
    texts: iterable of strings.

  Returns:
    A list with one filtered string per input text, in the same order.
  """
  # Set lookup is O(1) per character; scanning the list for every character
  # of every text is needlessly slow on a large corpus.
  allowed = set(allowed_charachters)
  return [''.join(ch for ch in text.lower() if ch in allowed) for text in texts]

In [8]:
# Load the training and validation CSV files; the paths are relative to the
# current working directory.
training_data = pd.read_csv('./csv_files/training_no_emoji.csv')
validation_data = pd.read_csv('./csv_files/validation_no_emoji.csv')

In [9]:
def convert2ArrayInto2D (x1_array, x2_array):
  """Pair the two sequences element-wise into a list of `[x, y]` lists.

  Pairing follows `zip`, so it stops at the end of the shorter input.
  """
  return [[first, second] for first, second in zip(x1_array, x2_array)]

def convert2Dinto2Arrays (DArray):
  """Split a sequence of pairs into two parallel lists.

  Returns a tuple `(firsts, seconds)` holding element 0 and element 1 of
  every row; any further elements of a row are ignored.
  """
  rows = list(DArray)
  firsts = [row[0] for row in rows]
  seconds = [row[1] for row in rows]
  return firsts, seconds

In [11]:
# Regression targets: the 'Long' and 'Lat' columns of the training frame, in
# that column order, as a NumPy array.
values = np.array(training_data[['Long','Lat']])

In [12]:
# Hold out 20% of the training file for evaluation. A fixed random_state makes
# the split reproducible; without it the n-gram ids built from data_train
# would change between runs.
data_train, data_test, values_train, values_test = train_test_split(
    training_data['Text'], values, test_size=0.2, random_state=42)

In [13]:
# Normalise both splits with filter_texts. The names are rebound to the
# plain Python lists that filter_texts returns.
data_train = filter_texts(data_train)
data_test = filter_texts(data_test)

In [27]:
def n_gram_dictionary(minGram, maxGram, texts):
  """Build a character n-gram vocabulary from `texts`.

  Every substring whose length is between `minGram` and `maxGram`
  (inclusive) receives an integer id in order of first appearance. Ids start
  at 1, so 0 is never assigned to an n-gram.

  Args:
    minGram: smallest n-gram length.
    maxGram: largest n-gram length (inclusive).
    texts: iterable of strings.

  Returns:
    dict mapping n-gram string -> integer id (1-based).
  """
  ngrams = {}
  for text in texts:
    for size in range(minGram, maxGram + 1):
      for start in range(len(text) - size + 1):
        # Ids are dense and 1-based, so the next free id is always
        # len(ngrams) + 1; setdefault leaves already-known n-grams untouched.
        ngrams.setdefault(text[start : start + size], len(ngrams) + 1)
  return ngrams

def tranfsform_texts(ngram_dict, minGram, maxGram, input_size, texts):
  """Encode each text as a fixed-length list of n-gram ids.

  For every text, the ids of all known n-grams are collected, first for
  length `minGram` from left to right, then for each larger length up to
  `maxGram`. N-grams missing from `ngram_dict` are skipped. The id list is
  then padded with zeros at the end, or cut, to `input_size` entries.

  Returns:
    A list with one id list per input text.
  """
  def encode(text):
    ids = [
        ngram_dict[text[start : start + width]]
        for width in range(minGram, maxGram + 1)
        for start in range(len(text) - width + 1)
        if text[start : start + width] in ngram_dict
    ]
    missing = input_size - len(ids)
    if missing > 0:
      return ids + [0] * missing
    return ids[:input_size]

  return [encode(text) for text in texts]

In [None]:
# label_encoder = LabelEncoder()
# unique_chars = set()
# for text in data_train:
#   for char in text:
#     unique_chars.add(char)
# unique_chars.add('#')
# print(len(unique_chars))
# label_encoder.fit_transform(list(unique_chars))

In [None]:
  # def transformTextToVector(text, label_encoder, input_size):
  #   new_text = text[:min(input_size, len(text))].ljust(input_size, '#')
  #   labels_char = label_encoder.transform([ch for ch in new_text]).tolist()
  #   return labels_char

In [None]:
# nr_of_char_from_text = 3000
# nr_char = len(unique_chars)

In [None]:
# X_train_CNN = [transformTextToVector(text = x, label_encoder = label_encoder, input_size = nr_of_char_from_text) for x in data_train]
# X_test_CNN = [transformTextToVector(text = x, label_encoder = label_encoder, input_size = nr_of_char_from_text) for x in data_test]

In [28]:
# Fixed length of every encoded sequence: tranfsform_texts zero-pads shorter
# id lists and truncates longer ones to this many entries.
input_size = 3000
# Vocabulary of character 3- to 5-grams, built from the training split only.
ngram_dict = n_gram_dictionary(3, 5, data_train)

In [35]:
# Encode both splits with the training vocabulary; n-grams of the held-out
# texts that are not in ngram_dict are dropped by tranfsform_texts.
X_train_CNN = tranfsform_texts(ngram_dict, 3, 5, input_size, data_train)
X_test_CNN = tranfsform_texts(ngram_dict, 3, 5, input_size, data_test)

In [None]:
def squeeze_excite_block(tensor, ratio=16):
    """Re-weight the channels of `tensor` with a squeeze-and-excitation gate.

    The input is globally average-pooled, passed through a two-layer
    bottleneck (`filters // ratio` units with ReLU, then `filters` units with
    sigmoid), and the resulting per-channel gate is multiplied with the input.

    Args:
        tensor: Keras tensor. GlobalAveragePooling2D is applied to it, so a
            4D image-like tensor is expected.
        ratio: divisor applied to the channel count to size the bottleneck.

    Returns:
        The element-wise product of the input and the gate.
    """
    init = tensor
    channel_axis = 1 if K.image_data_format() == "channels_first" else -1
    # tf.keras tensors have no `_keras_shape` attribute (it raised
    # AttributeError here); the static channel count is read from `.shape`.
    filters = int(init.shape[channel_axis])
    se_shape = (1, 1, filters)

    se = GlobalAveragePooling2D()(init)
    se = Reshape(se_shape)(se)
    se = Dense(filters // ratio, activation='relu', kernel_initializer='he_normal', use_bias=False)(se)
    se = Dense(filters, activation='sigmoid', kernel_initializer='he_normal', use_bias=False)(se)

    if K.image_data_format() == 'channels_first':
        # Move the channel dimension to the front: (1, 1, filters) -> (filters, 1, 1).
        se = Permute((3, 1, 2))(se)

    x = multiply([init, se])
    return x

In [32]:
class CustomSaver(keras.callbacks.Callback):
    """Callback that saves the whole model whenever `epoch % 5 == 0`.

    The epoch index received here is zero-based (the logged run saved
    `model_0` during "Epoch 1/60"), so saves happen after the 1st, 6th,
    11th, ... epoch.
    """

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused here; the default is None rather than a mutable
        # dict shared between calls.
        if epoch % 5 == 0:  # or save after some epoch, each k-th epoch etc.
            # NOTE(review): in the logged run this path was written as a
            # directory (".../model_0.hd5/assets"), not a single file —
            # confirm ".hd5" is the intended extension.
            self.model.save("./models/model_{}.hd5".format(epoch))

In [31]:
# Character n-gram CNN regressing two targets per text.
# The shapes in the comments assume input_size = 3000 and exclude the batch axis.
model = tf.keras.models.Sequential([
    # Vocabulary size is len(ngram_dict) + 1 because id 0 is reserved for padding.
    layers.Embedding(len(ngram_dict) + 1, 128, input_length=input_size),  # (3000, 128)
    layers.Conv1D(filters=128, kernel_size=7, activation='relu', strides=1, padding='same'),  # (3000, 128)
    layers.MaxPool1D(pool_size=3, strides=3, padding='valid', data_format='channels_last'),  # (1000, 128)
    layers.Conv1D(filters=128, kernel_size=7, activation='relu', strides=1, padding='same'),  # (1000, 128)
    layers.MaxPool1D(pool_size=3, strides=3, padding='valid', data_format='channels_last'),  # (333, 128)
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', strides=1, padding='same'),  # (333, 128)
    # strides=None makes the stride default to pool_size (3).
    layers.MaxPool1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'),  # (111, 128)
    layers.Flatten(),  # 111 * 128 = 14208
    layers.Dense(units=256, activation='relu'),
    layers.Dropout(rate=0.5),
    layers.Dense(units=128, activation='relu'),
    layers.Dropout(rate=0.5),
    layers.Dense(units=2)  # one linear output per target column
])
# `lr` is a deprecated alias; `learning_rate` is the supported argument name.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='mean_absolute_error',
              metrics=['mean_absolute_error'])

In [33]:
# Callbacks passed to model.fit: periodic model saving plus TensorBoard logs.
# TensorBoard writes into './models', the same directory CustomSaver saves to.
saver = CustomSaver()
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir = './models',  
    write_graph = True, 
    update_freq = 'epoch'
)

In [36]:
# Train on the encoded texts. The inputs are n-gram ids that index the
# Embedding layer, so they are passed as int32: float32 cannot represent
# integers above 2**24 exactly, which would corrupt ids in a large vocabulary.
# The regression targets stay float32.
model.fit(np.array(X_train_CNN).astype('int32'), np.array(values_train).astype('float32'),
          epochs=60, batch_size=150, initial_epoch=0,
          validation_data=(np.array(X_test_CNN).astype('int32'), np.array(values_test).astype('float32')),
          callbacks=[saver, tensorboard_callback])

Epoch 1/60
INFO:tensorflow:Assets written to: ./models/model_0.hd5/assets
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
 14/121 [==>...........................] - ETA: 17:21 - loss: 4.3510 - mean_absolute_error: 4.3510

KeyboardInterrupt: ignored