In [20]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.models import Model, load_model
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import os

In [21]:
# Paths (relative to the notebook) of the CSV files used as train and test data.
# The "test" data used throughout this notebook is the dev-split file.
train_data_source = './iben/iben/trac2_iben_train_transliterated.csv'
test_data_source = './iben/iben/trac2_iben_dev_transliterated.csv'

# Read the training split and preview its first rows.
train_df = pd.read_csv(train_data_source)
train_df.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B,transliterated
0,C45.688,Dada taratari,NAG,NGEN,Dada taratari
1,C45.635.5,Tumi korbe Amar sathe,NAG,NGEN,Tumi korbe Amar sathe
2,C47.103,Ar ta chara a sob bessha peter cheleder okhan ...,OAG,GEN,Ar ta chara a sob bessha peter cheleder okhan ...
3,C68.147,কাকের শরীরে ময়ুরের পাখা লাগিয়েছে,CAG,NGEN,kākera śarīre maয়urera pākhā lāgiয়eche
4,C45.615,পতিতাদের চরিত্র রাজনৈতিক নেতাদের থেকে হাজার গু...,NAG,NGEN,patitādera caritra rājanaitika netādera theke ...


In [22]:
# Read the evaluation split from test_data_source and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer=test_data_source)
test_df.head(n=5)

Unnamed: 0,ID,Text,Sub-task A,Sub-task B,transliterated
0,C59.2078,Ek dom sothik kotha bolecho jhekane theka uthe...,NAG,NGEN,Ek dom sothik kotha bolecho jhekane theka uthe...
1,C68.537,ফালতু মেয়ে,CAG,NGEN,phālatu meya়e
2,C59.1344,DARUN AKDOM THIK,NAG,NGEN,DARUN AKDOM THIK
3,C59.706,Sala ranu magi,OAG,GEN,Sala ranu magi
4,C68.663,থামবেল ঠিককোরে বানা,OAG,NGEN,thāmavela ṭhikakore vānā


In [23]:
# Lower-case the text of both splits; each result is a plain Python list of strings.
# NOTE(review): this reads the raw 'Text' column rather than 'transliterated' —
# confirm that is the intended input.
train_texts = list(map(str.lower, train_df['Text'].values))
test_texts = list(map(str.lower, test_df['Text'].values))

In [24]:
# Store the character count of every training text in a new 'length' column,
# then summarise its distribution.
train_df['length'] = [len(text) for text in train_df['Text']]
train_df['length'].describe()

count    3826.000000
mean       50.225301
std        63.396208
min         3.000000
25%        17.000000
50%        31.000000
75%        58.000000
max       840.000000
Name: length, dtype: float64

In [25]:
# Fraction of training texts that are longer than 150 characters.
n_long = int((train_df['length'] > 150).sum())
n_long / len(train_df)

0.05279665446941976

In [26]:
# Name of the label column to model; the expression below shows how many
# training rows fall into each of its classes.
task = 'Sub-task B'
train_df.loc[:, task].value_counts()

NGEN    3114
GEN      712
Name: Sub-task B, dtype: int64

In [27]:


# ---------------- Map characters to integer indices ----------------
# Character-level tokenizer with 'UNK' as the out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

# ---- Optional: replace the fitted vocabulary with a fixed alphabet ----
# Remove this section to keep the vocabulary learned from train_texts.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
# Characters are numbered from 1 in alphabet order; index 0 is not assigned to any character.
char_dict = {char: position for position, char in enumerate(alphabet, start=1)}

# Install the fixed alphabet as the tokenizer vocabulary ...
tk.word_index = dict(char_dict)
# ... and give 'UNK' the next index after the largest alphabet index.
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
# ---- End of optional section ----

# Encode both splits as lists of character indices.
# Note that test_texts is rebound here from strings to index sequences.
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

In [28]:

from keras.utils import to_categorical

# ---------------- Inputs ----------------
# Pad (at the end) or truncate every index sequence to exactly 150 entries.
train_data = pad_sequences(train_sequences, maxlen=150, padding='post')
test_data = pad_sequences(test_texts, maxlen=150, padding='post')

# Convert to float32 numpy arrays.
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

# ---------------- Labels ----------------
# The label -> integer-code mapping is defined once, from the training split.
train_df[task] = pd.Categorical(train_df[task])
label_categories = train_df[task].cat.categories
train_df['target_class'] = train_df[task].cat.codes

# Encode the test labels with the SAME categories as the training split.
# Building a separate Categorical from the test labels alone would shift the
# codes whenever the test split lacks (or adds) a class.
test_df[task] = pd.Categorical(test_df[task], categories=label_categories)
test_df['target_class'] = test_df[task].cat.codes
if (test_df['target_class'] < 0).any():
    # A code of -1 means the label is missing or was never seen in training.
    raise ValueError(
        "test split contains missing labels or labels absent from the training split"
    )

# One-hot encode with an explicit width so both splits always have the same
# number of columns.
num_label_classes = len(label_categories)
train_classes = to_categorical(train_df['target_class'], num_classes=num_label_classes)
test_classes = to_categorical(test_df['target_class'], num_classes=num_label_classes)

In [29]:
# Number of test rows per encoded class.
test_df.loc[:, 'target_class'].value_counts()

1    766
0    191
Name: target_class, dtype: int64

In [30]:
# ---------------- Model hyper-parameters ----------------
input_size = 150                   # sequence length fed to the model
vocab_size = len(tk.word_index)    # number of entries in the tokenizer's index
embedding_size = 100

# One row per convolution block: [filters, kernel size, pool size].
# A pool size of -1 means the block has no pooling layer.
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, 3],
]

# Width of each dense layer placed after the convolution stack.
fully_connected_layers = [1024] * 2
# Number of distinct encoded labels in the training split.
num_of_classes = train_df['target_class'].nunique()
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

In [31]:


# ---------------- One-hot embedding weights ----------------
# Shape (vocab_size + 1, vocab_size): row 0 stays all-zero, and the k-th entry
# of tk.word_index gets a one-hot row with its 1 at column (index - 1).
# These weights are built here but are not passed to the Embedding layer below.
embedding_weights = np.zeros((vocab_size + 1, vocab_size))
for row, char_index in enumerate(tk.word_index.values(), start=1):
    embedding_weights[row, char_index - 1] = 1
print('Load')

# ---------------- Embedding layer ----------------
# vocab_size + 1 rows, so that index 0 has an embedding row of its own.
embedding_layer = Embedding(
    vocab_size + 1,
    embedding_size,
    input_length=input_size,
)

# ---------------- Network ----------------
inputs = Input(shape=(input_size,), name='input', dtype='int64')  # (None, input_size)
x = embedding_layer(inputs)                                        # (None, input_size, embedding_size)

# Convolution stack: Conv1D -> ReLU, followed by max-pooling unless the pool size is -1.
for n_filters, kernel_width, pool_width in conv_layers:
    x = Conv1D(n_filters, kernel_width)(x)
    x = Activation('relu')(x)
    if pool_width == -1:
        continue
    x = MaxPooling1D(pool_size=pool_width)(x)

# With input_size = 150 the stack ends at (None, 2, 256), i.e. 512 features once flattened.
x = Flatten()(x)

# Dense head: every fully connected layer is followed by dropout.
for units in fully_connected_layers:
    x = Dense(units, activation='relu')(x)
    x = Dropout(dropout_p)(x)

# Softmax over the label classes.
predictions = Dense(num_of_classes, activation='softmax')(x)

# Assemble, compile and print the architecture.
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

# Shuffle


Load
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 150)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 100)          7000      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 144, 256)          179456    
_________________________________________________________________
activation_7 (Activation)    (None, 144, 256)          0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 48, 256)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 42, 256)           459008    
_________________________________________________________________
activation_8 (Activation)    (None, 42, 256)           0         
_____

In [32]:
# Randomly reorder the training rows, keeping inputs and one-hot labels aligned.
# NOTE(review): no random seed is set in the visible cells, so the order differs between runs.
indices = np.random.permutation(train_data.shape[0])

x_train, y_train = train_data[indices], train_classes[indices]

# The evaluation split is used as-is.
x_test, y_test = test_data, test_classes

In [33]:
# ---------------- Training setup ----------------
epochs = 30
model_file = 'ben_trans_{}.h5'.format(task)
max_f1 = 0  # best weighted F1 so far; a new checkpoint is saved only above this value

# If a checkpoint from an earlier run exists, continue from it: load it, report
# its scores on the evaluation split, and use its weighted F1 as the bar to beat.
if os.path.exists(model_file):
    model = load_model(model_file)
    y_pred = model.predict(x_test, batch_size=64, verbose=1)
    y_pred_bool = y_pred.argmax(axis=1)
    print(classification_report(test_df['target_class'], y_pred_bool))
    max_f1 = f1_score(test_df['target_class'], y_pred_bool, average='weighted')

In [34]:
# Train one epoch at a time so the model can be scored on the evaluation split
# after every epoch; the model is saved whenever its weighted F1 beats max_f1.
y_true = test_df['target_class']
for i in range(epochs):
    print(i)
    model.fit(x_train, y_train,
              validation_data=(x_test, y_test),
              batch_size=128,
              epochs=1,
              verbose=1)
    y_pred = model.predict(x_test, batch_size=64, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    print(classification_report(y_true, y_pred_bool))
    # Weighted F1 computed directly (the same call the checkpoint-resume step
    # uses) instead of building a second classification report just to read it.
    f1s = f1_score(y_true, y_pred_bool, average='weighted')
    if f1s > max_f1:
        print('saved t epoch ', i, ' with f1 ', f1s)
        model.save(model_file)
        max_f1 = f1s

0
Train on 3826 samples, validate on 957 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       191
           1       0.80      1.00      0.89       766

   micro avg       0.80      0.80      0.80       957
   macro avg       0.40      0.50      0.44       957
weighted avg       0.64      0.80      0.71       957

saved t epoch  0  with f1  0.7116891087511698


  'precision', 'predicted', average, warn_for)


1
Train on 3826 samples, validate on 957 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       191
           1       0.80      1.00      0.89       766

   micro avg       0.80      0.80      0.80       957
   macro avg       0.40      0.50      0.44       957
weighted avg       0.64      0.80      0.71       957

2
Train on 3826 samples, validate on 957 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       191
           1       0.80      1.00      0.89       766

   micro avg       0.80      0.80      0.80       957
   macro avg       0.40      0.50      0.44       957
weighted avg       0.64      0.80      0.71       957

3
Train on 3826 samples, validate on 957 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       191
           1       0.80      1.00      0.89       766

   mi

              precision    recall  f1-score   support

           0       0.82      0.28      0.42       191
           1       0.85      0.98      0.91       766

   micro avg       0.84      0.84      0.84       957
   macro avg       0.83      0.63      0.67       957
weighted avg       0.84      0.84      0.81       957

saved t epoch  15  with f1  0.8123141985368203
16
Train on 3826 samples, validate on 957 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.81      0.27      0.40       191
           1       0.84      0.98      0.91       766

   micro avg       0.84      0.84      0.84       957
   macro avg       0.83      0.63      0.66       957
weighted avg       0.84      0.84      0.81       957

17
Train on 3826 samples, validate on 957 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.71      0.28      0.40       191
           1       0.84      0.97      0.90       766

   micro avg   

In [35]:

# Final evaluation on the evaluation split.
# Training saves a checkpoint only when weighted F1 improves, so the weights
# left in `model` after the last epoch are not necessarily the best ones.
# Score the saved checkpoint; fall back to the in-memory model when no
# checkpoint file exists.
final_model = load_model(model_file) if os.path.exists(model_file) else model
y_pred = final_model.predict(x_test, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(test_df['target_class'], y_pred_bool))

              precision    recall  f1-score   support

           0       0.76      0.30      0.43       191
           1       0.85      0.98      0.91       766

   micro avg       0.84      0.84      0.84       957
   macro avg       0.81      0.64      0.67       957
weighted avg       0.83      0.84      0.81       957

