In [3]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Embedding, Reshape, Dropout
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
import tensorflow as tf
def custom_loss(y_true, y_pred):
    mse = tf.keras.losses.MeanSquaredError()
    loss1 = mse(y_true, y_pred)
    pred_mean = tf.reduce_mean(y_pred)
    true_mean = tf.cast(tf.reduce_mean(y_true), tf.float32)
    loss2 = (pred_mean - true_mean)**2
    loss3 = tf.reduce_sum(tf.square(tf.subtract(tf.cast(y_true, tf.float32), y_pred)))
    loss = loss3 #loss1 + 0.5*loss2
    return loss

def train_predict_sandhi_window(dtrain, dtest):
    batch_size = 64  # Batch size for training.
    epochs = 60  # Number of epochs to train for.
    latent_dim = 64  # Latent dimensionality of the encoding space.

    # Vectorize the data.
    inputs = []
    targets = []
    characters = set()
    
    for data in dtrain:
        target = np.array(list(data[1]))
        input_word = data[0]
    
        inputs.append(input_word)
        targets.append(target)
    
        for char in input_word:
            if char not in characters:
                characters.add(char)

    maxlen = max([len(s) for s in inputs])
    print(inputs[0])
    print(maxlen)

    """
    * is used as padding character
    """
    characters.add('*')
    char2idx = dict([(char, i) for i, char in enumerate(characters)])
    num_tokens = len(characters)
    
    X_train = [[char2idx[c] for c in w] for w in inputs]
    X_train = pad_sequences(maxlen=maxlen, sequences=X_train, padding="post", value=char2idx['*'])
    
    Y_train = targets
    Y_train = pad_sequences(maxlen=maxlen, sequences=Y_train, padding="post", value=0.0)
    Y_train = np.array(Y_train).reshape(-1, maxlen, 1)
    
    inputs = []
    targets = []
    for data in dtest:
        target = np.array(list(data[1]))
        input_word = data[0]
    
        inputs.append(input_word)
        targets.append(target)
    
        for char in input_word:
            if char not in characters:
                characters.add(char)
    
    print('Number of training samples:', len(X_train))
    print('Number of unique tokens:', num_tokens)
    
    # Define an input sequence and process it.
    inputword = Input(shape=(maxlen,))
    embed = Embedding(input_dim=num_tokens, output_dim=8, input_length=maxlen, mask_zero=True)(inputword)
    bilstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))
    out, forward_h, forward_c, backward_h, backward_c = bilstm(embed)
    outd = Dropout(0.5)(out)
    outputtarget = Dense(1, activation="sigmoid")(outd)
    
    model = Model(inputword, outputtarget)
    model.compile(optimizer='rmsprop', loss=custom_loss, metrics=['accuracy'])
    model.summary()
    model.fit(X_train, Y_train, batch_size, epochs, validation_split=0.1)
    return model, char2idx, maxlen
    
with open("final_data_slp1.csv", 'r', encoding='utf-8') as f:
    odl = f.readlines()
dl = []
for ol in odl:
    lol = ol.split(',')
    dl.append([lol[0], lol[2]])

dtrain, dtest = train_test_split(dl, test_size=0.2, random_state=1)
model, char2idx, maxlen = train_predict_sandhi_window(dtrain, dtest)

viparyayopArohI
72
Number of training samples: 69214
Number of unique tokens: 49


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 72)]              0         
                                                                 
 embedding (Embedding)       (None, 72, 8)             392       
                                                                 
 bidirectional (Bidirection  [(None, 72, 128),         37376     
 al)                          (None, 64),                        
                              (None, 64),                        
                              (None, 64),                        
                              (None, 64)]                        
                                                                 
 dropout (Dropout)           (None, 72, 128)           0         
                                            

Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [7]:
from tqdm import tqdm

def test_model(dtest, model, char2idx, maxlen):
    np.set_printoptions(precision=2, suppress=True)
    passed = 0
    failed = 0
    total_samasa = 0
    correct_samasa = 0
    inputs = []
    targets = []
    for data in dtest:
        target = np.array(list(data[1]))
        input_word = data[0]
    
        inputs.append(input_word)
        targets.append(target)
    
    X_test = [[char2idx[c] for c in w] for w in inputs]
    X_test = pad_sequences(maxlen=maxlen, sequences=X_test, padding="post", value=char2idx['*'])
    
    Y_test = targets
    Y_test = pad_sequences(maxlen=maxlen, sequences=Y_test, padding="post", value=0.0)
    Y_test = np.array(Y_test).reshape(-1, maxlen, 1)
   
    startlist = []
    fp = open("failed.txt", 'w')
    for i in tqdm(range(X_test.shape[0])):
        test = X_test[i].reshape((-1, maxlen))
        res = model.predict(test, verbose=0)
        res = res.reshape((maxlen))
        dup = np.copy(res)
        act = Y_test[i].reshape((maxlen))

        wordlen = 0
        for j in range(maxlen):
            if X_test[i][j] == char2idx['*']:
                break
            else:
                wordlen = wordlen + 1

        res = res[0:wordlen]
        act = act[0:wordlen]
        origres = res
        
        for j in range(wordlen):
            if(res[j] >= 0.5):
                res[j] = 1
            else:
                res[j] = 0
                
        ires = res.astype(int)
        iact = act.astype(int)
        temp = np.multiply(ires, iact)
        total_samasa = total_samasa + np.sum(iact)
        correct_samasa = correct_samasa + np.sum(temp)

        comparison = ires == iact
        
        if comparison.all():
            passed = passed + 1
        else:
            failed = failed + 1
            fp.write(str(ires))
            fp.write('\n')
            fp.write(str(iact))
            fp.write('\n')
            fp.write('*****************************************************\n')

    fp.close()
    print(passed)
    print(failed)
    print(passed*100/(passed+failed))
    print(correct_samasa)
    print(total_samasa)
    print(correct_samasa*100/total_samasa)

    return startlist

test_model(dtest, model, char2idx, maxlen)

100%|██████████| 17304/17304 [26:22<00:00, 10.93it/s]

15036
2268
86.89320388349515
23717
25861
91.70952399365841





[]

In [12]:
# Save model and test files
model.save('stage1_bilstm.h5')
fh = open('stage1_char2idx.txt', 'w')
data = str(char2idx)
fh.write(data)
fh.close()

In [14]:
fp = open("dtest.csv", "w")
for data in dtest:
    fp.write(data[0]+','+data[1]+'\n')
fp.close()