# Language Model

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import string
import numpy as np
import keras
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# Extract unique risk descriptions into a file and remove punctuation

In [None]:
def extract_unique_rec(in_file='D:/RiskProject/NewRiskDataFile.csv',
                       out_file='F:/unique_desc/Unique_Riskdesc.csv'):
    """Write the de-duplicated, punctuation-free risk descriptions to a CSV file.

    Only the 'RISK DESCRIPTION' column of ``in_file`` is read (ISO-8859-1
    encoded). Duplicate descriptions are dropped, keeping the last occurrence,
    every character listed in ``string.punctuation`` is deleted, and the
    result is written to ``out_file`` without header or index.

    Parameters
    ----------
    in_file : str
        Path of the source CSV; must contain a 'RISK DESCRIPTION' column.
    out_file : str
        Path of the CSV to create (overwritten if it exists).

    Notes
    -----
    Duplicates are removed *before* punctuation is stripped, so two
    descriptions that differ only in punctuation both survive and become
    identical rows in the output.
    """
    translator = str.maketrans('', '', string.punctuation)

    records = pd.read_csv(in_file, encoding='ISO-8859-1',
                          usecols=['RISK DESCRIPTION'], index_col=False)
    unique_records = records.drop_duplicates('RISK DESCRIPTION', keep='last')

    # Missing descriptions arrive as NaN (a float), which has no .translate();
    # pass anything that is not a string through unchanged instead of crashing.
    stripped = unique_records['RISK DESCRIPTION'].apply(
        lambda x: x.translate(translator) if isinstance(x, str) else x)

    # assign() builds a new frame rather than writing into the result of
    # drop_duplicates(), which avoids pandas' SettingWithCopyWarning.
    cleaned = unique_records.assign(**{'RISK DESCRIPTION': stripped})

    cleaned.to_csv(out_file, index=False, header=False)

# Split the unique records file into smaller chunks of 50 lines each

In [None]:
def split_desc_small_chunks(in_file='F:/unique_desc/Unique_Riskdesc.csv',
                            out_pattern='F:/unique_desc/unique%d.csv',
                            max_num_lines=50):
    """Split a text file into consecutive chunk files of ``max_num_lines`` lines.

    Chunk ``k`` (0-based) is written to ``out_pattern % k`` and holds lines
    ``k * max_num_lines`` through ``(k + 1) * max_num_lines - 1`` of
    ``in_file``; the final chunk may be shorter.

    Parameters
    ----------
    in_file : str
        Path of the file to split; it is read line by line.
    out_pattern : str
        ``%``-style template with a single ``%d`` placeholder for the chunk
        number.
    max_num_lines : int
        Maximum number of lines per chunk; must be at least 1.

    Raises
    ------
    ValueError
        If ``max_num_lines`` is smaller than 1.

    Notes
    -----
    A chunk file is only created once there is a line to put into it, so an
    input whose length is an exact multiple of ``max_num_lines`` no longer
    leaves an empty trailing file, and an empty input creates no file at all.
    The split is purely line based: a quoted CSV field that contains a line
    break would be cut across two chunks.
    """
    if max_num_lines < 1:
        raise ValueError('max_num_lines must be at least 1, got %r' % (max_num_lines,))

    fout = None
    try:
        with open(in_file) as fin:
            for i, line in enumerate(fin):
                # First line of a new chunk: roll over to the next output file.
                if i % max_num_lines == 0:
                    if fout is not None:
                        fout.close()
                    fout = open(out_pattern % (i // max_num_lines), 'w')
                fout.write(line)
    finally:
        # Close the last chunk even when reading or writing failed part-way.
        if fout is not None:
            fout.close()

# Define data

In [3]:
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """

# Model 1: One word in One word out sequences

In [4]:
# Fit the tokenizer on the corpus and turn the whole text into one flat list of word ids.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# Pair every word id with its successor: [current_word, next_word].
sequences = [encoded[start:start + 2] for start in range(len(encoded) - 1)]

print('Total Sequences: %d' % len(sequences))

# Column 0 is the model input, column 1 the word to predict.
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

Total Sequences: 24


In [5]:
# Tokenizer word indices start at 1, so one extra slot is needed for index 0.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [6]:
# One-hot encode the next-word ids for the softmax output layer.
# The targets are read from `sequences` instead of re-encoding `y` in place,
# so running this cell a second time does not one-hot an already one-hot array.
y = keras.utils.to_categorical(sequences[:, 1], num_classes = vocab_size)

In [7]:
# One-word-in / one-word-out network:
# 10-dimensional embedding -> 50-unit LSTM -> softmax over the vocabulary.
model = Sequential([
    keras.layers.Embedding(vocab_size, 10, input_length=1),
    keras.layers.LSTM(50),
    keras.layers.Dense(vocab_size, activation='softmax'),
])
model.summary()

W0828 12:17:48.727929  5872 deprecation_wrapper.py:119] From d:\users\pendnkr\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0828 12:17:48.745930  5872 deprecation_wrapper.py:119] From d:\users\pendnkr\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0828 12:17:48.748931  5872 deprecation_wrapper.py:119] From d:\users\pendnkr\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train with Adam on categorical cross-entropy; verbose=2 prints one line per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=X, y=y, epochs=500, verbose=2)

W0828 12:17:55.192575  5872 deprecation_wrapper.py:119] From d:\users\pendnkr\appdata\local\programs\python\python36\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0828 12:17:55.219578  5872 deprecation_wrapper.py:119] From d:\users\pendnkr\appdata\local\programs\python\python36\lib\site-packages\keras\backend\tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.

W0828 12:17:55.488605  5872 deprecation.py:323] From d:\users\pendnkr\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0828 12:17:56.332689  5872 deprecation_wrapper.py:119] From d:\users\pendnkr\appdata\local\programs\python\pyth

Epoch 1/500
 - 1s - loss: 3.0903 - acc: 0.1250
Epoch 2/500
 - 0s - loss: 3.0894 - acc: 0.1250
Epoch 3/500
 - 0s - loss: 3.0888 - acc: 0.1250
Epoch 4/500
 - 0s - loss: 3.0880 - acc: 0.1250
Epoch 5/500
 - 0s - loss: 3.0872 - acc: 0.1250
Epoch 6/500
 - 0s - loss: 3.0864 - acc: 0.0833
Epoch 7/500
 - 0s - loss: 3.0856 - acc: 0.1250
Epoch 8/500
 - 0s - loss: 3.0848 - acc: 0.1250
Epoch 9/500
 - 0s - loss: 3.0840 - acc: 0.1250
Epoch 10/500
 - 0s - loss: 3.0831 - acc: 0.2083
Epoch 11/500
 - 0s - loss: 3.0823 - acc: 0.2083
Epoch 12/500
 - 0s - loss: 3.0814 - acc: 0.2083
Epoch 13/500
 - 0s - loss: 3.0805 - acc: 0.2083
Epoch 14/500
 - 0s - loss: 3.0796 - acc: 0.2083
Epoch 15/500
 - 0s - loss: 3.0786 - acc: 0.2083
Epoch 16/500
 - 0s - loss: 3.0777 - acc: 0.2083
Epoch 17/500
 - 0s - loss: 3.0767 - acc: 0.2083
Epoch 18/500
 - 0s - loss: 3.0757 - acc: 0.2083
Epoch 19/500
 - 0s - loss: 3.0747 - acc: 0.2083
Epoch 20/500
 - 0s - loss: 3.0737 - acc: 0.2083
Epoch 21/500
 - 0s - loss: 3.0726 - acc: 0.2083
E

Epoch 171/500
 - 0s - loss: 1.9052 - acc: 0.5417
Epoch 172/500
 - 0s - loss: 1.8913 - acc: 0.5417
Epoch 173/500
 - 0s - loss: 1.8773 - acc: 0.5417
Epoch 174/500
 - 0s - loss: 1.8633 - acc: 0.5417
Epoch 175/500
 - 0s - loss: 1.8494 - acc: 0.5417
Epoch 176/500
 - 0s - loss: 1.8355 - acc: 0.5417
Epoch 177/500
 - 0s - loss: 1.8216 - acc: 0.5833
Epoch 178/500
 - 0s - loss: 1.8077 - acc: 0.5833
Epoch 179/500
 - 0s - loss: 1.7938 - acc: 0.5833
Epoch 180/500
 - 0s - loss: 1.7800 - acc: 0.5833
Epoch 181/500
 - 0s - loss: 1.7661 - acc: 0.5833
Epoch 182/500
 - 0s - loss: 1.7523 - acc: 0.5833
Epoch 183/500
 - 0s - loss: 1.7386 - acc: 0.6250
Epoch 184/500
 - 0s - loss: 1.7248 - acc: 0.6250
Epoch 185/500
 - 0s - loss: 1.7111 - acc: 0.6250
Epoch 186/500
 - 0s - loss: 1.6974 - acc: 0.6250
Epoch 187/500
 - 0s - loss: 1.6837 - acc: 0.6667
Epoch 188/500
 - 0s - loss: 1.6700 - acc: 0.6667
Epoch 189/500
 - 0s - loss: 1.6564 - acc: 0.6667
Epoch 190/500
 - 0s - loss: 1.6427 - acc: 0.6667
Epoch 191/500
 - 0s 

Epoch 339/500
 - 0s - loss: 0.3836 - acc: 0.8750
Epoch 340/500
 - 0s - loss: 0.3809 - acc: 0.8750
Epoch 341/500
 - 0s - loss: 0.3783 - acc: 0.8750
Epoch 342/500
 - 0s - loss: 0.3757 - acc: 0.8750
Epoch 343/500
 - 0s - loss: 0.3731 - acc: 0.8750
Epoch 344/500
 - 0s - loss: 0.3706 - acc: 0.8750
Epoch 345/500
 - 0s - loss: 0.3681 - acc: 0.8750
Epoch 346/500
 - 0s - loss: 0.3657 - acc: 0.8750
Epoch 347/500
 - 0s - loss: 0.3633 - acc: 0.8750
Epoch 348/500
 - 0s - loss: 0.3609 - acc: 0.8750
Epoch 349/500
 - 0s - loss: 0.3586 - acc: 0.8750
Epoch 350/500
 - 0s - loss: 0.3563 - acc: 0.8750
Epoch 351/500
 - 0s - loss: 0.3541 - acc: 0.8750
Epoch 352/500
 - 0s - loss: 0.3519 - acc: 0.8750
Epoch 353/500
 - 0s - loss: 0.3498 - acc: 0.8750
Epoch 354/500
 - 0s - loss: 0.3477 - acc: 0.8750
Epoch 355/500
 - 0s - loss: 0.3456 - acc: 0.8750
Epoch 356/500
 - 0s - loss: 0.3435 - acc: 0.8750
Epoch 357/500
 - 0s - loss: 0.3415 - acc: 0.8750
Epoch 358/500
 - 0s - loss: 0.3396 - acc: 0.8750
Epoch 359/500
 - 0s 

<keras.callbacks.History at 0x53dfcf8>

In [9]:
# Predict the word that follows a single seed word.
in_text = 'the'
print(in_text)
encoded = tokenizer.texts_to_sequences([in_text])[0]
# One row per seed word, one column per time step (the model was built with input_length=1).
encoded = np.array(encoded).reshape(-1, 1)
# Sequential.predict_classes() has been removed from newer Keras/TensorFlow
# releases; the arg-max over the softmax output gives the same class ids.
yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
# Reverse lookup table, so no array-vs-int comparison is needed to find the word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
for class_index in yhat:
    word = index_to_word.get(int(class_index))
    # Index 0 has no word attached; print nothing for it, as before.
    if word is not None:
        print(word)

the
hill


# Model 2: Line Based Sequences

In [10]:
# For every line of the corpus collect all of its prefixes that are at least
# two words long: first two words, first three words, ... up to the full line.
sequences = []
for line in data.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
print("total sequences: ",len(sequences))

total sequences:  21


In [11]:
# Left-pad ('pre') every prefix to the length of the longest one so they stack into one array.
max_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, padding='pre', maxlen=max_len)
print("max seq lenght: ",max_len)

max seq lenght:  7


In [12]:
# Every padded row is split into its context (all ids but the last) and its
# target (the last id); the target is one-hot encoded for the softmax layer.
sequences = np.array(sequences)
X = sequences[:, :-1]
y = keras.utils.to_categorical(sequences[:, -1], num_classes=vocab_size)

In [13]:
# Line-based network; the input length is max_len - 1 because the last
# column of every padded sequence is used as the target.
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, 10, input_length=max_len - 1),
    keras.layers.LSTM(50),
    keras.layers.Dense(vocab_size, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 6, 10)             220       
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train with Adam on categorical cross-entropy; verbose=2 prints one line per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=X, y=y, epochs=500, verbose=2)

Epoch 1/500
 - 1s - loss: 3.0910 - acc: 0.0476
Epoch 2/500
 - 0s - loss: 3.0899 - acc: 0.0476
Epoch 3/500
 - 0s - loss: 3.0887 - acc: 0.0476
Epoch 4/500
 - 0s - loss: 3.0874 - acc: 0.0476
Epoch 5/500
 - 0s - loss: 3.0861 - acc: 0.0952
Epoch 6/500
 - 0s - loss: 3.0847 - acc: 0.1429
Epoch 7/500
 - 0s - loss: 3.0832 - acc: 0.0952
Epoch 8/500
 - 0s - loss: 3.0817 - acc: 0.0952
Epoch 9/500
 - 0s - loss: 3.0802 - acc: 0.0952
Epoch 10/500
 - 0s - loss: 3.0785 - acc: 0.0952
Epoch 11/500
 - 0s - loss: 3.0768 - acc: 0.0952
Epoch 12/500
 - 0s - loss: 3.0751 - acc: 0.0952
Epoch 13/500
 - 0s - loss: 3.0732 - acc: 0.0952
Epoch 14/500
 - 0s - loss: 3.0713 - acc: 0.0952
Epoch 15/500
 - 0s - loss: 3.0692 - acc: 0.0952
Epoch 16/500
 - 0s - loss: 3.0670 - acc: 0.0952
Epoch 17/500
 - 0s - loss: 3.0647 - acc: 0.0952
Epoch 18/500
 - 0s - loss: 3.0623 - acc: 0.0952
Epoch 19/500
 - 0s - loss: 3.0597 - acc: 0.0952
Epoch 20/500
 - 0s - loss: 3.0570 - acc: 0.0952
Epoch 21/500
 - 0s - loss: 3.0540 - acc: 0.0952
E

Epoch 171/500
 - 0s - loss: 0.6498 - acc: 0.8571
Epoch 172/500
 - 0s - loss: 0.6442 - acc: 0.8571
Epoch 173/500
 - 0s - loss: 0.6388 - acc: 0.8571
Epoch 174/500
 - 0s - loss: 0.6337 - acc: 0.8571
Epoch 175/500
 - 0s - loss: 0.6285 - acc: 0.8571
Epoch 176/500
 - 0s - loss: 0.6233 - acc: 0.8571
Epoch 177/500
 - 0s - loss: 0.6182 - acc: 0.8571
Epoch 178/500
 - 0s - loss: 0.6134 - acc: 0.8571
Epoch 179/500
 - 0s - loss: 0.6085 - acc: 0.8571
Epoch 180/500
 - 0s - loss: 0.6036 - acc: 0.8571
Epoch 181/500
 - 0s - loss: 0.5987 - acc: 0.8571
Epoch 182/500
 - 0s - loss: 0.5940 - acc: 0.8571
Epoch 183/500
 - 0s - loss: 0.5893 - acc: 0.8571
Epoch 184/500
 - 0s - loss: 0.5846 - acc: 0.8571
Epoch 185/500
 - 0s - loss: 0.5800 - acc: 0.8571
Epoch 186/500
 - 0s - loss: 0.5755 - acc: 0.8571
Epoch 187/500
 - 0s - loss: 0.5710 - acc: 0.8571
Epoch 188/500
 - 0s - loss: 0.5665 - acc: 0.8571
Epoch 189/500
 - 0s - loss: 0.5620 - acc: 0.8571
Epoch 190/500
 - 0s - loss: 0.5577 - acc: 0.8571
Epoch 191/500
 - 0s 

Epoch 339/500
 - 0s - loss: 0.2013 - acc: 0.9524
Epoch 340/500
 - 0s - loss: 0.2001 - acc: 0.9524
Epoch 341/500
 - 0s - loss: 0.1992 - acc: 0.9524
Epoch 342/500
 - 0s - loss: 0.1981 - acc: 0.9524
Epoch 343/500
 - 0s - loss: 0.1969 - acc: 0.9524
Epoch 344/500
 - 0s - loss: 0.1957 - acc: 0.9524
Epoch 345/500
 - 0s - loss: 0.1946 - acc: 0.9524
Epoch 346/500
 - 0s - loss: 0.1936 - acc: 0.9524
Epoch 347/500
 - 0s - loss: 0.1927 - acc: 0.9524
Epoch 348/500
 - 0s - loss: 0.1915 - acc: 0.9524
Epoch 349/500
 - 0s - loss: 0.1904 - acc: 0.9524
Epoch 350/500
 - 0s - loss: 0.1895 - acc: 0.9524
Epoch 351/500
 - 0s - loss: 0.1884 - acc: 0.9524
Epoch 352/500
 - 0s - loss: 0.1873 - acc: 0.9524
Epoch 353/500
 - 0s - loss: 0.1864 - acc: 0.9524
Epoch 354/500
 - 0s - loss: 0.1853 - acc: 0.9524
Epoch 355/500
 - 0s - loss: 0.1843 - acc: 0.9524
Epoch 356/500
 - 0s - loss: 0.1834 - acc: 0.9524
Epoch 357/500
 - 0s - loss: 0.1824 - acc: 0.9524
Epoch 358/500
 - 0s - loss: 0.1814 - acc: 0.9524
Epoch 359/500
 - 0s 

<keras.callbacks.History at 0xfb53f60>

In [15]:
# Predict the word that follows a (left-padded) seed phrase.
in_text = 'to fetch a pail of water'
print(in_text)
encoded = tokenizer.texts_to_sequences([in_text])[0]
# Bring the seed to the model's input length (max_len - 1), padding on the left.
encoded = pad_sequences([encoded], maxlen=max_len - 1, padding='pre')
# Sequential.predict_classes() has been removed from newer Keras/TensorFlow
# releases; the arg-max over the softmax output gives the same class ids.
yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
# Reverse lookup table, so no array-vs-int comparison is needed to find the word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
for class_index in yhat:
    word = index_to_word.get(int(class_index))
    # Index 0 has no word attached; print nothing for it, as before.
    if word is not None:
        print(word)

to fetch a pail of water
water


# Model 3: Two words in one word out Sequence

In [16]:
# Re-encode the full corpus as one flat list of word ids.
encoded = tokenizer.texts_to_sequences([data])[0]

# Sliding window of three ids: two context words followed by the word to predict.
sequences = [encoded[start:start + 3] for start in range(len(encoded) - 2)]

print('Total Sequences: %d' % len(sequences))

# The first two columns are the model input, the last column is the target.
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

Total Sequences: 23


In [17]:
# Tokenizer word indices start at 1, so one extra slot is needed for index 0.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [18]:
# One-hot encode the next-word ids for the softmax output layer.
# The targets are read from `sequences` instead of re-encoding `y` in place,
# so running this cell a second time does not one-hot an already one-hot array.
y = keras.utils.to_categorical(sequences[:, -1], num_classes = vocab_size)

In [19]:
# Two-words-in / one-word-out network:
# 10-dimensional embedding -> 50-unit LSTM -> softmax over the vocabulary.
model = Sequential([
    keras.layers.Embedding(vocab_size, 10, input_length=2),
    keras.layers.LSTM(50),
    keras.layers.Dense(vocab_size, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_3 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train with Adam on categorical cross-entropy; verbose=2 prints one line per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=X, y=y, epochs=500, verbose=2)

Epoch 1/500
 - 1s - loss: 3.0913 - acc: 0.0435
Epoch 2/500
 - 0s - loss: 3.0906 - acc: 0.1304
Epoch 3/500
 - 0s - loss: 3.0898 - acc: 0.1304
Epoch 4/500
 - 0s - loss: 3.0890 - acc: 0.1739
Epoch 5/500
 - 0s - loss: 3.0882 - acc: 0.1739
Epoch 6/500
 - 0s - loss: 3.0873 - acc: 0.1304
Epoch 7/500
 - 0s - loss: 3.0864 - acc: 0.0870
Epoch 8/500
 - 0s - loss: 3.0854 - acc: 0.0870
Epoch 9/500
 - 0s - loss: 3.0845 - acc: 0.0870
Epoch 10/500
 - 0s - loss: 3.0835 - acc: 0.0870
Epoch 11/500
 - 0s - loss: 3.0826 - acc: 0.0870
Epoch 12/500
 - 0s - loss: 3.0816 - acc: 0.0870
Epoch 13/500
 - 0s - loss: 3.0805 - acc: 0.0870
Epoch 14/500
 - 0s - loss: 3.0795 - acc: 0.0870
Epoch 15/500
 - 0s - loss: 3.0784 - acc: 0.0870
Epoch 16/500
 - 0s - loss: 3.0773 - acc: 0.0870
Epoch 17/500
 - 0s - loss: 3.0762 - acc: 0.0870
Epoch 18/500
 - 0s - loss: 3.0750 - acc: 0.0870
Epoch 19/500
 - 0s - loss: 3.0738 - acc: 0.0870
Epoch 20/500
 - 0s - loss: 3.0725 - acc: 0.0870
Epoch 21/500
 - 0s - loss: 3.0712 - acc: 0.0870
E

Epoch 171/500
 - 0s - loss: 1.1389 - acc: 0.7826
Epoch 172/500
 - 0s - loss: 1.1215 - acc: 0.7826
Epoch 173/500
 - 0s - loss: 1.1043 - acc: 0.7826
Epoch 174/500
 - 0s - loss: 1.0872 - acc: 0.8261
Epoch 175/500
 - 0s - loss: 1.0703 - acc: 0.8261
Epoch 176/500
 - 0s - loss: 1.0536 - acc: 0.8261
Epoch 177/500
 - 0s - loss: 1.0370 - acc: 0.8261
Epoch 178/500
 - 0s - loss: 1.0206 - acc: 0.8261
Epoch 179/500
 - 0s - loss: 1.0044 - acc: 0.8261
Epoch 180/500
 - 0s - loss: 0.9884 - acc: 0.8261
Epoch 181/500
 - 0s - loss: 0.9725 - acc: 0.8696
Epoch 182/500
 - 0s - loss: 0.9569 - acc: 0.8696
Epoch 183/500
 - 0s - loss: 0.9414 - acc: 0.8696
Epoch 184/500
 - 0s - loss: 0.9260 - acc: 0.9130
Epoch 185/500
 - 0s - loss: 0.9109 - acc: 0.9130
Epoch 186/500
 - 0s - loss: 0.8959 - acc: 0.9130
Epoch 187/500
 - 0s - loss: 0.8811 - acc: 0.9130
Epoch 188/500
 - 0s - loss: 0.8664 - acc: 0.9130
Epoch 189/500
 - 0s - loss: 0.8519 - acc: 0.9130
Epoch 190/500
 - 0s - loss: 0.8377 - acc: 0.9130
Epoch 191/500
 - 0s 

 - 0s - loss: 0.1084 - acc: 0.9565
Epoch 339/500
 - 0s - loss: 0.1078 - acc: 0.9565
Epoch 340/500
 - 0s - loss: 0.1073 - acc: 0.9565
Epoch 341/500
 - 0s - loss: 0.1067 - acc: 0.9565
Epoch 342/500
 - 0s - loss: 0.1062 - acc: 0.9565
Epoch 343/500
 - 0s - loss: 0.1057 - acc: 0.9565
Epoch 344/500
 - 0s - loss: 0.1051 - acc: 0.9565
Epoch 345/500
 - 0s - loss: 0.1046 - acc: 0.9565
Epoch 346/500
 - 0s - loss: 0.1041 - acc: 0.9565
Epoch 347/500
 - 0s - loss: 0.1037 - acc: 0.9565
Epoch 348/500
 - 0s - loss: 0.1032 - acc: 0.9565
Epoch 349/500
 - 0s - loss: 0.1027 - acc: 0.9565
Epoch 350/500
 - 0s - loss: 0.1023 - acc: 0.9565
Epoch 351/500
 - 0s - loss: 0.1018 - acc: 0.9565
Epoch 352/500
 - 0s - loss: 0.1014 - acc: 0.9565
Epoch 353/500
 - 0s - loss: 0.1009 - acc: 0.9565
Epoch 354/500
 - 0s - loss: 0.1005 - acc: 0.9565
Epoch 355/500
 - 0s - loss: 0.1001 - acc: 0.9565
Epoch 356/500
 - 0s - loss: 0.0997 - acc: 0.9565
Epoch 357/500
 - 0s - loss: 0.0993 - acc: 0.9565
Epoch 358/500
 - 0s - loss: 0.0989

<keras.callbacks.History at 0x13afe748>

In [21]:
# Predict the word that follows a two-word seed.
test = "jill came"
encoded_test = tokenizer.texts_to_sequences([test])[0]
encoded_test = np.array(encoded_test)
# A single sample of two time steps, matching the model's input_length=2.
encoded_test = encoded_test.reshape(1,2)
# Sequential.predict_classes() has been removed from newer Keras/TensorFlow
# releases; the arg-max over the softmax output gives the same class ids.
yhat = np.argmax(model.predict(encoded_test, verbose=0), axis=-1)
# Reverse lookup table, so no array-vs-int comparison is needed to find the word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
for class_index in yhat:
    word = index_to_word.get(int(class_index))
    # Index 0 has no word attached; print nothing for it, as before.
    if word is not None:
        print(word)

tumbling
