In [163]:
import numpy as np
import pandas as pd
import collections
import os
import string
import nltk
import operator
from numpy import array
from itertools import islice
from collections import Counter
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/talat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/talat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Generic Functions

In [20]:
def sentence_to_array(sentence):
    word_list = sentence.split(' ')
    
    clean_word_list = []
    for word in word_list:
        op= word_filter(word)
        if len(op) > 0:
            clean_word_list.append(op)
            
    return clean-word_list

In [119]:
xs = [[0 for x in range(10)] for y in range(5)]
xs[3][4] = 20
print(xs[3][4])

20


In [80]:
def remove_punctuations(word):
    translator = str.maketrans('', '', string.punctuation)
    word = word.translate(translator)
    return word

In [81]:
def word_filter(word):
    
    word = word.lower()
    
    word = remove_punctuations(word)
    
    # Remove stop words
    stop_words = stopwords.words('english')
    if word in stop_words:
        return ''
    
    # Remove Common words
    common_words = ['bid', 'offer', 'buy', 'sell', 'put', 'minus', 'plus', 'lifted', 'hit']
    if word in common_words:
        return ''
    
    # Remove numbers
    if is_number(word):
        return ''
    
    # Remove commas
    word = word.replace(',','')
    
    return word

## Data

In [2]:
IP_FILEPATH = '../data/processed/rnn_input_target.csv'
OP_FILEPATH = '../data/processed/rnn_output_chat.txt'

In [51]:
def read_data_as_words(filename):
    df_data = pd.read_csv(IP_FILEPATH)
    return df_data

In [4]:
def write_text(filename, txt):
    if os.path.exists(filename):
        append_write = 'a' # append if already exists
    else:
        append_write = 'w' # make a new file if not
        
    with open(filename, append_write) as f:
        f.write(txt + '\n')

In [43]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [82]:
df_data = read_data_as_words(IP_FILEPATH)
df_data['word'] = df_data['word'].apply(lambda x: x.lower())
df_data['market'] = df_data['market'].apply(lambda x: x.lower())
print(df_data.shape)

(46286, 3)


In [83]:
df_data['word'] = df_data['word'].apply(lambda x: word_filter(x))
df_data['market'] = df_data['market'].apply(lambda x: remove_punctuations(x))
print(df_data.shape)

(46286, 3)


In [84]:
df_data = df_data.loc[df_data['word'] != '']
print(df_data.shape)

(40074, 3)


In [85]:
inputs_txt = df_data['word'].values
targets_txt = df_data['market'].values
sentence_idx = df_data['sentence_idx'].values

In [86]:
vocab = list(set(inputs_txt))
vocab_size = len(vocab)
print('Number of unique words in %s is %d' % (IP_FILEPATH, vocab_size))
print('Total number of words in %s is %d' % (IP_FILEPATH, len(inputs_txt)))

target_vocab = list(set(targets_txt))
target_vocab_size = len(target_vocab)
print('Number of unique targets in %s is %d' % (IP_FILEPATH, target_vocab_size))
print('Total number of targets in %s is %d' % (IP_FILEPATH, len(targets_txt)))

Number of unique words in ../data/processed/rnn_input_target.csv is 5834
Total number of words in ../data/processed/rnn_input_target.csv is 40074
Number of unique targets in ../data/processed/rnn_input_target.csv is 82
Total number of targets in ../data/processed/rnn_input_target.csv is 40074


In [87]:
# List of all the unique markets
print(target_vocab)

['bakned', 'clkedm', 'dapl space', 'wtimidwti', 'whwcs', 'bakfob', 'syn', 'sw', 'niowti', 'bls', 'bakram dapl', 'baknt', 'sc', 'midale', 'wcs', 'clkhard', 'nioponcawti', 'uhccromer', 'baklitm', 'wtimid', 'ff', 'psy', 'cil', 'bakdivh', 'baktren', 'bakepp', 'bakbl', 'wcw', 'awbwcs', 'bakstan dapl', 'lt swwr', 'wcscush', 'brnwcs', 'llkwcs', 'swkersw', 'bakstan', 'bakdapl', 'bakjoco', 'lt swguern', 'epnd space', 'bakwilliam', 'bakjoco dapl', 'bakdore', 'pony express space', 'wtimeh', 'csw', 'wcc', 'double h space', 'pso', 'sybhardwcs', 'lt swcasper', 'lsb', 'blswti', 'pipeline space', 'bakepp dapl', 'wti', 'bak truck throughput', 'bakmountrail', 'nio', 'llk', 'ahswcs', 'bakalex', 'gsw', 'c5', 'awb', 'bakdorelt swguern', 'shlwti', 'bakpat', 'clkhardwcs', 'cal', 'baktruck rate', 'dj common', 'mxsguern', 'bakbert', 'cdbwcs', 'clkedmwcs', 'sybedmwcs', 'ssp', 'uhc', 'baktren dapl', 'bakneder', 'llbwcs']


### Dictionaries for Lookup

In [88]:
def build_dataset(words):
    count = collections.Counter(words).most_common()
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

In [90]:
word_to_ix, ix_to_word = build_dataset(inputs_txt)
target_to_ix, ix_to_target = build_dataset(targets_txt)

### Generating Test and Train Split

In [94]:
test_size = 0.2 # in Percentages

# Get the number of sentences
unique_sentence_count = np.unique(sentence_idx)
n_count = len(unique_sentence_count)
test_size = round(n_count * test_size)
print('Unique Sentences: ', n_count)

test_sentence_ixes = np.random.choice(unique_sentence_count, size=test_size, replace=True)
train_sentence_ixes = list(set(sentence_idx) - set(test_sentence_ixes))

print('Test sentence count', len(test_sentence_ixes))
print('Train sentence count', len(train_sentence_ixes))

df_train = df_data.loc[df_data['sentence_idx'].isin(train_sentence_ixes)]
df_test = df_data.loc[df_data['sentence_idx'].isin(test_sentence_ixes)]

print('Test df shape', df_test.shape)
print('Train df shape', df_train.shape)

Unique Sentences:  2419
Test sentence count 484
Train sentence count 1982
Test df shape (7524, 3)
Train df shape (32550, 3)


In [103]:
train_inputs_txt = df_train['word'].values
train_targets_txt = df_train['market'].values
train_sentence_idx = df_train['sentence_idx'].values

test_inputs_txt = df_test['word'].values
test_targets_txt = df_test['market'].values
test_sentence_idx = df_test['sentence_idx'].values

## Hyperparameters

In [95]:
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

## Model Parameters

In [186]:
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(target_vocab_size, hidden_size) * 0.01

# Biases
bh = np.zeros((hidden_size, 1))
by = np.zeros((target_vocab_size, 1))

print('Wxh shape: ', Wxh.shape)
print('Whh shape: ', Whh.shape)
print('Why shape: ', Why.shape)

print('bh shape: ', bh.shape)
print('by shape: ', by.shape)

Wxh shape:  (100, 5834)
Whh shape:  (100, 100)
Why shape:  (82, 100)
bh shape:  (100, 1)
by shape:  (82, 1)


### Functions

Input:
1. list of input words
2. list of target words
3. the previous hidden state

Output:
1. Loss
2. Gradient for eah pareamters between layers
3. Last hidden state

In [97]:
# xs[t] is the vector that encode the words at position t
# ps[t] is the probabilities for next words

def lossfn(inputs, targets, hprev, bh, by):
    
    # Empty dictionaries
    xs = {} # Store the 1-hot encoded input words for each of the 25 timesteps
    hs = {} # Hidden state outputs
    ys = {} # Store the target variables
    ps = {} # Noramized probabilities for the targets, ys
    
    hs[-1] = np.copy(hprev) # Creating a copy of hprev in the index -1 for chaining purposes
    loss = 0
    
    # Forward pass
    # For each of the input
    for t in range(len(inputs)):
        
        # One hot encoding for the input
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
                
        # Compute the hidden states
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        
        # Compute the target states
        ys[t] = np.dot(Why, hs[t]) + by
        
        # Compute the probabilities for the output
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) 

        # unnormalized log probabilities for next words
        ys[t] = np.dot(Why, hs[t]) + by 
        
        # probabilities for next words
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  
        
        # Compute the log loss - Softmax (or cross entropy loss)
        loss += -np.log(ps[t][targets[t], 0])
        
    # Back propagation
    # Each weight & bias will have a gradient that is computed
    # Declare some variables of the same shapes as the corresponding forward passes variables
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    
    dhnext = np.zeros_like(hs[0])
    
    # The gradient is computed from backward to the first --> in the reverse order
    for t in reversed(range(len(inputs))):
        
        # Output probabilities for the 't'th element in reverse order
        dy = np.copy(ps[t])
        
        dy[targets[t]] -= 1
        
        dWhy += np.dot(dy, hs[t].T)
        
        dby += dy
        
        dh = np.dot(Why.T, dy) + dhnext # Backprop into h
        
        dhraw = (1 - hs[t] * hs[t]) * dh # Backprop through non-linearity
        
        bh += dhraw #derivative of hidden bias
        
        dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
        dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
        dhnext = np.dot(Whh.T, dhraw) 
  

    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
      
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]    

### Prediction Function

In [190]:
# def sample(h, seed_ix, n, op_filepath=None):
def predict(h, test_set, ground_truth, op_filepath=None):
    
    # At a given iteration, sample a list of integers and generate statements
    
    # h = memory state - Initiailly after training of n samples, it is the hidden state of t iterations of training
    # test_set - array - to be predicted
    # ground_truth - array - the actual ground_tuth
    
    
    # Convert the sentence into array of words
    # word_array = sentence_to_array(sentence)
    word_array = test_set
    
    # Convert the array of words as indices
    x_input = word_array_to_ixes(word_array, word_to_ix)
    
    # List to store indices of predicted words
    predicted_values = []
    
    # Do it for n words for which values are to be predicted
    for t in range(len(x_input)):
        
        x = x_input[t]
        x = x.reshape(len(x), 1)
        
        # print('Wxh Shape: ', Wxh.shape)
        # print('x Shape: ', x.shape)
        # print('Whh Shape: ', Whh.shape)
        # print('h Shape: ', h.shape)
        # print('bh Shape: ', bh.shape)
        
        # print('Why Shape: ', Why.shape)
        # print('by Shape: ', by.shape)
        # print('-------')
        
        # Compute the hidden state given an input, previous hidden state and Weights
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        
        # Compute the output (unnormalized)
        y = np.dot(Why, h) + by
        # print('y Shape :', y.shape)
        
        # Compute the probabilities for the next words
        p = np.exp(y) / np.sum(np.exp(y))
        # print('p Shape :', p.shape)
        
        # Pick the output (here index) with the highest probabilities
        ix = np.random.choice(range(target_vocab_size), p=p.ravel())
        
        # index, value = max(enumerate(my_list), key=operator.itemgetter(1))
        
        # Create a vector for the selected index and make it 1
        # The current x will be the seed for the next iteration
        targ = np.zeros((target_vocab_size, 1))
        targ[ix] = 1
        
        predicted_value = ix_to_target[ix]
        
        if op_filepath is not None:
            write_text(filename=op_filepath, txt=predicted_value+'\n')
        
        # Add the predicted value to the list
        predicted_values.append(predicted_value)
        
    print(predicted_values)
    
    return predicted_values

In [191]:
ix_to_target

{0: 'bakbl',
 1: 'uhc',
 2: 'dj common',
 3: 'lt swguern',
 4: 'bakstan',
 5: 'bakdapl',
 6: 'bakjoco dapl',
 7: 'bakepp',
 8: 'wcscush',
 9: 'cal',
 10: 'blswti',
 11: 'wcs',
 12: 'baknt',
 13: 'baktren',
 14: 'wti',
 15: 'bakdore',
 16: 'niowti',
 17: 'pipeline space',
 18: 'bakfob',
 19: 'pony express space',
 20: 'sw',
 21: 'ahswcs',
 22: 'wcw',
 23: 'bakalex',
 24: 'csw',
 25: 'clkhard',
 26: 'bakbert',
 27: 'epnd space',
 28: 'bakjoco',
 29: 'ff',
 30: 'cil',
 31: 'uhccromer',
 32: 'wtimidwti',
 33: 'c5',
 34: 'baktren dapl',
 35: 'double h space',
 36: 'lsb',
 37: 'gsw',
 38: 'llkwcs',
 39: 'llk',
 40: 'bakdorelt swguern',
 41: 'brnwcs',
 42: 'sc',
 43: 'clkedm',
 44: 'whwcs',
 45: 'bakdivh',
 46: 'pso',
 47: 'bakned',
 48: 'dapl space',
 49: 'bakram dapl',
 50: 'baklitm',
 51: 'bakwilliam',
 52: 'clkedmwcs',
 53: 'ssp',
 54: 'sybedmwcs',
 55: 'clkhardwcs',
 56: 'bls',
 57: 'nio',
 58: 'wtimeh',
 59: 'bakmountrail',
 60: 'awb',
 61: 'llbwcs',
 62: 'bakepp dapl',
 63: 'mxsguern',

In [192]:
df_data.loc[df_data['market'] == 'dj common']

Unnamed: 0,word,market,sentence_idx
11803,paul,dj common,395
11804,laugh,dj common,395
11805,least,dj common,395
11806,good,dj common,395
11807,spirit,dj common,395
11808,mean,dj common,395
11809,long,dj common,395
11810,pay,dj common,395
11811,bills,dj common,395
11812,lol,dj common,395


In [99]:
def get_target_from_indices(ixes):
    return ' '.join(ix_to_target[ix] for ix in ixes)

In [102]:
def compute_accuracy(predicted_output, ground_truth, only_last_word = False):
    
    '''
    predicted_output = list of predictions that were generated by the RNN
    ground_truth = outputs to which the accuracy is computed against
    only_last_word = instead of computing the accuracy at each word, the accuracy is computed only for 
    the last word of the sentence
    '''
    
    if len(predicted_output) != len(ground_truth):
        print('Exception: The size of predicted output (%d) is different from ground truth (%d)' 
              % len(predicted_output), len(ground_truth))
        return -1.00
    
    pred_length = len(predicted_output)
    accurately_predicted = 0

    for idx in range(pred_length):
        if predicted_output[idx].strip() == ground_truth[idx].strip():
            accurately_predicted += 1
    
    return accurately_predicted / pred_length

### Training

In [189]:
n = 0
p = 0

# Memory variables for adagrad
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)

mbh = np.zeros_like(bh)
mby = np.zeros_like(by)

smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 

training_iter = 10e5
sampling_iter = 10e2

while n < training_iter:
    
    # INPUT/TARGETS - Prepare inputs
    if n == 0 or p + seq_length+1 > len(train_inputs_txt):
        p = 0 # p = 0 is going back to the start of the data
        hprev = np.zeros((hidden_size, 1)) # At the start, the hidden state is initilized to 1
        
    inputs = [word_to_ix[ch] for ch in train_inputs_txt[p: p+seq_length]] # INPUTS
    targets = [target_to_ix[ch] for ch in train_targets_txt[p: p+seq_length]] # TARGETS
    
    # LOSS - Compute the loss
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossfn(inputs, targets, hprev, bh, by)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    # PREDICTION - Compute the predicted after every sampling_iter iterations
    if n % sampling_iter == 0:
        
        loss_string = '(' + str(n) + ',' + str(smooth_loss) + ')'
        print(loss_string) # print progress
        write_text(OP_FILEPATH, loss_string)
        
        predict(hprev, test_inputs_txt, test_targets_txt, OP_FILEPATH)
        
    # OPTIMIZATION (ADAGRAD) perform parameter update with Adagrad                                                                                                                                                    
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
    
    p += seq_length
    n += 1 # Counter for the iterations

(0,216.68609125394556)
(7524, 5834)
['llbwcs', 'psy', 'ff', 'uhccromer', 'bakmountrail', 'lt swguern', 'wti', 'nioponcawti', 'sw', 'pony express space', 'clkhardwcs', 'nio', 'wti', 'lt swwr', 'syn', 'ahswcs', 'wcw', 'lt swwr', 'bakdivh', 'epnd space', 'bakalex', 'bak truck throughput', 'nioponcawti', 'wcscush', 'bakepp dapl', 'bakstan dapl', 'pony express space', 'wcw', 'wti', 'clkhardwcs', 'wtimidwti', 'bakepp', 'sybedmwcs', 'whwcs', 'bakalex', 'baknt', 'sw', 'wti', 'sw', 'wtimid', 'bakdivh', 'baktren dapl', 'bakneder', 'wtimidwti', 'c5', 'sc', 'bakjoco', 'wcscush', 'bls', 'midale', 'pso', 'csw', 'ssp', 'llk', 'sybedmwcs', 'bakfob', 'bakjoco dapl', 'baktren', 'sw', 'csw', 'wtimidwti', 'ahswcs', 'bakdivh', 'bakned', 'psy', 'bak truck throughput', 'bakdivh', 'uhccromer', 'baknt', 'uhc', 'gsw', 'ssp', 'bakalex', 'dj common', 'sybedmwcs', 'baktren dapl', 'bakstan dapl', 'blswti', 'syn', 'bakstan dapl', 'csw', 'bakdivh', 'bakned', 'clkhard', 'bakalex', 'awb', 'bakstan dapl', 'bakbert', 'll

KeyboardInterrupt: 

In [165]:
def word_array_to_ixes(word_array, word_to_ixes):
    
    x_len = len(word_array) # Number of words in test
    y_len = len(word_to_ixes) # Number of total vocab
    
    xs = [[0 for x in range(y_len)] for y in range(x_len)]
    
    # print(len(xs), len(xs[0]))
    # print('xlen', x_len, 'ylen', y_len)
    
    for idx in range(x_len):
        word = word_array[idx]
        word_index = word_to_ixes[word]
        # print('word', word, '- word_index', word_index, '- idx', idx)
        xs[idx][word_index] = 1
            
    xs = array(xs)
    return xs