In [14]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import string
from sklearn.model_selection import train_test_split

# Data Reading

In [15]:
MULTILABEL_PATH = '../data/processed/aligned/multilabel/'
MULTILABEL_FILE = 'multilabel_aligned.csv'
MARKETCHAT_FILE = 'marketsandchats.csv'

In [23]:
# Read input market chat file
df_market_chat = pd.read_csv(MULTILABEL_PATH + MARKETCHAT_FILE, delimiter=',')

# Download basic packages

In [24]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/talat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/talat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Preprocessing Methods

In [32]:
def _filter_tokenize (words, common_words):
    
    '''
    Tokenize the chats and doing the required filtering of chats
    '''
    
    stop_words = stopwords.words('english')
    regex_digit = re.compile('[+/-]?\d*\.?\d+')

    words = nltk.word_tokenize(words)
    # print(words)

    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]
    # print(words)
    
    # Remove numbers
    words = [word for word in words if not word.isnumeric()]
    # print(words)
    
    # Remove words with numbers
    words = [word for word in words if len(regex_digit.findall(word)) == 0]
    # print(words)
    
    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]
    # print(words)
    
    # Strip newline and spaces
    words = [word.strip('\n\\n\r ') for word in words]
    # print(words)
    
    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # print(words)
    
    # Remove common words
    words = [word for word in words if word not in common_words]
    # print(words)
    
    # Remove word if only one character
    words = [word for word in words if len(word) > 1]
    # print(words)
    
    words = ' '.join(words)

    return words

In [26]:
df_market_chat['message'].head()

0    there you are mr tofu Here I am, hugin trees, ...
1    there you are mr tofu Here I am, hugin trees, ...
2    yo I do like this lets try to use it bid July ...
3    yo I do like this lets try to use it bid July ...
4    yo I do like this lets try to use it bid July ...
Name: message, dtype: object

In [20]:
COMMON_WORDS = ['bid', 'offer', 'buy', 'sell', 'put', 'minus', 'plus', 'lifted', 'hit']
df_market_chat['message'] = df_market_chat['message'].apply(lambda x: _filter_tokenize(x, COMMON_WORDS))

In [21]:
df_market_chat['message'].head()

0    
1    
2    
3    
4    
Name: message, dtype: object

# Sequence Classification using LSTM

In [2]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

Using TensorFlow backend.
  return f(*args, **kwds)


Data Processing

In [3]:
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [11]:
X_test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [4]:
# truncate and pad input sequences
max_chat_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_chat_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_chat_length)

In [6]:
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_chat_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x13a101400>

In [7]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.32%
