In [1]:
import pandas as pd
import numpy as np
import re
import string
import os
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer

Using TensorFlow backend.


In [2]:
def cleanup_str(st, numbers=False):
    """Normalize one text sample to a restricted lowercase ASCII alphabet.

    The text is stripped and lower-cased, every character outside the kept
    alphabet is replaced by a space, and runs of spaces are collapsed to one.

    Args:
        st: the sample, either ``str`` or UTF-8 encoded ``bytes``.
        numbers: when truthy, ASCII digits are kept as well.

    Returns:
        The cleaned ``str``.
    """
    if isinstance(st, bytes):
        try:
            st = st.decode('utf-8')
        except UnicodeDecodeError:
            print('unicode error: {}'.format(st))
            # Keep whatever can be decoded; undecodable bytes become U+FFFD and
            # are turned into spaces by the filter below. Previously the value
            # stayed `bytes` and the whole sample degraded to a single space.
            st = st.decode('utf-8', errors='replace')

    # Apply the same normalization to str and bytes input. Without lower()
    # every uppercase letter of a str sample would be replaced by a space.
    st = st.strip().lower()

    allowed = string.ascii_lowercase + string.punctuation + ' '
    if numbers:
        allowed += string.digits
    keep = set(allowed)

    # clean string
    st = ''.join(ch if ch in keep else ' ' for ch in st)
    # remove multiple spaces
    return re.sub(' +', ' ', st)


# mapper: cleanup a pd column or list of strings
def cleanup_col(col, numbers=False):
    """Run ``cleanup_str`` over every item of ``col`` and return a list.

    Args:
        col: iterable of samples (e.g. a pandas column or a list of strings).
        numbers: forwarded to ``cleanup_str``.

    Returns:
        A list with one cleaned string per input item, in input order.
    """
    return [cleanup_str(item, numbers=numbers) for item in col]

In [3]:
def binarize_tokenized(X, vocab_len):
    """One-hot encode every token sequence in ``X``.

    A ``LabelBinarizer`` is fitted on the ids ``0 .. vocab_len - 1`` and then
    applied to each sequence separately.

    Args:
        X: iterable of token-id sequences.
        vocab_len: number of distinct ids the binarizer is fitted on.

    Returns:
        A NumPy array stacking the per-sequence binarizer outputs.
    """
    encoder = LabelBinarizer().fit(range(vocab_len))
    one_hot = [encoder.transform(sequence) for sequence in X]
    return np.array(one_hot)


def char_preproc(X, Y, vocab_len, binarize=False, maxlen=60, char_level=False):
    """Clean, tokenize and pad the texts, encode the labels, and build a Dataset.

    Args:
        X: iterable of raw text samples.
        Y: iterable of labels; a label equal to 1 becomes ``[1, 0]``, any
            other value becomes ``[0, 1]``.
        vocab_len: vocabulary size passed to ``binarize_tokenized``; only
            used when ``binarize`` is true.
        binarize: when true, the padded id sequences are one-hot encoded.
        maxlen: length every sequence is padded/truncated to (at the end).
            Must match the input length of the model that consumes the data.
        char_level: forwarded to the Keras ``Tokenizer``.

    Returns:
        A shuffled ``Dataset`` with 2% of the samples held out as test split.
        The fitted tokenizer is attached as ``data.tokenizer`` so that unseen
        text can later be encoded with the same vocabulary.
    """
    # -----------------------------
    # preprocess X ----------------

    # cleanup
    texts = cleanup_col(X, numbers=True)

    # tokenize
    tokenizer = Tokenizer(char_level=char_level)
    tokenizer.fit_on_texts(texts)

    # token sequences
    seq = tokenizer.texts_to_sequences(texts)

    # pad to same length
    seq = pad_sequences(seq, maxlen=maxlen, padding='post', truncating='post', value=0)

    # optionally convert to one-hot
    if binarize:
        features = binarize_tokenized(seq, vocab_len)
    else:
        features = seq

    # ----------------------------
    # preprocess Y and return data

    # one-hot encode the labels
    labels = np.array([[1, 0] if label == 1 else [0, 1] for label in Y])

    # generate and return final dataset
    data = Dataset(features, labels, shuffle=True, testsize=0.02)

    # Keep the fitted vocabulary with the data: encoding new text with a
    # different tokenizer would produce ids that do not match the trained model.
    data.tokenizer = tokenizer

    return data

In [4]:
def load_processed_data(load=True, binarize=False):
    """Return the preprocessed dataset, using the on-disk cache when allowed.

    Args:
        load: when true and the cache file exists, the cached dataset is
            unpickled instead of being recomputed.
        binarize: forwarded to ``char_preproc`` when the data is recomputed.

    Returns:
        ``(data, table)`` where ``table`` is the raw DataFrame read from
        ``data/rt-polarity.csv``, or ``None`` when the cache was used.
    """
    cache_path = 'data/processed/data-ready.pkl'
    table = None

    if load and os.path.isfile(cache_path):
        print("data exists - loading")

        # NOTE(review): pickle.load can execute arbitrary code; only use a
        # cache file that this project wrote itself.
        with open(cache_path, 'rb') as file:
            data = pickle.load(file)
    else:
        print("reading raw data and preprocessing..")
        table = pd.read_csv('data/rt-polarity.csv')
        data = char_preproc(table.text, table.label, 70, binarize)

        # Make sure the cache directory exists; otherwise the open() below
        # fails after all the preprocessing work has already been done.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as file:
            pickle.dump(data, file)

    return (data, table)


class Dataset():
    """Container holding either one dataset or a train/test split of it.

    With ``testsize=None`` the inputs are stored unchanged as ``x_data`` (and
    ``y_data`` when ``y`` is given). Otherwise the first ``testindex`` samples
    become the test split (``x_test``/``y_test``) and the rest the training
    split (``x_train``/``y_train``).

    Args:
        x: indexable samples; must support index-array indexing when
            ``shuffle`` is true (e.g. a NumPy array).
        y: optional targets aligned with ``x``. When omitted in split mode,
            ``y_train`` and ``y_test`` are ``None``.
        testsize: ``None`` for no split, an integer for an absolute number of
            test samples, or a float for a fraction of ``len(x)``.
        shuffle: when true (split mode only), samples are permuted before
            splitting and the permutation is stored as ``si``.
    """

    def __init__(self, x, y=None, testsize=0.2, shuffle=False):

        lend = len(x)

        # Identity checks: `y != None` on a NumPy array is evaluated
        # element-wise and cannot be used as an `if` condition.
        if testsize is None:
            self.x_data = x
            if y is not None:
                self.y_data = y

            print('Single dataset of size {}'.format(lend))
            return

        if shuffle:
            si = np.random.permutation(np.arange(lend))
            x = x[si]
            if y is not None:
                y = y[si]
            self.si = si

        # Integer counts (including NumPy integers) are absolute sizes;
        # anything else is treated as a fraction of the dataset.
        is_count = isinstance(testsize, (int, np.integer)) and not isinstance(testsize, bool)
        if is_count:
            testindex = int(testsize)
        else:
            testindex = int(testsize * lend)

        self.x_train = x[testindex:]
        self.x_test = x[:testindex]
        if y is None:
            self.y_train = None
            self.y_test = None
            n_train, n_test = len(self.x_train), len(self.x_test)
        else:
            self.y_train = y[testindex:]
            self.y_test = y[:testindex]
            n_train, n_test = len(self.y_train), len(self.y_test)
        self.testindex = testindex

        print('Train size: {}, test size {}'.format(n_train, n_test))

In [5]:
# Read both raw text files, one sample per line.
with open('train_pos_full.txt') as f:
    content_pos = list(f)

with open('train_neg_full.txt', errors='ignore') as g:
    content_neg = list(g)

# Positive lines first, then negative ones; the labels follow the same order
# (1 for lines from the positive file, 0 for lines from the negative file).
content = content_pos + content_neg
X = content
Y = [1 for _ in content_pos] + [0 for _ in content_neg]

# NOTE(review): char_preproc pads to 60 tokens, while the model cell declares
# Input(shape=(50,)) - confirm which length is intended.
data = char_preproc(X, Y, vocab_len=100, binarize=False)

# Cache the preprocessed dataset so later sessions can skip the steps above.
with open('/output/objs_word.pkl', 'wb') as f:
    pickle.dump(data, f)

Train size: 2450000, test size 50000


In [None]:
# Repeat the text pipeline at top level so that the fitted tokenizer and the
# padded sequences are available as notebook variables.
X = cleanup_col(X, numbers=True)

# Build the vocabulary from the cleaned text (char_level=False).
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(X)

# Encode the text as id sequences, then pad/truncate at the end so that
# every row holds exactly 50 ids (0 is the padding value).
seq = pad_sequences(
    tokenizer.texts_to_sequences(X),
    maxlen=50,
    padding='post',
    truncating='post',
    value=0,
)

In [None]:
# Largest token id present in the padded sequences; presumably used to pick
# the Embedding input dimension in the model cell (TODO confirm).
seq.max()

In [5]:
import pickle
# Reload the preprocessed dataset from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('/output/objs_word.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
    data = pickle.load(f)

In [None]:
from datetime import datetime
from keras.layers import Dense, Input, Embedding, Dropout, Conv1D, MaxPooling1D
from keras.layers.core import Flatten
from keras.models import Model
from keras import regularizers
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from functions import *


# conf and preprocess -----------------------------------------
# -------------------------------------------------------------

# settings ---------------------
# ------------------------------
print('IN SETTING')
EMBEDDING = True
TYPE = 'embedding' if EMBEDDING else 'standard'
MODELPATH ='models/char-conv-' + TYPE + '-{epoch:02d}-{val_acc:.3f}-{val_loss:.3f}.hdf5'
FILTERS = 500  # not referenced anywhere in this cell
LR = 0.0001 if EMBEDDING else 0.00001
# Embedding input dimension; every token id in the data must be below it.
VOCAB_SIZE = 103127

# Alternative (deeper, narrower) conv stack; not used by the loop below.
CONV = [
    {'filters':200, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':3},
    {'filters':200, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':3},
    {'filters':160, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':2},
    {'filters':160, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':2},
    {'filters':120, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':1},
    {'filters':120, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':1},
    {'filters':80, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''},
    {'filters':80, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''}
]

# Conv stack actually used. A non-integer 'pool' value means "no pooling
# layer after this convolution".
CONV1 = [
    {'filters':950, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':3},
    {'filters':950, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':2},
    {'filters':950, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':1},
    {'filters':950, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''}
]

# generate dataset -------------
# ------------------------------
print('IN DATA GENERATION')
#data, table = load_processed_data(False, not EMBEDDING)
# `data` is expected to exist already (built or unpickled in an earlier cell).

print("input shape: ", np.shape(data.x_train))



# model architecture ------------------------------------------
# -------------------------------------------------------------


# input and embedding ----------
# ------------------------------
print('IN MODEL CREATION')
if EMBEDDING:

    # Take the sequence length from the data instead of hard-coding it, so
    # the model always matches the padding length used during preprocessing.
    seq_len = np.shape(data.x_train)[1]

    # Fail early with a clear message if any token id would fall outside the
    # embedding table.
    max_token = max(
        (int(np.max(part)) for part in (data.x_train, data.x_test) if np.size(part)),
        default=0,
    )
    assert max_token < VOCAB_SIZE, \
        'token id {} does not fit into an embedding of size {}'.format(max_token, VOCAB_SIZE)

    inputlayer = Input(shape=(seq_len,))
    network = Embedding(VOCAB_SIZE, 200, input_length=seq_len)(inputlayer)

else:
    inputlayer = Input(shape=(140 ,70))
    network = inputlayer

# convolutional layers ---------
# ------------------------------

for C in CONV1:

    # conv layer
    network = Conv1D(filters=C['filters'], kernel_size=C['kernel'],
                     strides=C['strides'], padding=C['padding'], activation='relu',
                     kernel_regularizer=regularizers.l2(C['reg']))(network)

    if type(C['pool']) != int:
        continue

    # pooling layer
    network = MaxPooling1D(C['pool'])(network)

# fully connected --------------
# ------------------------------
network = Flatten()(network)
network = Dense(1024, activation='relu')(network)
network = Dropout(0)(network)  # rate 0: currently drops nothing

# output
ypred = Dense(2, activation='softmax')(network)


# training ----------------------------------------------------
# -------------------------------------------------------------


# callbacks --------------------
# ------------------------------

# tensorboard
print('IN FORMALITIES')
TB_DIR = 'logs/' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '_' + TYPE

os.makedirs(TB_DIR)
tensorboard = TensorBoard(log_dir=TB_DIR)

# early stopping and checkpoint
# NOTE: with patience=1000 and epochs=10 below, early stopping cannot trigger.
estopping = EarlyStopping(monitor='val_acc', patience=1000)
# The checkpoint directory must exist before the first model file is written.
os.makedirs(os.path.dirname(MODELPATH), exist_ok=True)
checkpoint = ModelCheckpoint(filepath=MODELPATH, save_best_only=True)

# model-------------------------
# ------------------------------

optimizer = RMSprop(lr=LR)


model = Model(inputs=inputlayer, outputs=ypred)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

print(TB_DIR)
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

print('IN TRAINING')
# fit and run ------------------
# ------------------------------
hist = None  # stays None when training is interrupted before fit() returns
try:
    hist = model.fit(data.x_train,
                     data.y_train,
                     validation_data=(data.x_test, data.y_test),
                     epochs=10,
                     batch_size=1000,
                     shuffle=False,
                     verbose=1,
                     # `checkpoint` was created but never registered before,
                     # so no best-model file was ever written.
                     callbacks=[estopping, checkpoint, tensorboard])

except KeyboardInterrupt:    
    print("training stopped")

IN SETTING
IN DATA GENERATION
input shape:  (2450000, 50)
IN MODEL CREATION
IN FORMALITIES
logs/2017-12-21-01-07-02_embedding
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 50, 200)           20625400  
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 50, 950)           1520950   
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 16, 950)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 16, 950)           7220950   
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 8, 950)            0         
________________

In [None]:
# Write the current model to 'word.h5' in the working directory.
model.save('word.h5')

In [None]:
# Each line is assumed to look like "<id>,<text>" (inferred from the original
# prefix stripping - TODO confirm against the file). Remove only that single
# leading "<digits>," prefix: lstrip('1234567890,') would also eat digits and
# commas that belong to the beginning of the text itself.
with open('test_data.txt') as h:
    content_test = [re.sub(r'^\d+,', '', line) for line in h]

In [None]:
# Clean the test text the same way as the training text.
X_test = cleanup_col(content_test, numbers=True)

# Encode with the tokenizer that was fitted on the TRAINING text in the
# earlier tokenization cell. Fitting a fresh Tokenizer on the test set would
# assign different ids to the same words, so the inputs would no longer match
# the embedding learned during training.
seq = tokenizer.texts_to_sequences(X_test)

# pad to the same length the model was trained on
seq = pad_sequences(seq, maxlen=50, padding='post', truncating='post', value=0)
X_test = seq

In [None]:
# Two-column softmax output for every test sequence.
# NOTE: this rebinds `ypred`, which the model-definition cell used for the
# output layer tensor.
ypred = model.predict(X_test, verbose = 1)

In [None]:
# Column 0 corresponds to the [1, 0] target used for label 1, so a strictly
# larger value there is mapped to +1; every other row is mapped to -1.
is_positive = ypred[:, 0] > ypred[:, 1]
y = np.where(is_positive, 1.0, -1.0)
# Number of rows mapped to +1.
count = int(is_positive.sum())

In [None]:
# Number of test samples predicted as +1.
count

In [None]:
import pandas as pd

In [None]:
# One row per prediction with ids starting at 1. The index length is derived
# from `y`, so the cell no longer fails when the test set does not contain
# exactly 10000 samples.
sub = pd.DataFrame(y, columns=['Prediction'], index=range(1, len(y) + 1))
sub.index.name = 'Id'
# Predictions are stored as floats (+1.0 / -1.0); write them as integers.
sub = sub.astype(int)

In [None]:
# Quick visual check of the first rows of the submission frame.
sub.head()

In [None]:
# Write the submission (index column 'Id', value column 'Prediction').
sub.to_csv('sample_cnn.csv')

In [None]:
from keras.models import load_model
# Restore the model from 'word.h5'; this rebinds `model`.
model = load_model('word.h5')

In [None]:
from datetime import datetime
from keras.layers import Dense, Input, Embedding, Dropout, Conv1D, MaxPooling1D
from keras.layers.core import Flatten
from keras.models import Model
from keras import regularizers
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from functions import *
EMBEDDING = True
TYPE = 'embedding' if EMBEDDING else 'standard'

# Fresh, timestamped TensorBoard directory for this training run.
run_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
TB_DIR = 'logs/' + run_stamp + '_' + TYPE
tensorboard = TensorBoard(log_dir=TB_DIR)

# early stopping
estopping = EarlyStopping(monitor='val_acc', patience=1000)

# Train the current `model` for five more epochs on the existing split.
fit_options = dict(
    validation_data=(data.x_test, data.y_test),
    epochs=5,
    batch_size=250,
    shuffle=False,
    verbose=1,
    callbacks=[estopping, tensorboard],
)
try:
    hist = model.fit(data.x_train, data.y_train, **fit_options)
except KeyboardInterrupt:
    print("training stopped")

In [None]:
# Overwrite 'word.h5' with the model after the additional training.
model.save('word.h5')