In [2]:
import pickle
from functions import *

# Read the raw training lines: one file holds the positive examples, the other
# the negative ones. Bytes that cannot be decoded are silently dropped
# (errors='ignore').
# NOTE(review): no encoding is given, so the platform default is used - confirm
# this matches how the files were written.
with open('train_pos_full.txt', errors='ignore') as pos_file:
    content_pos = list(pos_file)

with open('train_neg_full.txt', errors='ignore') as neg_file:
    content_neg = list(neg_file)

# Positives first, negatives second; the label list is built in the same
# order (1 for every line of the positive file, 0 for every line of the
# negative file).
content = content_pos + content_neg
X = content
Y = [1 for _ in content_pos] + [0 for _ in content_neg]

# char_preproc comes from the project's `functions` module. Later cells read
# x_train / y_train / x_test / y_test from its result; presumably it also does
# the train/test split reported in the cell output - confirm in functions.py.
data = char_preproc(X, Y, vocab_len=100, binarize=False)

# Cache the preprocessed dataset next to the notebook.
with open('objs.pkl', 'wb') as cache_file:
    pickle.dump(data, cache_file)

Using TensorFlow backend.


Train size: 2450000, test size 50000


In [3]:
# Write a second copy of the preprocessed dataset under /output; a later cell
# reloads it from this path after a kernel restart.
with open('/output/objs.pkl', 'wb') as out_file:
    pickle.dump(data, out_file)

In [1]:
import pickle
# Restore the preprocessed dataset that was pickled to /output/objs.pkl.
# pickle.load can execute arbitrary code, so only point this at a file this
# notebook produced itself.
with open('/output/objs.pkl', 'rb') as in_file:
    data = pickle.load(in_file)

Using TensorFlow backend.


In [19]:
# `os` and `np` are used below; import them explicitly instead of relying on
# `from functions import *` happening to re-export them.
import os
from datetime import datetime

import numpy as np
from keras.layers import Dense, Input, Embedding, Dropout, Conv1D, MaxPooling1D
from keras.layers.core import Flatten
from keras.models import Model
from keras import regularizers
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from functions import *


# conf and preprocess -----------------------------------------
# -------------------------------------------------------------

# settings ---------------------
# ------------------------------
print('IN SETTING')
# EMBEDDING selects the input representation: integer character ids fed through
# an Embedding layer (True) or a (140, 70) input used as-is (False).
EMBEDDING = True
TYPE = 'embedding' if EMBEDDING else 'standard'
# Filename template for ModelCheckpoint; the placeholders are filled in by Keras.
MODELPATH ='models/char-conv-' + TYPE + '-{epoch:02d}-{val_acc:.3f}-{val_loss:.3f}.hdf5'
FILTERS = 500  # not referenced in this cell; the filter counts live in CONV / CONV1
LR = 0.0001 if EMBEDDING else 0.00001

# Each entry describes one Conv1D layer plus an optional MaxPooling1D after it.
# 'pool' is the pooling size, or '' to skip pooling for that layer.
# CONV is an alternative 8-layer stack; it is not used in this cell.
CONV = [
    {'filters':200, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':3},
    {'filters':200, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':3},
    {'filters':160, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':2},
    {'filters':160, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':2},
    {'filters':120, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':1},
    {'filters':120, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':1},
    {'filters':80, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''},
    {'filters':80, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''}
]

# CONV1 is the stack actually built below.
CONV1 = [
    {'filters':500, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':3},
    {'filters':500, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':2},
    {'filters':500, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':1},
    {'filters':500, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''},
    {'filters':500, 'kernel':8, 'strides':1, 'padding':'same', 'reg': 0, 'pool':''}
]


# generate dataset -------------
# ------------------------------
print('IN DATA GENERATION')
# `data` is not created here: it must already be in the kernel, either built by
# char_preproc or reloaded from /output/objs.pkl in an earlier cell.

print("input shape: ", np.shape(data.x_train))



# model architecture ------------------------------------------
# -------------------------------------------------------------


# input and embedding ----------
# ------------------------------
print('IN MODEL CREATION')
if EMBEDDING:
    # 140 integer ids per example, each mapped to a 100-dim vector.
    # NOTE(review): the embedding accepts ids 0..69 while char_preproc was called
    # with vocab_len=100 - confirm the ids it produces stay below 70.
    inputlayer = Input(shape=(140,))
    network = Embedding(70, 100, input_length=140)(inputlayer)

else:
    inputlayer = Input(shape=(140 ,70))
    network = inputlayer

# convolutional layers ---------
# ------------------------------

for C in CONV1:

    # conv layer (reg is 0 in every CONV1 entry, so the L2 penalty is currently inert)
    network = Conv1D(filters=C['filters'], kernel_size=C['kernel'],
                     strides=C['strides'], padding=C['padding'], activation='relu',
                     kernel_regularizer=regularizers.l2(C['reg']))(network)

    # pooling layer - only when an integer pool size is configured ('' = none)
    if isinstance(C['pool'], int):
        network = MaxPooling1D(C['pool'])(network)

# fully connected --------------
# ------------------------------
network = Flatten()(network)
network = Dense(1024, activation='relu')(network)
network = Dropout(0)(network)  # rate 0: placeholder, drops nothing

# output: two-way softmax
ypred = Dense(2, activation='softmax')(network)


# training ----------------------------------------------------
# -------------------------------------------------------------


# callbacks --------------------
# ------------------------------

# tensorboard: one timestamped log directory per run
print('IN FORMALITIES')
TB_DIR = 'logs/' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '_' + TYPE

# exist_ok so that re-running the cell within the same second does not raise
os.makedirs(TB_DIR, exist_ok=True)
tensorboard = TensorBoard(log_dir=TB_DIR)

# early stopping and checkpoint
# NOTE(review): patience=1000 exceeds the 10 epochs trained below, so early
# stopping can never trigger as configured.
estopping = EarlyStopping(monitor='val_acc', patience=1000)
# NOTE(review): `checkpoint` is created but not passed to model.fit below, so no
# checkpoints are written. Registering it would also require the 'models/'
# directory to exist - decide whether it should be enabled.
checkpoint = ModelCheckpoint(filepath=MODELPATH, save_best_only=True)

# model-------------------------
# ------------------------------

optimizer = RMSprop(lr=LR)


model = Model(inputs=inputlayer, outputs=ypred)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

print(TB_DIR)
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

print('IN TRAINING')
# fit and run ------------------
# ------------------------------
# NOTE(review): shuffle=False feeds batches in stored order. The raw lines were
# concatenated as all positives followed by all negatives, so confirm that
# char_preproc shuffles before splitting.
try:
    hist = model.fit(data.x_train,
                     data.y_train,
                     validation_data=(data.x_test, data.y_test),
                     epochs=10,
                     batch_size=250,
                     shuffle=False,
                     verbose=1,
                     callbacks=[estopping, tensorboard])

except KeyboardInterrupt:
    # Allow a manual stop from the notebook without a traceback; `hist` is not
    # assigned in that case.
    print("training stopped")

IN SETTING
IN DATA GENERATION
input shape:  (2450000, 140)
IN MODEL CREATION
IN FORMALITIES
logs/2017-12-20-15-57-00_embedding
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 140)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 140, 100)          7000      
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 140, 500)          400500    
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 46, 500)           0         
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 46, 500)           2000500   
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 23, 500)           0         
_______________

In [20]:
# Save the model after the first training run; a later cell reloads this file
# with load_model to continue training.
model.save('/output/char10.h5')

In [2]:
# `os` is used below; import it explicitly instead of relying on
# `from functions import *` happening to re-export it.
import os
from datetime import datetime

from keras.models import load_model
from keras.layers import Dense, Input, Embedding, Dropout, Conv1D, MaxPooling1D
from keras.layers.core import Flatten
from keras.models import Model
from keras import regularizers
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from functions import *

# Rebuild the run configuration needed for the callbacks after a kernel restart.
EMBEDDING = True
TYPE = 'embedding' if EMBEDDING else 'standard'
TB_DIR = 'logs/' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '_' + TYPE

# exist_ok so that re-running the cell within the same second does not raise
os.makedirs(TB_DIR, exist_ok=True)
tensorboard = TensorBoard(log_dir=TB_DIR)

# early stopping
# NOTE(review): patience=1000 is far larger than the number of epochs trained
# in this notebook, so this callback cannot trigger as configured.
estopping = EarlyStopping(monitor='val_acc', patience=1000)

# Reload the model saved after the first training run.
model = load_model('/output/char10.h5')

In [6]:
# Continue training the reloaded model for 5 more epochs with batch size 5000.
# A manual interrupt from the notebook is reported instead of raising; `hist`
# is not assigned in that case.
try:
    fit_options = dict(
        validation_data=(data.x_test, data.y_test),
        epochs=5,
        batch_size=5000,
        shuffle=False,
        verbose=1,
        callbacks=[estopping, tensorboard],
    )
    hist = model.fit(data.x_train, data.y_train, **fit_options)
except KeyboardInterrupt:
    print("training stopped")

Train on 2450000 samples, validate on 50000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
# Save the model again after the additional training epochs.
model.save('/output/char20.h5')

In [5]:
# Load the test file as a list of raw lines (trailing newlines are kept).
with open('test_data.txt') as test_file:
    content_test = list(test_file)

In [6]:
import re

# Each test line presumably has the form "<id>,<text>" (confirm against
# test_data.txt). Remove exactly one leading "<digits>," prefix.
#
# str.lstrip('1234567890,') treats its argument as a *set of characters* and
# keeps stripping, so it also swallowed the beginning of any text that itself
# starts with digits or commas (e.g. "7,2 days left" became " days left").
# NOTE: re-running this cell strips again if a text itself starts with "<digits>,".
_ID_PREFIX = re.compile(r'^\d+,')
content_test = [_ID_PREFIX.sub('', line, count=1) for line in content_test]

In [7]:
# Clean the raw test lines with the project helper from `functions`.
X_test = cleanup_col(content_test, numbers=True)

# Split every cleaned text into a list of its individual characters.
char_arrs = [list(text) for text in X_test]

# Character-level tokenizer.
# NOTE(review): this fits a brand-new tokenizer on the *test* texts. Keras
# assigns ids from the fitted corpus, so these ids may not match the ones the
# model was trained on, and nothing keeps them below the embedding's input size
# of 70. Confirm how char_preproc tokenizes and reuse that mapping here.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(char_arrs)

# Convert the texts to id sequences, then pad / truncate at the end so every
# sequence has exactly 140 entries (0 is the padding value).
seq = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=140,
    padding='post',
    truncating='post',
    value=0,
)
X_test = seq

In [8]:
# Run the network on the padded test sequences; the result has one row per test
# line and one column per output unit of the final 2-way softmax layer.
ypred = model.predict(X_test, verbose = 1)



In [9]:
# Sanity check: expect (number of test lines, 2).
ypred.shape

(10000, 2)

In [10]:
# Collapse the two softmax columns into one label per test line:
# +1 where column 0 is strictly larger than column 1, otherwise -1.
# NOTE(review): training labels were 1 for the positive file and 0 for the
# negative file. If char_preproc one-hot encodes them in the usual order,
# column 0 is the *negative* class and this mapping is inverted - confirm the
# label encoding before trusting the submission.
col0_wins = ypred[:, 0] > ypred[:, 1]
y = np.where(col0_wins, 1.0, -1.0)

# Number of lines that received +1.
count = int(col0_wins.sum())

In [11]:
# Number of test lines labelled +1 by the previous cell.
count

4664

In [12]:
import pandas as pd

In [13]:
# Build the submission table: a single integer 'Prediction' column indexed by a
# 1-based 'Id'. The index length is taken from `y` rather than a hard-coded
# 10000, so the cell also works when the number of test lines changes.
sub = pd.DataFrame(y, columns=['Prediction'], index=range(1, len(y) + 1))
sub.index.name = 'Id'
sub = sub.astype(int)

In [14]:
# Preview the first rows of the submission table.
sub.head()

Unnamed: 0_level_0,Prediction
Id,Unnamed: 1_level_1
1,-1
2,-1
3,-1
4,1
5,-1


In [15]:
# Write the submission file; the 'Id' index is written as the first column.
sub.to_csv('/output/sample_cnn.csv')