In [None]:
# coding: utf-8
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

import pandas as pd
import theano
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dropout
from keras.optimizers import SGD, Adadelta
from keras.models import Sequential
import sys

def setConfig(mod = 1, cross = 0, trains = 8000, tests = 100, c = 4, maxep = 70,
              trainp = None, testp = None):
    """Store the run configuration in module-level globals.

    Sets the globals ``mode``, ``cv``, ``trainpath``, ``trainsize``,
    ``testpath``, ``testsize``, ``classes`` and ``maxepoch``.

    Parameters
    ----------
    mod : value stored in ``mode``.
    cross : value stored in ``cv``.
    trains : value stored in ``trainsize``.
    tests : value stored in ``testsize``.
    c : value stored in ``classes``.
    maxep : value stored in ``maxepoch``.
    trainp : value stored in ``trainpath``. When None, a module-level
        ``trainp`` is used if one exists, otherwise ``trainpath`` is None.
    testp : value stored in ``testpath``; same fallback rule as ``trainp``.
    """
    global mode, cv, trainpath, trainsize, testpath, testsize, classes, maxepoch

    # The original body read `trainp` / `testp` without receiving them, which
    # raised NameError unless some other cell had defined them. They are now
    # explicit arguments; a same-named notebook global is still honoured.
    notebook_scope = globals()
    if trainp is None:
        trainp = notebook_scope.get('trainp')
    if testp is None:
        testp = notebook_scope.get('testp')

    mode = mod
    cv = cross
    trainpath = trainp
    trainsize = trains
    testpath = testp
    testsize = tests
    classes = c
    maxepoch = maxep
    

In [None]:
def readVectorData(fileName, GLOVE_DIR = 'glove/'):
    """Load whitespace-separated word vectors into a dictionary.

    Each non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as its float32 coefficients.

    Parameters
    ----------
    fileName : name of the vector file inside ``GLOVE_DIR``.
    GLOVE_DIR : directory that contains the vector file.

    Returns
    -------
    dict mapping each word to a 1-D float32 numpy array. If a word occurs
    on several lines, the last occurrence wins.
    """
    embeddings_index = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(os.path.join(GLOVE_DIR, fileName)) as vector_file:
        for line in vector_file:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to index.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

In [None]:
def loadAuthData(authorList, doc_id):
    """Fetch labelled documents for the given authors through an SSH tunnel.

    Opens an SSH tunnel to the database host (local port 5400 forwarded to
    remote port 5432) and calls
    ``database_query.getAuthData(5400, authorList, doc_id)``. The result is
    iterated as a DataFrame with ``author_id`` and ``doc_content`` columns.

    Parameters
    ----------
    authorList : sequence of author ids. An author's position in this
        sequence becomes its numeric label.
    doc_id : forwarded unchanged to ``database_query.getAuthData``.

    Returns
    -------
    (texts, labels, labels_index) where ``texts`` is the list of document
    contents, ``labels`` the matching list of label ids and ``labels_index``
    a dict mapping label id -> author id.

    Raises
    ------
    ValueError if a returned row has an ``author_id`` not in ``authorList``.
    """
    # Imported lazily so the notebook can be loaded without these modules.
    import database_query
    from sshtunnel import SSHTunnelForwarder

    # The original overwrote the argument with a hard-coded [2, 121, 111],
    # silently ignoring whatever the caller requested.
    authorList = list(authorList)

    # NOTE(security): credentials were hard-coded in the notebook. They can
    # now be overridden through the environment; the previous values remain
    # as fallbacks so existing runs keep working.
    ssh_user = os.environ.get('STYLOMETRY_SSH_USER', 'stylometry')
    ssh_password = os.environ.get('STYLOMETRY_SSH_PASSWORD', 'stylometry')

    with SSHTunnelForwarder(('srn02.cs.cityu.edu.hk', 22),
                            ssh_username=ssh_user,
                            ssh_password=ssh_password,
                            remote_bind_address=('localhost', 5432),
                            local_bind_address=('localhost', 5400)):
        textToUse = database_query.getAuthData(5400, authorList, doc_id)

    texts = textToUse['doc_content'].tolist()
    labels = [authorList.index(author_id)
              for author_id in textToUse['author_id'].tolist()]

    # Label id -> author id. The {0: 0} placeholder from the original is
    # kept so an empty authorList still yields the same mapping as before.
    labels_index = {0: 0}
    labels_index.update(enumerate(authorList))

    print('Found %s texts.' % len(texts))
    return (texts, labels, labels_index)

In [None]:
def loadDocData(authorList, doc_id, labels_index):
    """Fetch the text content for one document id through an SSH tunnel.

    Opens an SSH tunnel to the database host (local port 5400 forwarded to
    remote port 5432) and calls ``database_query.getDocData(5400, doc_id)``.
    The result is read as a DataFrame with a ``doc_content`` column.

    Parameters
    ----------
    authorList : unused; kept for interface compatibility.
    doc_id : forwarded unchanged to ``database_query.getDocData``.
    labels_index : unused; kept for interface compatibility.

    Returns
    -------
    list of the ``doc_content`` values of the returned rows.
    """
    # Imported lazily so the notebook can be loaded without these modules.
    import database_query
    from sshtunnel import SSHTunnelForwarder

    # NOTE(security): credentials were hard-coded in the notebook. They can
    # now be overridden through the environment; the previous values remain
    # as fallbacks so existing runs keep working.
    ssh_user = os.environ.get('STYLOMETRY_SSH_USER', 'stylometry')
    ssh_password = os.environ.get('STYLOMETRY_SSH_PASSWORD', 'stylometry')

    with SSHTunnelForwarder(('srn02.cs.cityu.edu.hk', 22),
                            ssh_username=ssh_user,
                            ssh_password=ssh_password,
                            remote_bind_address=('localhost', 5432),
                            local_bind_address=('localhost', 5400)):
        textToUse = database_query.getDocData(5400, doc_id)

    # The original also built `labels` / `labels_index` here but never
    # returned them; that dead code raised ValueError whenever the document's
    # author was missing from authorList, so it has been removed.
    texts = textToUse['doc_content'].tolist()

    print('Found %s texts.' % len(texts))
    return texts

In [1]:
def preProcessTrainVal(texts, labels, MAX_SEQUENCE_LENGTH = 1000, MAX_NB_WORDS = 20000, VALIDATION_SPLIT = 0.2):
    """Vectorize texts and split them into training and validation sets.

    The texts are tokenized with a Keras ``Tokenizer`` fitted on ``texts``,
    converted to integer sequences and padded to ``MAX_SEQUENCE_LENGTH``.
    ``labels`` are one-hot encoded with ``to_categorical``. Samples are then
    shuffled with ``np.random.shuffle`` and the last
    ``int(VALIDATION_SPLIT * n_samples)`` of them become the validation set.

    Parameters
    ----------
    texts : list of text samples.
    labels : list of integer label ids, one per text.
    MAX_SEQUENCE_LENGTH : ``maxlen`` passed to ``pad_sequences``.
    MAX_NB_WORDS : ``nb_words`` passed to ``Tokenizer``.
    VALIDATION_SPLIT : fraction of the samples held out for validation.

    Returns
    -------
    (trainX, trainY, valX, valY)
    """
    # Vectorize the text samples into a 2D integer tensor.
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    # The original stored these as X / y but then used `data` and
    # `labels.shape`, which failed before any split could happen.
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    label_matrix = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', label_matrix.shape)

    # Shuffle data and labels with the same permutation.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    label_matrix = label_matrix[indices]

    # Use an explicit split point: with zero validation samples the slice
    # `[:-0]` would be empty and hand every sample to the validation set.
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    split_at = data.shape[0] - nb_validation_samples

    trainX = data[:split_at]
    trainY = label_matrix[:split_at]
    valX = data[split_at:]
    valY = label_matrix[split_at:]

    return (trainX, trainY, valX, valY)

In [None]:
def preProcessTest(texts, labels = None, MAX_SEQUENCE_LENGTH = 1000, MAX_NB_WORDS = 20000, tokenizer = None):
    """Vectorize test texts (and optionally their labels).

    The texts are converted to integer sequences, padded to
    ``MAX_SEQUENCE_LENGTH`` and shuffled with ``np.random.shuffle``. When
    ``labels`` is given, it is one-hot encoded with ``to_categorical`` and
    shuffled with the same permutation.

    Parameters
    ----------
    texts : list of text samples.
    labels : optional list of integer label ids, one per text.
    MAX_SEQUENCE_LENGTH : ``maxlen`` passed to ``pad_sequences``.
    MAX_NB_WORDS : ``nb_words`` for the tokenizer created when none is given.
    tokenizer : optional already-fitted Keras ``Tokenizer``. When None, a new
        one is fitted on ``texts``, as the original code did.

    Returns
    -------
    testX when ``labels`` is None, otherwise (testX, testY).
    """
    # NOTE(review): a tokenizer fitted on the test texts assigns word ids
    # independently of the training vocabulary; pass the training tokenizer
    # when the ids must agree with a trained embedding layer.
    if tokenizer is None:
        tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    # The original stored this as X but then used `data`, which failed.
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)

    # NOTE(review): rows are shuffled, so the output order no longer matches
    # the order of `texts` -- confirm callers do not rely on alignment.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    testX = data[indices]

    if labels is not None:
        # The original called `.shape` and fancy-indexed the raw label list
        # instead of the encoded matrix.
        label_matrix = to_categorical(np.asarray(labels))
        print('Shape of label tensor:', label_matrix.shape)
        testY = label_matrix[indices]
        return (testX, testY)

    return testX

In [None]:
def compileModel(classes, EMBEDDING_DIM = 100, MAX_SEQUENCE_LENGTH = 1000, CONVOLUTION_FEATURE = 256, 
                 DENSE_FEATURE = 1024, DROP_OUT = 0.3, nb_words = None, embedding_matrix = None):
    """Build and compile the convolutional text classifier.

    Architecture, in order: an Embedding layer initialised from
    ``embedding_matrix``; six 1-D convolutions with ``CONVOLUTION_FEATURE``
    filters (kernel sizes 7, 7, 3, 3, 3, 3) with max pooling of length 3
    after the 1st, 2nd and 6th; Flatten; two ReLU Dense layers of
    ``DENSE_FEATURE`` units, each followed by Dropout; and a softmax Dense
    layer with ``classes`` units. The model is compiled with categorical
    cross-entropy, SGD (lr=0.01, momentum=0.9, Nesterov) and the accuracy
    metric.

    Parameters
    ----------
    classes : number of output units of the final softmax layer.
    EMBEDDING_DIM : output dimension of the embedding layer.
    MAX_SEQUENCE_LENGTH : input length of the embedding layer.
    CONVOLUTION_FEATURE : number of filters of every convolution layer.
    DENSE_FEATURE : output dimension of the two hidden Dense layers.
    DROP_OUT : rate passed to both Dropout layers.
    nb_words : the embedding layer uses ``nb_words + 1`` as its input
        dimension. When None, the module-level ``nb_words`` is used.
    embedding_matrix : initial weights of the embedding layer. When None,
        the module-level ``embedding_matrix`` is used.

    Returns
    -------
    The compiled Keras ``Sequential`` model.

    Raises
    ------
    NameError if ``nb_words`` or ``embedding_matrix`` is neither passed in
    nor defined at module level.
    """
    # The original read both values from notebook globals that no visible
    # cell defines. They are now explicit arguments, with the old global
    # lookup kept as a fallback for existing callers.
    if nb_words is None or embedding_matrix is None:
        notebook_scope = globals()
        try:
            if nb_words is None:
                nb_words = notebook_scope['nb_words']
            if embedding_matrix is None:
                embedding_matrix = notebook_scope['embedding_matrix']
        except KeyError as missing:
            raise NameError('compileModel needs %s: pass it as an argument '
                            'or define it at module level' % missing)

    model = Sequential()

    model.add(Embedding(                              # Layer 0, Start
            input_dim=nb_words + 1,                   # Size of dictionary, has to be input + 1
            output_dim=EMBEDDING_DIM,                 # Dimensions to generate
            weights=[embedding_matrix],               # Initialize word weights
            input_length=MAX_SEQUENCE_LENGTH))        # Length of input sequences in the first layer

    # Layers 1-6: (kernel size, whether max pooling of length 3 follows).
    convolution_stack = [(7, True), (7, True), (3, False),
                         (3, False), (3, False), (3, True)]
    for kernel_size, pool_after in convolution_stack:
        model.add(Convolution1D(
                nb_filter=CONVOLUTION_FEATURE,        # Number of filters to generate
                filter_length=kernel_size,            # Size of kernels
                border_mode='valid',                  # 'valid' border: kernel reduces dimensions
                activation='relu'))                   # Activation function to use
        if pool_after:
            model.add(MaxPooling1D(pool_length=3))

    model.add(Flatten())                              # Layer 7

    # Layers 7a and 8: two hidden dense layers, each followed by dropout.
    for _ in range(2):
        model.add(Dense(
                output_dim=DENSE_FEATURE,             # Output dimension
                activation='relu'))                   # Activation function to use
        model.add(Dropout(DROP_OUT))

    model.add(Dense(                                  # Layer 9, Output Size: number of classes, Final
            output_dim=classes,                       # Output dimension
            activation='softmax'))                    # Activation function to use

    # The unused Adadelta optimizer the original also constructed is dropped.
    sgd = SGD(lr=0.01, momentum=0.9, nesterov=True)

    model.compile(loss='categorical_crossentropy', optimizer=sgd,
                  metrics=['accuracy'])

    return model

In [None]:
def fitModel(model, trainX, trainY, valX, valY, nb_epoch=25, batch_size=128):
    """Fit ``model`` on the training data and return the fitted model.

    Parameters
    ----------
    model : object exposing a Keras-style ``fit`` method.
    trainX, trainY : training inputs and targets.
    valX, valY : validation inputs and targets, passed to ``fit`` as
        ``validation_data``.
    nb_epoch : number of epochs passed to ``fit``.
    batch_size : batch size passed to ``fit``.

    Returns
    -------
    The same ``model`` object, after ``fit`` has been called on it.
    """
    # The original validated on undefined `testX` / `testY` instead of the
    # valX / valY arguments, and returned nothing despite its comment.
    model.fit(trainX, trainY, validation_data=(valX, valY),
              nb_epoch=nb_epoch, batch_size=batch_size)
    return model
    

In [None]:
def predictModel(model, testX, batch_size=128):
    """Run ``model`` on ``testX`` and return its predictions.

    Parameters
    ----------
    model : object exposing a Keras-style ``predict`` method.
    testX : inputs to predict on.
    batch_size : batch size passed to ``predict``.

    Returns
    -------
    Whatever ``model.predict(testX, batch_size=batch_size)`` returns.
    """
    # The original computed the predictions but discarded them.
    return model.predict(testX, batch_size=batch_size)