In [None]:

# coding: utf-8
from __future__ import print_function
import os
import numpy as np
np.random.seed(123)

import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dropout
from keras.optimizers import SGD, Adadelta
from keras.models import Sequential
import sys

# --- Paths -------------------------------------------------------------------
BASE_DIR = '../../'
GLOVE_DIR = BASE_DIR + 'glove/'          # directory holding the GloVe text files

# --- Vocabulary / data split -------------------------------------------------
MAX_NB_WORDS = 20000                     # vocabulary cap handed to the Tokenizer
VALIDATION_SPLIT = 0.2                   # fraction of samples held out for validation

# --- Network shape -----------------------------------------------------------
CONVOLUTION_FEATURE = 256                # filters per Convolution1D layer
BORDER_MODE = 'valid'                    # border mode for every convolution
DENSE_FEATURE = 1024                     # units in each hidden Dense layer
DROP_OUT = 0.4                           # dropout rate after each hidden Dense layer

# --- Optimiser / training ----------------------------------------------------
LEARNING_RATE = 0.0001
MOMENTUM = 0.9
BATCH_SIZE = 16
# Single source of truth for the number of epochs. (This section used to set
# EPOCH to 25 and then overwrite it with 30; only the final value was ever used.)
EPOCH = 30
nb_epoch = EPOCH                         # alias kept for code that uses the Keras 1 spelling

# --- Embeddings --------------------------------------------------------------
EMBEDDING_DIM = 200
embedfile = 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'

# --- Experiment selection ----------------------------------------------------
doc_id = 161                             # document passed to the DatabaseQuery calls
author_id = 80                           # author whose class index is looked up after prediction
authorList = [11, 18, 80, 88, 64]        # authors requested from the database
chunk_size = 1500                        # chunk size requested from the database
MAX_SEQUENCE_LENGTH = chunk_size         # sequences are padded/truncated to one chunk

In [None]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
# Each line is "<word> <v1> <v2> ..."; the `with` block guarantees the file
# handle is released even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, embedfile)) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

In [None]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder
PORT=5432          # port bound on the remote side of the tunnel
LOCAL_PORT = 5400  # local end of the tunnel; the same value is handed to DatabaseQuery

# Credentials can be supplied through the environment so they need not live in
# the notebook. NOTE(review): the literal fallbacks are kept only so existing
# runs keep working -- they should be removed from source control.
SSH_USERNAME = os.environ.get('STYLOMETRY_SSH_USER', 'stylometry')
SSH_PASSWORD = os.environ.get('STYLOMETRY_SSH_PASSWORD', 'stylometry')

# Fetch the chunks for the requested authors while the tunnel is open.
with SSHTunnelForwarder(('srn02.cs.cityu.edu.hk', 22),
                        ssh_username=SSH_USERNAME,
                        ssh_password=SSH_PASSWORD,
                        remote_bind_address=('localhost', PORT),
                        local_bind_address=('localhost', LOCAL_PORT)):
    textToUse = DatabaseQuery.getWordAuthData(LOCAL_PORT, authors = authorList, doc = doc_id,
                                              chunk_size = chunk_size)

In [None]:
# Spot-check one fetched row, looked up by index label 1000.
# NOTE(review): raises KeyError if the frame has no row labelled 1000 -- confirm
# the query always returns that many rows.
textToUse.loc[1000]

In [None]:
# Report how many chunks each author contributed.
authorList = textToUse.author_id.unique().tolist()
size = []
for auth in authorList:
    n_chunks = textToUse.loc[textToUse['author_id'] == auth].shape[0]
    size.append(n_chunks)
    print("Author: %5s  Size: %5s" % (auth, n_chunks))
print("Min: %s" % (min(size)))
print("Max: %s" % (max(size)))

# Balance the classes: draw the same number of chunks (the smallest author's
# count) from every author.
samples = min(size)
labels = []
texts = []
for label, auth in enumerate(authorList):
    current = textToUse.loc[textToUse['author_id'] == auth].sample(n = samples)
    texts.extend(current.doc_content.tolist())
    # Every row in `current` belongs to `auth`, so the label is constant.
    # Deliberately no comprehension variable called `author_id`: under Python 2
    # it would leak and overwrite the module-level target author.
    labels.extend([label] * samples)

# class index -> author id, in the same order used for the labels above
labels_index = dict(enumerate(authorList))

del textToUse

In [None]:
# Sanity check: texts and labels are built in lockstep, so the two counts
# should be equal.
print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels))

In [None]:
# Eyeball one (label, text) pair to confirm the two lists line up.
print(labels[10])
print(texts[10])

In [None]:
# finally, vectorize the text samples into a 2D integer tensor
# The tokenizer is capped at MAX_NB_WORDS and fitted on the balanced training
# texts only; the same fitted tokenizer is reused later for the held-out document.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# word -> integer index mapping learned by the tokenizer; it is used below to
# build the embedding matrix.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Convert the integer class ids to categorical (one-hot) form.
# NOTE: this rebinds `labels`, so re-running this cell on its own feeds the
# already-encoded array back into to_categorical; re-run the balancing cell first.
labels = to_categorical(np.asarray(labels))

In [None]:
# Inspect the encoded label of the first sample.
labels[0]

In [None]:
# Inspect the padded sequence of the first sample.
data[0]

In [None]:
# Inspect a label far into the array.
# NOTE(review): raises IndexError when there are 10000 samples or fewer.
labels[10000]

In [None]:
# Inspect the padded sequence matching the label shown above.
# NOTE(review): raises IndexError when there are 10000 samples or fewer.
data[10000]

In [None]:
# Both arrays should have the same first dimension (one row per sample).
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# split the data into a training set and a validation set
# from sklearn.model_selection import KFold
# kfold = KFold(n_splits=6, shuffle=True, random_state=123)
from sklearn.model_selection import train_test_split
# An explicit random_state makes the split identical every time this cell is
# run, instead of depending on how much of the global NumPy random stream
# earlier cells have consumed. 123 matches the seed used elsewhere in the notebook.
x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=VALIDATION_SPLIT,
                                                  random_state=123)

In [None]:
print('Preparing embedding matrix.')

# prepare embedding matrix
# Row i holds the pre-trained vector of the word whose tokenizer index is i;
# nb_words + 1 rows make every index up to nb_words addressable.
nb_words = len(word_index)
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    # The tokenizer limit only lets indices strictly below MAX_NB_WORDS through,
    # so index MAX_NB_WORDS itself can never be looked up (`>=`, not `>`).
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# The raw GloVe dictionary is no longer needed once the matrix is built.
del embeddings_index

print('Training model.')

In [None]:
# Compare the full vocabulary size with the tokenizer cap; nb_words and
# len(word_index) are the same number by construction.
print(nb_words)
print(MAX_NB_WORDS)
print(len(word_index))

In [None]:
# Build and compile the convolutional classifier.
# (Kept at cell level rather than wrapped in a getModel() helper.)
model = Sequential()

# Layer 0: embedding lookup initialised from the pre-trained matrix and frozen.
model.add(Embedding(
    input_dim=nb_words + 1,               # dictionary size, has to be input + 1
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False))

# Layers 1-6: six convolutions with CONVOLUTION_FEATURE filters each.
# Each entry is (kernel size, whether a max-pool of length 3 follows).
conv_stack = [
    (7, True),     # Layer 1 + 1a
    (7, True),     # Layer 2 + 2a
    (3, False),    # Layer 3
    (3, False),    # Layer 4
    (3, False),    # Layer 5
    (3, True),     # Layer 6 + 6a
]
for kernel_size, pooled in conv_stack:
    model.add(Convolution1D(
        nb_filter=CONVOLUTION_FEATURE,
        filter_length=kernel_size,
        border_mode=BORDER_MODE,          # 'valid' lets the kernel shrink the sequence
        activation='relu'))
    if pooled:
        model.add(MaxPooling1D(pool_length=3))

# Layer 7: flatten the feature maps for the dense head.
model.add(Flatten())

# Layers 7a and 8: two identical hidden blocks (Dense + Dropout).
for dense_block in range(2):
    model.add(Dense(output_dim=DENSE_FEATURE, activation='relu'))
    model.add(Dropout(DROP_OUT))

# Layer 9: one softmax output per known label.
model.add(Dense(output_dim=len(labels_index), activation='softmax'))

sgd = SGD(lr=LEARNING_RATE, momentum=MOMENTUM, nesterov=True)

model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])

print("Done compiling.")

In [None]:
# trainindices = []
# testindices = []
# for train, test in kfold.split(data):
#     print("train %s" % (str(train)))
#     print("test %s" % (str(test)))
#     trainindices.append(train)
#     testindices.append(test)

In [None]:
cvscores = []

# x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=VALIDATION_SPLIT)

# x_train = data[trainindices[0]]
# y_train = labels[trainindices[0]]
# x_val = data[testindices[0]]
# y_val = labels[testindices[0]]
# model = getModel()

# Train on the training split, reporting validation metrics after each epoch.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    nb_epoch=EPOCH, batch_size=BATCH_SIZE)
# scores[1] is the accuracy, the only metric requested at compile time.
scores = model.evaluate(x_val, y_val, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# Release the large training arrays, but keep `model`: the prediction cell
# further down calls model.predict(), so deleting it here breaks "Run All".
del x_train, y_train, x_val, y_val
cvscores.append(scores[1] * 100)

# NOTE(review): the purpose of this one-minute pause is not evident from the
# notebook; it is kept unchanged -- confirm whether it is still needed.
import time
time.sleep(60)
    

In [None]:
# Mean and spread of the collected accuracies. NumPy is imported as `np` in
# this notebook; the bare name `numpy` is never bound.
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder
PORT=5432          # port bound on the remote side of the tunnel
LOCAL_PORT = 5400  # local end of the tunnel; the same value is handed to DatabaseQuery

# Credentials can be supplied through the environment so they need not live in
# the notebook. NOTE(review): the literal fallbacks are kept only so existing
# runs keep working -- they should be removed from source control.
SSH_USERNAME = os.environ.get('STYLOMETRY_SSH_USER', 'stylometry')
SSH_PASSWORD = os.environ.get('STYLOMETRY_SSH_PASSWORD', 'stylometry')

# Fetch the chunks of the document under test while the tunnel is open.
with SSHTunnelForwarder(('srn02.cs.cityu.edu.hk', 22),
                        ssh_username=SSH_USERNAME,
                        ssh_password=SSH_PASSWORD,
                        remote_bind_address=('localhost', PORT),
                        local_bind_address=('localhost', LOCAL_PORT)):
    textToUse = DatabaseQuery.getWordDocData(LOCAL_PORT, doc_id, chunk_size = chunk_size)

texts = textToUse.doc_content.tolist()  # list of text samples
# list of label ids, in the class order used for training; list.index raises
# ValueError if a chunk's author is not in authorList.
labels = [authorList.index(chunk_author) for chunk_author in textToUse.author_id.tolist()]

print('Found %s texts.' % len(texts))

del textToUse


In [None]:
# finally, vectorize the text samples into a 2D integer tensor
# The tokenizer fitted on the training texts is reused, so the held-out chunks
# are encoded with the same word indices.
sequences = tokenizer.texts_to_sequences(texts)

# NOTE: word_index is not modified in this cell, so this repeats the training
# vocabulary size rather than describing the held-out document.
print('Found %s unique tokens.' % len(word_index))

# chunk_size is the same value as MAX_SEQUENCE_LENGTH, matching the model input length.
X = pad_sequences(sequences, maxlen = chunk_size)

print('Shape of data tensor:', X.shape)

# Full slice of X, used as the prediction input below.
testX = X[:]


In [None]:
# Run the trained network on every held-out chunk; one prediction row per chunk.
# Requires `model` from the training cells to still be in scope.
predY = np.array(model.predict(testX, batch_size=BATCH_SIZE))
def entroPred(predY):
    """Average the most confident half of a set of per-chunk predictions.

    Each row of ``predY`` is treated as a probability distribution over the
    classes. Rows are ranked by Shannon entropy (base 2, lowest first), the
    lower-entropy half is kept (always at least one row), and the column-wise
    mean of the kept rows is returned.

    Parameters
    ----------
    predY : array-like
        Two-dimensional, indexed as ``[row][class]``.

    Returns
    -------
    numpy.ndarray
        One averaged value per class. NaN is returned when ``predY`` has no rows.
    """
    probs = np.asarray(predY)
    if probs.shape[0] == 0:
        return np.float64('nan')

    # log2(1) == 0, so substituting 1 for non-positive entries makes them
    # contribute nothing to the sum (0 * log 0 is taken as 0) instead of
    # raising a math domain error.
    safe = np.where(probs > 0, probs, 1)
    entro = -np.sum(probs * np.log2(safe), axis=1)

    # Stable sort keeps the input order among rows with equal entropy.
    order = np.argsort(entro, kind='mergesort')

    # Keep the lower-entropy half, but never fewer than one row, so a single
    # prediction does not produce an empty selection.
    keep = max(1, int(len(order) * 0.5))
    return np.mean(probs[order[:keep]], axis=0)

In [None]:
# Show the class-index -> author-id mapping used for the labels.
print(labels_index)

In [None]:
# Find the class index that was assigned to the target author.
# dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
loc = next((key for key, auth in labels_index.items() if auth == author_id), None)
if loc is None:
    # Fail with a clear message instead of a NameError (or a stale `loc`).
    raise ValueError('author_id %s is not among the trained authors %s'
                     % (author_id, sorted(labels_index.values())))

# NOTE(review): if predY is still the raw per-chunk prediction matrix, this
# selects row `loc` (one chunk) rather than the target author's score --
# confirm whether predY was meant to be aggregated with entroPred() first.
ans = predY[loc]

In [None]:
# Dump the predictions for manual inspection.
print(predY)