In [1]:

# coding: utf-8
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

import pandas as pd
import theano
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dropout
from keras.optimizers import SGD, Adadelta
from keras.models import Sequential
import sys

# ---------------------------------------------------------------------------
# Configuration: every value a reader might tune lives in this cell.
# ---------------------------------------------------------------------------

# Paths: the GloVe vectors are read from GLOVE_DIR.
BASE_DIR = '../../'
GLOVE_DIR = BASE_DIR + 'glove/'

# Text preprocessing
MAX_SEQUENCE_LENGTH = 1000   # padded length of every token sequence fed to the model
MAX_NB_WORDS = 20000         # vocabulary cap handed to the Tokenizer
EMBEDDING_DIM = 100          # width of one word vector
VALIDATION_SPLIT = 0.3       # fraction of the samples held out for validation

# Network shape
CONVOLUTION_FEATURE = 256    # filters per convolution layer
BORDER_MODE = 'valid'
DENSE_FEATURE = 1024         # units per hidden dense layer
DROP_OUT = 0.3

# Optimisation
LEARNING_RATE = 0.0001
MOMENTUM = 0.9
BATCH_SIZE = 128
# The number of epochs is defined once. The cell used to set EPOCH=25 and then
# silently override it with nb_epoch, so 30 was always the effective value.
nb_epoch = 30
EPOCH = nb_epoch

# Pre-trained embeddings: derived from EMBEDDING_DIM so the file name cannot
# drift away from the embedding width used by the model.
embed = EMBEDDING_DIM
embedfile = 'glove.6B.%dd.txt' % embed

# Data selection passed to the database queries
authorList = [43, 3]         # author ids to train on
doc_id = 160                 # document to attribute
chunk_size = 1000            # chunk size requested from the database queries

Using gpu device 0: GeForce GTX 950 (CNMeM is disabled, cuDNN 5005)
Using Theano backend.


In [2]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

# Each line of the GloVe file is "<word> <v1> <v2> ...": the first token is the
# word and the remaining tokens are its vector components.
embeddings_index = {}
# `with` closes the file even if a malformed line makes the parsing raise.
# The file name comes from the configuration (`embedfile`) instead of being
# repeated here as a literal.
with open(os.path.join(GLOVE_DIR, embedfile)) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

Indexing word vectors.
Found 400000 word vectors.
Processing text dataset


In [3]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping numeric label id to author id (filled in later)
labels = []  # list of label ids
import getpass
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder

PORT = 5432        # database port on the remote side of the tunnel
LOCAL_PORT = 5400  # local end of the tunnel, handed to DatabaseQuery

# Credentials are no longer stored in the notebook. Set STYLOMETRY_SSH_USER and
# STYLOMETRY_SSH_PASSWORD in the environment; when the password is not set the
# user is prompted for it instead.
ssh_username = os.environ.get('STYLOMETRY_SSH_USER', 'stylometry')
ssh_password = os.environ.get('STYLOMETRY_SSH_PASSWORD')
if not ssh_password:
    ssh_password = getpass.getpass('SSH password for %s: ' % ssh_username)

# The tunnel is only needed while the query runs; the context manager tears it
# down as soon as the data frame has been fetched.
with SSHTunnelForwarder(('srn02.cs.cityu.edu.hk', 22),
                        ssh_username=ssh_username,
                        ssh_password=ssh_password,
                        remote_bind_address=('localhost', PORT),
                        local_bind_address=('localhost', LOCAL_PORT)):
    textToUse = DatabaseQuery.getWordAuthData(LOCAL_PORT, authors = authorList, doc = doc_id,
                                              chunk_size = chunk_size)

# Do not keep the secret around in the kernel namespace.
del ssh_password

Execution completed
Read completed
Number of rows: 91
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (6539, 2)


In [4]:
# Count how many rows (text chunks) each author contributes. `size` is used
# afterwards to balance the classes.
labels = []
texts = []
size = []
# Rebind authorList to the authors actually present in the fetched data; the
# position of an author in this list becomes its class index.
authorList = textToUse.author_id.unique()
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    author_rows = current.shape[0]
    size.append(author_rows)
    # Report this author's own row count. The previous code printed max(size),
    # which is only correct while the counts happen to be increasing.
    print("Author: %5s  Size: %5s" % (auth, author_rows))
print("Min: %s" % (min(size)))
print("Max: %s" % (max(size)))

# Downstream cells call authorList.index(...), which needs a plain list.
authorList = authorList.tolist()

Author:     3  Size:  3099
Author:    43  Size:  3440
Min: 3099
Max: 3440


In [5]:
# Build a class-balanced training set: every author contributes the same
# number of randomly drawn rows (the size of the smallest author).
labels = []
texts = []
maxRows = min(size)
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    current = current.sample(n = maxRows)
    textlist = current.doc_content.tolist()
    # Every row of `current` belongs to `auth`, so the class index is looked up
    # once per author instead of once per row.
    label_id = authorList.index(auth)
    # extend() appends in place rather than rebuilding the lists every pass.
    texts.extend(textlist)
    labels.extend([label_id] * len(textlist))

# Class index -> author id. The earlier placeholder `labels_index[0] = 0` was
# always overwritten by the first author and has been dropped.
labels_index = dict(enumerate(authorList))

# Free the raw frame; note that this cell cannot be re-run without re-running
# the query cell that creates textToUse.
del textToUse

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels))

Found 6198 texts.
Found 6198 labels.


In [6]:
# finally, vectorize the text samples into a 2D integer tensor
# This cell only fits the tokenizer on the training texts and converts each
# text into a sequence of word indices; padding to a fixed length happens in
# the next cell. MAX_NB_WORDS is passed as the tokenizer's vocabulary cap.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad every sequence to the fixed length the model expects.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode into a new name. Overwriting `labels` made this cell
# non-idempotent: a second run passed the one-hot matrix back into
# to_categorical. `labels` stays the plain list of class indices.
labels_onehot = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_onehot.shape)

# split the data into a training set and a validation set
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(data, labels_onehot, test_size=VALIDATION_SPLIT)


Found 60904 unique tokens.
Shape of data tensor: (6198, 1000)
Shape of label tensor: (6198, 2)


In [8]:
# Sanity check on one training example (index 4): its one-hot label row, the
# length of its padded sequence, and the container type of the sequence.
print(y_train[4])
print(len(x_train[4]))
print(type(x_train[4]))

[ 1.  0.]
1000
<type 'numpy.ndarray'>


In [9]:
print('Preparing embedding matrix.')

# Row r of the matrix holds the pre-trained vector of the word whose tokenizer
# index is r. Rows stay all-zero for indices with no pre-trained vector.
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    # Indices above the vocabulary cap are ignored; for the rest, look the
    # word up in the pre-trained index (None when the word is unknown).
    vector = embeddings_index.get(token) if row <= MAX_NB_WORDS else None
    if vector is not None:
        embedding_matrix[row] = vector

print('Training model.')

Preparing embedding matrix.
Training model.


In [10]:
# Re-check the same training example (index 4): label row and sequence length.
print(y_train[4])
print(len(x_train[4]))

[ 1.  0.]
1000


In [11]:
# Compare the vocabulary size used for the embedding matrix (nb_words) with
# the configured cap (MAX_NB_WORDS) and the full tokenizer vocabulary.
print(nb_words)
print(MAX_NB_WORDS)
print(len(word_index))

20000
20000
60904


In [12]:

model = Sequential()

# Layer 0: embedding lookup, initialised from the pre-built embedding matrix.
# input_dim is nb_words + 1 to match the number of rows in embedding_matrix.
model.add(Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH))

# Layers 1-6: convolution stack, described as (kernel size, max-pool size);
# a pool size of None means the convolution is not followed by pooling.
# Every convolution uses CONVOLUTION_FEATURE filters, BORDER_MODE borders and
# a ReLU activation.
# NOTE(review): the original comment on layer 6 said "Kernel Size: 3" while the
# code uses 5; the code value is kept here - confirm which one was intended.
conv_stack = [
    (7, 3),       # Layer 1 + pooling 1a
    (7, 3),       # Layer 2 + pooling 2a
    (3, None),    # Layer 3
    (3, None),    # Layer 4
    (3, None),    # Layer 5
    (5, 3),       # Layer 6 + pooling 6a
]
for kernel_size, pool_size in conv_stack:
    model.add(Convolution1D(
        nb_filter=CONVOLUTION_FEATURE,
        filter_length=kernel_size,
        border_mode=BORDER_MODE,
        activation='relu'))
    if pool_size is not None:
        model.add(MaxPooling1D(pool_length=pool_size))

# Layer 7: flatten the convolution output for the dense layers.
model.add(Flatten())

# Layers 7a and 8: two identical dense blocks, each followed by dropout.
for dense_block in range(2):
    model.add(Dense(
        output_dim=DENSE_FEATURE,
        activation='relu'))
    model.add(Dropout(DROP_OUT))

# Layer 9: softmax output with one unit per entry of labels_index.
model.add(Dense(
    output_dim=len(labels_index),
    activation='softmax'))

# Two optimizers are instantiated; only `sgd` is passed to compile() below.
sgd = SGD(lr=LEARNING_RATE, momentum=MOMENTUM, nesterov=True)

adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)

model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])

print("Done compiling.")

Done compiling.


In [13]:
# Train on the training split and report metrics on the validation split.
# The epoch count and batch size come from the configuration cell.
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=EPOCH, batch_size=BATCH_SIZE)


Train on 4338 samples, validate on 1860 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fd5e808d610>

In [14]:
import getpass
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder

PORT = 5432        # database port on the remote side of the tunnel
LOCAL_PORT = 5400  # local end of the tunnel, handed to DatabaseQuery

# Credentials are no longer stored in the notebook. Set STYLOMETRY_SSH_USER and
# STYLOMETRY_SSH_PASSWORD in the environment; when the password is not set the
# user is prompted for it instead.
ssh_username = os.environ.get('STYLOMETRY_SSH_USER', 'stylometry')
ssh_password = os.environ.get('STYLOMETRY_SSH_PASSWORD')
if not ssh_password:
    ssh_password = getpass.getpass('SSH password for %s: ' % ssh_username)

# Fetch the chunks of the document under test through a short-lived tunnel.
with SSHTunnelForwarder(('srn02.cs.cityu.edu.hk', 22),
                        ssh_username=ssh_username,
                        ssh_password=ssh_password,
                        remote_bind_address=('localhost', PORT),
                        local_bind_address=('localhost', LOCAL_PORT)):
    textToUse = DatabaseQuery.getWordDocData(LOCAL_PORT, doc_id, chunk_size = chunk_size)

# Do not keep the secret around in the kernel namespace.
del ssh_password

# One text per row; the label is the author's position in authorList
# (authorList.index raises ValueError for an author the model was not
# trained on). Column access replaces the slower row-by-row iterrows() loop.
texts = textToUse.doc_content.tolist()  # list of text samples
labels = [authorList.index(author_id)   # list of label ids
          for author_id in textToUse.author_id.tolist()]

print('Found %s texts.' % len(texts))

del textToUse


Execution completed
Read completed
Number of rows: 1
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (32, 2)
Found 32 texts.


In [15]:
# finally, vectorize the text samples into a 2D integer tensor
# The tokenizer fitted on the training texts is reused, so test words map to
# the same indices the model was trained with.
sequences = tokenizer.texts_to_sequences(texts)

print('Found %s unique tokens.' % len(word_index))

# Pad to MAX_SEQUENCE_LENGTH, the input length the model was built with.
# Padding to chunk_size only worked because both constants are currently 1000.
X = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', X.shape)

testX = X[:]


Found 60904 unique tokens.
Shape of data tensor: (32, 1000)


In [19]:
# Run the trained model on the padded test chunks; predY holds one row of
# class probabilities per chunk (see the printed array further down).
predY = np.array(model.predict(testX, batch_size=128))
def entroPred(predY, keep_fraction=0.9):
    """Average the most confident per-row predictions into one distribution.

    Each row of ``predY`` is treated as a probability distribution over the
    classes. Rows are ranked by their Shannon entropy in bits (lowest, i.e.
    most confident, first), the first ``keep_fraction`` of them are kept and
    their column-wise mean is returned.

    Parameters
    ----------
    predY : array-like, shape (n_rows, n_classes)
        Per-row class probabilities.
    keep_fraction : float, optional
        Fraction of the lowest-entropy rows to average. Defaults to 0.9,
        the value that used to be hard-coded. At least one row is always
        kept.

    Returns
    -------
    numpy.ndarray, shape (n_classes,)
        Mean of the retained rows.

    Raises
    ------
    ValueError
        If ``predY`` is not a non-empty 2-D array.
    """
    probs = np.asarray(predY)
    if probs.ndim != 2 or probs.shape[0] == 0:
        raise ValueError(
            "predY must be a non-empty 2-D array of per-row class probabilities")

    p = probs.astype(np.float64)
    # A zero probability contributes nothing to the entropy (0 * log 0 := 0).
    # Replacing zeros by 1 before the log gives exactly that, whereas the
    # previous math.log(0, 2) call raised "math domain error".
    safe = np.where(p > 0, p, 1.0)
    entropy = -np.sum(p * np.log2(safe), axis=1)

    # mergesort is stable, so rows with equal entropy keep their input order,
    # matching the behaviour of the sorted() call this replaces.
    order = np.argsort(entropy, kind='mergesort')

    # int() truncates as before; max(1, ...) stops a single-row input from
    # keeping zero rows and returning NaN.
    n_keep = max(1, int(len(order) * keep_fraction))
    return np.mean(probs[order[:n_keep]], axis=0)

In [20]:
# Show the class index -> author id mapping used to read the prediction columns.
print(labels_index)

{0: 3, 1: 43}


In [21]:
# Document-level prediction: the per-chunk predictions aggregated by entroPred.
print(entroPred(predY[:]))

[ 0.52368838  0.47631165]


In [22]:
# Raw per-chunk class probabilities, one row per test chunk.
print(predY)

[[ 0.53053093  0.46946901]
 [ 0.48672009  0.51327991]
 [ 0.52540791  0.47459206]
 [ 0.50872535  0.49127465]
 [ 0.49387869  0.50612134]
 [ 0.50727528  0.49272472]
 [ 0.53402877  0.46597123]
 [ 0.52285004  0.47714999]
 [ 0.50559354  0.49440646]
 [ 0.53350699  0.46649304]
 [ 0.5267632   0.4732368 ]
 [ 0.52736121  0.47263882]
 [ 0.53387505  0.46612498]
 [ 0.53431618  0.46568382]
 [ 0.53500378  0.46499625]
 [ 0.50748444  0.49251553]
 [ 0.52430636  0.47569361]
 [ 0.50413996  0.49586004]
 [ 0.53145212  0.46854788]
 [ 0.53449231  0.46550769]
 [ 0.52911252  0.47088751]
 [ 0.52741867  0.47258133]
 [ 0.53259265  0.46740735]
 [ 0.50229627  0.4977037 ]
 [ 0.50639015  0.49360982]
 [ 0.53932118  0.46067885]
 [ 0.53250027  0.46749973]
 [ 0.5178749   0.48212507]
 [ 0.50459504  0.49540496]
 [ 0.52137285  0.47862709]
 [ 0.53793979  0.46206024]
 [ 0.5207727   0.47922727]]
