# Genre Classification 
This is an example of document genre classification using the Brown corpus (provided by NLTK).

The 5000 most common words (after stop-word removal and stemming) are taken from the Brown corpus. We then create a 5000-dimensional bag-of-words vector for each document and feed it to a neural network, which predicts the genre.

The input to the network is a 5000-dimensional vector. Each position corresponds to one of the most common words. The value is set to 1 if the word appears in the document, and to 0 otherwise.

## Reading the corpus
Read in the corpus and create a bag-of-words representation for each document.

In [1]:
import nltk
import gensim
import nltk.corpus
import random

from nltk.corpus import brown
from nltk.stem.porter import *
import numpy as np


# Seed NumPy's global random number generator so that code drawing from it
# behaves the same way on every run.
np.random.seed(0)

# Number of most frequent corpus words kept as the vocabulary.
num_max_words = 5000

# Lower-cased English stop words. They are stored as dictionary keys so that
# membership tests are hash lookups; the True values are only placeholders.
stopwords = {
    entry.lower(): True
    for entry in nltk.corpus.stopwords.words('english')
}
    

def preprocessDocument(words):
    """Normalize a sequence of tokens.

    Tokens shorter than three characters, and tokens whose lower-cased form
    is a key of the module-level ``stopwords`` mapping, are discarded. Every
    remaining token is lower-cased and passed through a Porter stemmer.

    Arguments:
        words -- iterable of string tokens

    Returns:
        list of stemmed, lower-cased tokens in their original order
    """
    stem = PorterStemmer().stem
    kept = []
    for token in words:
        # The length filter applies to the raw token.
        if len(token) < 3:
            continue
        lowered = token.lower()
        if lowered in stopwords:
            continue
        kept.append(stem(lowered))
    return kept

# Frequency distribution over the normalized tokens of the whole corpus.
brown_words = brown.words()
fdist = nltk.FreqDist(preprocessDocument(brown_words))

# Vocabulary: the (at most) num_max_words most frequent tokens, sorted so
# that the position assigned to each token below is reproducible.
max_words = sorted(entry[0] for entry in fdist.most_common(num_max_words))

# Token -> position of that token in the bag-of-words vector.
max_words_idx = {token: position for position, token in enumerate(max_words)}



def getBoW(words):
    """Encode one document as a binary bag-of-words vector.

    The tokens are normalized with ``preprocessDocument`` and every token
    found in ``max_words_idx`` switches its vector position to 1. Tokens
    outside the vocabulary are ignored, and repeated tokens have no extra
    effect.

    Arguments:
        words -- iterable of raw string tokens of the document

    Returns:
        list of ``len(max_words)`` integers, each either 0 or 1
    """
    vector = [0] * len(max_words)

    for token in preprocessDocument(words):
        position = max_words_idx.get(token)
        # Position 0 is a valid index, so compare against None explicitly.
        if position is not None:
            vector[position] = 1

    return vector

    
    

Couldn't import dot_parser, loading of dot files will not be possible.


## Train / Test Set
This creates the train and test sets.

In [7]:
# Map every Brown category name to a consecutive integer class index,
# following the order in which brown.categories() lists them.
category2Idx = {cat: idx for idx, cat in enumerate(brown.categories())}

# Number of (shuffled) files used for training; the remainder is the test set.
num_train_files = 300

# Sort first so that the seeded shuffle below always starts from the same
# ordering of file ids.
file_ids = sorted(brown.fileids())

# Each print receives a single pre-formatted string, which yields the same
# output whether print is the Python 2 statement or the Python 3 function.
print("File IDs: %s" % ",".join(file_ids[0:10]))

random.seed(4)
random.shuffle(file_ids)

train_file_ids = file_ids[0:num_train_files]
test_file_ids = file_ids[num_train_files:]

print("Train File IDs: %s" % ",".join(train_file_ids[0:10]))
print("Test File IDs: %s" % ",".join(test_file_ids[0:10]))


def _build_dataset(fileids):
    """Build the feature matrix and label vector for a list of corpus files.

    Arguments:
        fileids -- list of Brown corpus file ids

    Returns:
        (x, y) tuple of int32 NumPy arrays: x holds one bag-of-words vector
        (as produced by getBoW) per file, y holds the class index of the
        first category reported for each file.
    """
    features = [getBoW(brown.words(fileid)) for fileid in fileids]
    labels = [category2Idx[brown.categories(fileid)[0]] for fileid in fileids]
    return (np.asarray(features, dtype='int32'),
            np.asarray(labels, dtype='int32'))


train_x, train_y = _build_dataset(train_file_ids)
test_x, test_y = _build_dataset(test_file_ids)

File IDs: ca01,ca02,ca03,ca04,ca05,ca06,ca07,ca08,ca09,ca10
Train File IDs: ce29,ce05,ck08,cg07,ck26,cj05,cf07,cg65,cj70,cj11
Test File IDs: cn11,cg73,cp18,cf40,ca11,ca09,cf21,cj67,cg51,ch13


## Neural Network
Given the training and test sets, we now define a feed-forward network. We use a 500-dimensional hidden layer with a dropout rate of 0.5.

Feel free to try different hidden layer sizes and different numbers of hidden layers.

In [19]:

from keras.layers import containers
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, AutoEncoder, Dropout
from keras.optimizers import SGD
from keras.utils import np_utils

# Training hyper-parameters; the number of output classes equals the number
# of categories collected in category2Idx.
batch_size = 30
nb_epoch = 50
nb_classes = len(category2Idx)

# Network: bag-of-words input -> 500 tanh units -> dropout(0.5) ->
# softmax over the classes.
model = Sequential()
# input_dim is num_max_words, i.e. it assumes the bag-of-words vectors have
# exactly that many entries.
model.add(Dense(500, input_dim=num_max_words, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))

# Convert the integer class indices into the categorical (one-hot) matrix
# form used with the categorical cross-entropy loss below.
train_y_cat = np_utils.to_categorical(train_y, nb_classes)
test_y_cat = np_utils.to_categorical(test_y, nb_classes)


model.compile(loss='categorical_crossentropy', optimizer='Adam')
# Baseline: evaluate the still-untrained network on the test set.
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
# NOTE(review): "fine turning" in these messages is presumably a typo for
# "fine tuning"; the strings are left untouched because they are runtime output.
print('Test score before fine turning:', score[0])
print('Test accuracy before fine turning:', score[1])
# NOTE(review): the test set is also passed as validation data, so the
# validation figures reported during training are not an independent hold-out.
model.fit(train_x, train_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, validation_data=(test_x, test_y_cat))
# Re-evaluate on the same test set after training.
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score after fine turning:', score[0])
print('Test accuracy after fine turning:', score[1])


    


('Test score before fine turning:', 2.7576393604278566)
('Test accuracy before fine turning:', 0.070000000000000007)
Train on 300 samples, validate on 200 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50

KeyboardInterrupt: 