# Data Retrieval

In [24]:
import pandas as pd

def data_retrieval(filepath):
    df = pd.read_csv(filepath)
    return df

df = data_retrieval('gutenberg_expanded.csv')
print(df)

       Unnamed: 0   author  label  \
0               0   austen      0   
1               1   austen      0   
2               2   austen      0   
3               3   austen      0   
4               4   austen      0   
...           ...      ...    ...   
18888       18888  whitman     11   
18889       18889  whitman     11   
18890       18890  whitman     11   
18891       18891  whitman     11   
18892       18892  whitman     11   

                                                    text  
0      [Emma Jane Austen 1816] VOLUME I CHAPTER I Emm...  
1      Even Miss Taylor ceased hold nominal office go...  
2      It wedding-day beloved friend Emma sat mournfu...  
3      The want Miss Taylor felt hour day.She recalle...  
4      She dearly loved father, companion her.He meet...  
...                                                  ...  
18888  Mirages More experiences sights, stranger, you...  
18889  The Unexpress'd How dare it?After cycles, poem...  
18890  More evolutionary

# Data Analysis

In [25]:
import numpy as np
# determining average word count per text
word_count = []
for i in df['text'].values:
    word_count.append(len(i.split()))
word_count = np.array(word_count)

In [26]:
def get_stats(var):
    """Print summary statistics for a variable of interest.
    
    Args:
    var: array. Numpy array containing values for the variable of interest.

    Returns:
    None
    """
    print("Min:", np.min(var))
    print("Max:", np.max(var))
    print("Mean:", np.mean(var))
    print("Median", np.median(var))
    print("1st percentile", np.percentile(var, 1))
    print("95th percentile", np.percentile(var, 95))
    print("99th percentile", np.percentile(var, 99))
    print("99.5th Percentile", np.percentile(var, 99.5))
    print("99.9th Percentile", np.percentile(var, 99.9))

In [27]:
print("Word count statistics")
get_stats(word_count)

Word count statistics
Min: 1
Max: 875
Mean: 55.006351558778384
Median 49.0
1st percentile 8.0
95th percentile 115.0
99th percentile 181.0799999999981
99.5th Percentile 221.0
99.9th Percentile 319.0800000000381


In [28]:
import seaborn as sns
import matplotlib.pyplot as plt
# Plot word count distribution
sns.distplot(word_count, kde = False, bins = 70, color = 'blue').set_title("Word Count Distribution")
plt.xlabel('Excerpt Length (Words)')
plt.ylabel('Count')
plt.xlim(0, 500)
plt.savefig("word_count.eps")

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript back

# Splitting the Data

In [29]:
from sklearn.model_selection import train_test_split
text = df['text'].values
labels = df['label'].values
# splitting data into 80-20 for training and testing
text_train, text_test_val, label_train, label_test_val = train_test_split(text,labels, test_size = 0.2, random_state = 42)
# splitting testing data into 50-50 for testing and validation
text_test, text_val, label_test, label_val = train_test_split(text_test_val, label_test_val, test_size=.5, random_state=42)
print(text_train.shape, label_train.shape)
print(text_test.shape, label_test.shape)
print(text_val.shape, label_val.shape)

(15114,) (15114,)
(1889,) (1889,)
(1890,) (1890,)


# Text Processing
## Input

In [30]:
from gensim.parsing.preprocessing import remove_stopwords

# removing stop words
def remove_stop_words_preprocessing(text_values):
    for i in range(len(text_values)):
        text_values[i] = remove_stopwords(text_values[i].lower())
    return text_values



In [31]:
# word embeddings
from keras.preprocessing.text import Tokenizer

def word_embeddings(text_values):
    tokenizer = Tokenizer(num_words=5000, oov_token="UNK")
    tokenizer.fit_on_texts(text_values)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    return tokenizer

Using TensorFlow backend.


In [46]:
# Removing the stop words
X = remove_stop_words_preprocessing(text_train)
X_val_text = remove_stop_words_preprocessing(text_val)
X_test_text = remove_stop_words_preprocessing(text_test)

# Creating a tokenization
tokenizer = word_embeddings(X)

# Tokenizing the sentences
X_train = tokenizer.texts_to_sequences(X)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(X_test_text)

Found 41126 unique tokens.


In [47]:
# vocab size
vocab_size = len(tokenizer.word_index)+1
print(vocab_size)

41127


In [48]:
from keras.preprocessing.sequence import pad_sequences

max_sequence_length = 80
X_train = pad_sequences(X_train, padding="post", maxlen=max_sequence_length)
X_val = pad_sequences(X_val, padding="post", maxlen=max_sequence_length)
X_test = pad_sequences(X_test, padding="post", maxlen=max_sequence_length)
print('Shape of data tensor:', X_train.shape)

Shape of data tensor: (15114, 80)


## Output

In [49]:
Y_train = pd.get_dummies(label_train).values
Y_val = pd.get_dummies(label_val).values
Y_test = pd.get_dummies(label_test).values
print('Shape of label tensor: ', Y_train.shape)

Shape of label tensor:  (15114, 12)


# Keras Modeling

In [50]:
from keras.models import Sequential
from keras import layers
from keras import callbacks

embedding_dim = 50

callback = callbacks.EarlyStopping(monitor='loss', patience=5)
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                   output_dim=embedding_dim,
                  input_length=max_sequence_length))
model.add(layers.SpatialDropout1D(0.2))

model.add(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# model.add(layers.Dense(12, activation='softmax'))
model.add(layers.Dense(12, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 80, 50)            2056350   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 80, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_2 (Dense)              (None, 12)                1212      
Total params: 2,117,962
Trainable params: 2,117,962
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(X_train, Y_train, 
                    epochs=15, 
                    validation_data=(X_val, Y_val),
                    batch_size=25,
                    callbacks=[callback])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 15114 samples, validate on 1890 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15

In [44]:
from sklearn.metrics import confusion_matrix
import numpy as np

accuracy = model.evaluate(X_test, Y_test)
prediction = model.predict(X_test)
confusion_matrix = confusion_matrix(Y_test.argmax(axis=1), prediction.argmax(axis=1))
print(confusion_matrix)

print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accuracy[0],accuracy[1]))

ValueError: Error when checking input: expected embedding_1_input to have shape (80,) but got array with shape (38855,)

## Comparing Against Other Models

### Logistics Regression

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(text_train)
X_train = vectorizer.transform(text_train)
X_test  = vectorizer.transform(text_test)
X_val = vectorizer.transform(text_val)

In [40]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

classifier = LogisticRegression(multi_class='multinomial', solver="lbfgs")
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(classifier, X_train, label_train, scoring='accuracy', cv=cv, n_jobs=-1)
# classifier.fit(X_train, Y_train)
# score = classifier.score(X_test, Y_test)
print("Accuracy:", mean(n_scores))

Accuracy: 0.9603693943885853


In [None]:
# Multinomial / Binomial

In [None]:
# Ngram Distribution Testing

In [None]:
# SVM Model