Initial Exploration: https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial/notebook

In [1]:
import base64
import numpy as np
import pandas as pd
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import codecs
import string

%matplotlib inline

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /Users/pcorr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/pcorr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pcorr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import keras
from keras.models import Sequential
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, GRU
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.datasets import imdb

from sklearn.model_selection import train_test_split

np.random.seed(7)

Using TensorFlow backend.

compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6



In [4]:
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB



This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



In [5]:
stopwords = nltk.corpus.stopwords.words('english')
lemm = WordNetLemmatizer()

In [6]:
# # Allow dataframes to be displayed side by side
# from IPython.display import display, HTML

# CSS = """
# .output {
#     flex-direction: row;
# }
# """

# HTML('<style>{}</style>'.format(CSS))

### Methods

In [7]:
# Use to tokenize and lemmatize a list of paragraphs where each paragraph is a string
def lemma_tokenizer(paragraphs):
    # use the standard scikit-learn tokenizer first
    standard_tokenizer = CountVectorizer().build_tokenizer()
    tokens = standard_tokenizer(paragraphs)
    # then use NLTK to perform lemmatisation on each token
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_tokens = []
    for token in tokens:
        lemma_tokens.append( lemmatizer.lemmatize(token) )
    return lemma_tokens

### Read In Data

In [8]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')
# sample_submission = pd.read_csv('sample_submission.csv')

In [9]:
display(train.head())
display(test.head())
# display(sample_submission.head())

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [10]:
print(train.shape)
print(test.shape)


(19579, 3)
(8392, 2)


### NLP - Tokenize, Stopwords, Lemmatize and Vectorize

#### TFIDFVectorizer

This is memory intensive. Doesn't just count the frequencies of words like CountVectoriser does. It weights words depending on how often they occur across documents. Eg. take 3 words that may occur. One often, one very infrequently and one inbetween. In this case they could be 'the','thing' and 'heuristic'

In [11]:
# vectorizer =  TfidfVectorizer(max_df=0.8, max_features=200000,
#                                  min_df=0.001, stop_words='english',
#                                  use_idf=True, tokenizer=lemma_tokenizer, ngram_range=(1,1))

In [12]:
# tfidf_matrix = vectorizer.fit_transform(eap)
# tfidf_list = tfidf_matrix.todense().tolist()

In [13]:
# df_tfidf = pd.DataFrame(tfidf_list, columns=vectorizer.get_feature_names())

In [14]:
# df_tfidf.head()

In [15]:
# plt.figure(figsize=(16,10))
# df_tfidf.astype(bool).sum(axis=0).sort_values(ascending=False)[-50:].plot.bar()
# plt.title('Frequency of least common words');

### Keras FastText 

Create labels

In [16]:
author_dict = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}

In [17]:
y = np.array([author_dict[a] for a in train.author])
y = to_categorical(y)

In [18]:
y

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])

#### Tokenize

In [19]:
def preprocessing(sentence_list):
    # Tokenize
    tokenized_list = []
    for sentence in sentence_list:
        tokenized_list.append(nltk.word_tokenize(sentence))

    # lower case, stopwords and lemmatize
    cleaned_list = []
    for tokenized_sentence in tokenized_list:
        cleaned_list.append([lemm.lemmatize(word.lower()) for word in tokenized_sentence if word.lower() not in stopwords])

    # Recreate text
    recreated_strings = []
    for sentence in cleaned_list:
        recreated_strings.append("".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in sentence]).strip())
    return(recreated_strings)

In [20]:
# Apply NLP
preprocessed_train_text = preprocessing(train.text)

In [21]:
preprocessed_train_text[0]

'process, however, afforded mean ascertaining dimension dungeon; might make circuit, return point whence set, without aware fact; perfectly uniform seemed wall.'

In [22]:
# Keras Tokenizer
tokenizer = Tokenizer(lower=True, filters='')

In [23]:
# Train tokenizer
tokenizer.fit_on_texts(preprocessed_train_text)

In [24]:
preprocessed_train_seq = tokenizer.texts_to_sequences(preprocessed_train_text)


In [25]:
preprocessed_train_seq

[[5852,
  52,
  1281,
  104,
  11115,
  4735,
  20610,
  8,
  49,
  20611,
  229,
  128,
  1823,
  4736,
  38,
  817,
  20612,
  1191,
  5853,
  18,
  1576],
 [15, 835, 5854, 8, 239, 4737],
 [59,
  73,
  864,
  5226,
  9072,
  20613,
  865,
  2459,
  214,
  1282,
  1982,
  94,
  5226,
  7663,
  139,
  532,
  296,
  321,
  11116],
 [435,
  769,
  101,
  1283,
  3961,
  2619,
  2778,
  5227,
  680,
  4327,
  9073,
  436,
  934,
  14407,
  1052,
  101,
  437,
  483,
  70,
  9074,
  7664],
 [895,
  83,
  7665,
  7,
  3397,
  5228,
  2335,
  11117,
  6622,
  132,
  1733,
  2960,
  355,
  6623,
  1384,
  14408],
 [484,
  143,
  3398,
  277,
  51,
  533,
  290,
  7666,
  20614,
  3962,
  14409,
  285,
  1734,
  722,
  14410,
  378,
  14411,
  4738,
  1385,
  20615,
  15,
  866,
  3963,
  47,
  14412,
  1512,
  3169,
  20616,
  70,
  867,
  9075,
  1158,
  6624,
  48,
  2207,
  6625,
  578,
  1891,
  9076],
 [9077, 836, 1083, 94, 3654, 1192, 1892, 14413, 4328, 197, 54, 5229],
 [14414, 566, 206

In [26]:
maxlen = 256
padded_train_seq = pad_sequences(sequences=preprocessed_train_seq, maxlen=maxlen)

In [27]:
input_dim = np.max(padded_train_seq) + 1
embedding_dims = 20

In [28]:
model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
model.add(GlobalAveragePooling1D())
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [29]:
epochs = 45
x_train, x_test, y_train, y_test = train_test_split(padded_train_seq, y, test_size=0.01)

n_samples = x_train.shape[0]

hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 19383 samples, validate on 196 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45


In [30]:
preprocessed_test_text = preprocessing(test.text)
preprocessed_test_seq = tokenizer.texts_to_sequences(preprocessed_test_text)
padded_test_seq = pad_sequences(sequences=preprocessed_test_seq, maxlen=maxlen)
y = model.predict_proba(padded_test_seq)

result = pd.read_csv('../Data/sample_submission.csv')
for author, i in author_dict.items():
    result[author] = y[:, i]

In [31]:
result.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.228407,0.059305,0.712288
1,id24541,0.967941,0.02947,0.002589
2,id00134,0.964863,0.034605,0.000532
3,id27757,0.567413,0.428866,0.003722
4,id04081,0.967509,0.017965,0.014525


In [None]:
result.to_csv('../predictions/predictions.csv', index=False)