# LSTM CLASSIFIER

In [1]:
""" Performing Classification analysis to a dataset of tweets. """
import numpy as np
import pandas as pd
# For sentiment analysis
from textblob import TextBlob
import matplotlib.pyplot as plt
# Train-test split
from sklearn.model_selection import train_test_split
# Import Naive Bayes
from sklearn.naive_bayes import ComplementNB, MultinomialNB
#import cross_validate
from sklearn.model_selection import cross_validate

In [2]:
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, GRU
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.optimizers import Adam
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

Using TensorFlow backend.


# Read data and split in train/test

In [27]:
exchange = {"n": "novela", "e": "ensayo", "t": "teatro", "r": "terror"}

In [28]:
 #4 clases - 25% vs 40%
df = pd.read_csv('AWLASL.csv', sep=",", encoding="Latin-1")
df.head()
# len(df)

Unnamed: 0,text,ASL,AWL,class,text_pos,nouns,pronouns
0,es una buena pieza,4.0,3.75,n,AUX DET ADJ NOUN,0.25,0.0
1,quiï¿½n lo habï¿½a de decir viï¿½ndola tan mor...,22.0,4.681818181818182,n,PRON PRON AUX ADP VERB NOUN ADV ADJ ADV ADJ AD...,0.227273,0.090909
2,pero la miseria todo lo corrompe y solita no h...,35.0,4.181818181818182,n,CONJ DET NOUN DET PRON VERB CONJ ADJ ADV AUX A...,0.228571,0.057143
3,justo castigo vive dios,4.0,5.0,n,ADJ NOUN VERB NOUN,0.5,0.0
4,de las ideas contrarias a la libertad de los p...,10.0,4.666666666666667,n,ADP DET NOUN ADJ ADP DET NOUN ADP DET NOUN,0.3,0.0


In [29]:
def modify_input(x):
    data = []
    for ex in x:
        text, text_pos = ex
        res = text+" "+text_pos
        data.append(res[:-1])
    
    return np.array(data)

In [34]:
X_ = df["text"].values.astype(str)
X_raw = X_

# uncomment these lines to use text + text_pos as input
X_ = df[["text", "text_pos"]].values.astype(str)
# X_raw = modify_input(X_)
X_raw = X_

Y = df["class"].values.astype(str)
Y[Y == "3"] = "r"

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X_raw,Y, test_size = 0.3, random_state = 42, shuffle=True)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(5390, 2) (5390,)
(2310, 2) (2310,)


# Preprocess inputs

In [36]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 100
# This is fixed.
EMBEDDING_DIM = 100

In [37]:
# TOKENIZE THE TEXT
tokenizer_text = Tokenizer(num_words=MAX_NB_WORDS, filters='@[\]^_`{|}~', lower=True) # "#$%&()*+,-./:;<=>?
tokenizer_text.fit_on_texts(X_train[:, 0])
word_index = tokenizer.word_index
print('Found %s unique tokens in text.' % len(word_index))

# TOKENIZE THE POS TAGS
tokenizer_pos = Tokenizer(num_words=MAX_NB_WORDS, filters='@[\]^_`{|}~', lower=True) # "#$%&()*+,-./:;<=>?
tokenizer_pos.fit_on_texts(X_train[:, 1])
word_index = tokenizer.word_index
print('Found %s unique tokens in pos.' % len(word_index))

Found 22607 unique tokens in text.
Found 22607 unique tokens in pos.


In [8]:
X_text = tokenizer.texts_to_sequences(X_train[:, 0])
X_text = pad_sequences(X_train[:, 0], maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

X_pos  = tokenizer.texts_to_sequences(X_train[:, 1])
X_pos  = pad_sequences(X_train[:, 1], maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (7700, 100)


In [9]:
Y = pd.get_dummies(Y)
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (7700, 5)


In [18]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y_test.shape[-1], activation='softmax'))

opt = Adam(amsgrad=True)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [19]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 100, 100)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100, 64)           42240     
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 165       
Total params: 5,054,821
Trainable params: 5,054,821
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
n_epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)

Train on 5120 samples, validate on 270 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 

In [14]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.4f}\n  Accuracy: {:0.4f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.077
  Accuracy: 0.651


In [None]:
new_complaint = []
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = []
print(pred, labels[np.argmax(pred)])