In [6]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import keras
import keras.backend as K
from keras.layers import *
from keras.losses import *
from keras.models import *
from keras.callbacks import *
from keras.activations import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv(r"C:\Users\HPPC\train.tsv", sep='\t')
test = pd.read_csv(r"C:\Users\HPPC\test.tsv", sep='\t')

train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
def get_preprocessing_func():
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    def preprocessing_func(sent):
        return [lemmatizer.lemmatize(w) for w in tokenizer.tokenize(sent)]
    return preprocessing_func

X = train['Phrase'].apply(get_preprocessing_func()).values
y = train['Sentiment'].values
X_test = test['Phrase'].apply(get_preprocessing_func()).values

In [13]:
def prepare_tokenizer_and_weights(X):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(X)
    
    weights = np.zeros((len(tokenizer.word_index)+1, 300))
    with open(r"C:\Users\HPPC\crawl-300d-2M.vec",encoding ='utf-8') as f:
        next(f)
        for l in f:
            w = l.split(' ')
            if w[0] in tokenizer.word_index:
                weights[tokenizer.word_index[w[0]]] = np.array([float(x) for x in w[1:301]])
    return tokenizer, weights

In [15]:
tokenizer, weights = prepare_tokenizer_and_weights(np.append(X, X_test))
X_seq = tokenizer.texts_to_sequences(X)
MAX_LEN = max(map(lambda x: len(x), X_seq))
X_seq = pad_sequences(X_seq, MAX_LEN)
MAX_ID = len(tokenizer.word_index)
print('MAX_LEN=', MAX_LEN)
print('MAX_ID=', MAX_ID)

MAX_LEN= 64
MAX_ID= 16445


In [20]:
def make_fast_text():
    fast_text = Sequential()
    fast_text.add(InputLayer((MAX_LEN,))) 
    fast_text.add(Embedding(input_dim=MAX_ID+1, output_dim=300, weights=[weights], trainable=True))
    fast_text.add(SpatialDropout1D(0.5))
    fast_text.add(GlobalMaxPooling1D())
    fast_text.add(Dropout(0.5))
    fast_text.add(Dense(5,activation='softmax'))
    return fast_text

fast_texts = [make_fast_text() for i in range(3)]
fast_texts[0].summary()

for fast_text in fast_texts:
    X_seq_train, X_seq_valid, y_train, y_valid = train_test_split(X_seq, y, test_size=0.1)
    fast_text.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
    fast_text.fit(X_seq_train, y_train, validation_data=(X_seq_valid, y_valid),
                 callbacks=[EarlyStopping(monitor='val_loss', patience=2, verbose=0)],
                 epochs=1, 
                 verbose=2)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 64, 300)           4933800   
_________________________________________________________________
spatial_dropout1d_13 (Spatia (None, 64, 300)           0         
_________________________________________________________________
global_max_pooling1d_13 (Glo (None, 300)               0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 5)                 1505      
Total params: 4,935,305
Trainable params: 4,935,305
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 140454 samples, validate on 15606 samples
Epoch 1/1
 - 2211s - loss: 1.1777 - acc: 0.5295 - val_loss: 0.9529 - val_acc: 0.6150


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 140454 samples, validate on 15606 samples
Epoch 1/1
 - 2150s - loss: 1.1823 - acc: 0.5288 - val_loss: 0.9789 - val_acc: 0.5896


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 140454 samples, validate on 15606 samples
Epoch 1/1
 - 2384s - loss: 1.1739 - acc: 0.5308 - val_loss: 0.9680 - val_acc: 0.5996


In [21]:
y_prob = fast_text.predict(X_seq)
y_predict = np.argmax(y_prob, axis=1)
print(classification_report(y, y_predict))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      7072
           1       0.49      0.31      0.38     27273
           2       0.64      0.93      0.76     79582
           3       0.55      0.38      0.45     32927
           4       0.74      0.03      0.07      9206

   micro avg       0.61      0.61      0.61    156060
   macro avg       0.48      0.33      0.33    156060
weighted avg       0.57      0.61      0.55    156060



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
