## Utils

In [1]:
import os
import tensorflow as tf
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import _pickle as pk

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Data Manager

In [2]:
class DataManager:
    def __init__(self):
        self.data = {}

    
    def add_data(self,name, data_path, with_label=True):
        '''
        Read data from data_path
        Args:
            name       : string, name of data
            with_label : bool, read data with label or without label
        Returns:
            None
        '''
        print ('Read data from %s...'%data_path)
        X, Y = [], []
        with open(data_path,'r') as f:
            for line in f:
                if with_label:
                    lines = line.strip().split(' +++$+++ ')
                    X.append(lines[1])
                    Y.append(int(lines[0]))
                else:
                    X.append(line)

        if with_label:
            self.data[name] = [X,Y]
        else:
            self.data[name] = [X]
            
    
    def tokenize(self, vocab_size):
        '''
        Build dictionary(tokenizer)
        Args:
            vocab_size : maximum number of word in yout dictionary
        Returns:
            None
        '''
        print ('Create new tokenizer')
        self. tokenizer = Tokenizer(num_words=vocab_size)
        for key in self.data:
            print ('Tokenizing %s'%key)
            texts = self.data[key][0]
            self.tokenizer.fit_on_texts(texts)

            
    def save_tokenizer(self, path):
        '''
        Save tokenizer to specified path
        '''
        print ('Save tokenizer to %s'%path)
        pk.dump(self.tokenizer, open(path, 'wb'))

        
    def load_tokenizer(self,path):
        '''
        Load tokenizer from specified path
        '''
        print ('Load tokenizer from %s'%path)
        self.tokenizer = pk.load(open(path, 'rb'))

        
    def to_sequence(self, maxlen):
        '''
        Convert words in data to index and pad to equal size
        Args:
            maxlen : max length after padding
        '''
        self.maxlen = maxlen
        for key in self.data:
            print ('Converting %s to sequences'%key)
            tmp = self.tokenizer.texts_to_sequences(self.data[key][0])
            self.data[key][0] = np.array(pad_sequences(tmp, maxlen=maxlen))

            
    def to_bow(self):
        '''
        Convert texts in data to BOW feature
        '''
        for key in self.data:
            print ('Converting %s to tfidf'%key)
            self.data[key][0] = self.tokenizer.texts_to_matrix(self.data[key][0],mode='count')

            
    def to_category(self):
        '''
        Convert label to category type, call this function if use categorical loss
        '''
        for key in self.data:
            if len(self.data[key]) == 2:
                self.data[key][1] = np.array(to_categorical(self.data[key][1]))
    
    
    def get_semi_data(self,name,label,threshold,loss_function) : 
        # if th==0.3, will pick label>0.7 and label<0.3
        label = np.squeeze(label)
        index = (label>1-threshold) + (label<threshold)
        semi_X = self.data[name][0]
        semi_Y = np.greater(label, 0.5).astype(np.int32)
        if loss_function=='binary_crossentropy':
            return semi_X[index,:], semi_Y[index]
        elif loss_function=='categorical_crossentropy':
            return semi_X[index,:], to_categorical(semi_Y[index])
        else :
            raise Exception('Unknown loss function : %s'%loss_function)

            
    def get_data(self,name):
        '''
        Get data by name
        '''
        return self.data[name]

    
    def split_data(self, name, ratio):
        '''
        Split data to two part by a specified ratio
        Args:
            name  : string, same as add_data
            ratio : float, ratio to split
        '''
        data = self.data[name]
        X = data[0]
        Y = data[1]
        data_size = len(X)
        val_size = int(data_size * ratio)
        return (X[val_size:],Y[val_size:]),(X[:val_size],Y[:val_size])

In [3]:
dm = DataManager()

In [4]:
dm.add_data("train", "training_label.txt")
dm.get_data("train")[0]

Read data from training_label.txt...


[['are wtf ... awww thanks !',
  'leavingg to wait for kaysie to arrive myspacin itt for now ilmmthek .!',
  'i wish i could go and see duffy when she comes to mamaia romania .',
  "i know eep ! i can ' t wait for one more day ....",
  'so scared and feeling sick . fuck ! hope someone at hr help ... wish it would be wendita or karen .',
  'my b day was thurs . i wanted 2 do 5 this weekend for my b day but i guess close enough next weekend . going alone',
  'e3 is in the trending topics only just noticed ive been tweeting on my iphone until now',
  'where did you get him from i know someone who would love that !',
  'dam just got buzzed by another huge fly ! this time it landed on my head ... not impressed',
  "tomorrowwwwwwwww !!! you ' ll love tomorrow ' s news !",
  "gonna try 2 sleep . damn garageband next to me won ' t let me tho",
  "wish weekend .. but not really also .. cuz next monday is exam and i haven ' t studied at all yet hate exam .. grr",
  "check this vid out .... you '

In [5]:
dm.tokenize(20000)
dm.to_sequence(38)

Create new tokenizer
Tokenizing train
Converting train to sequences


In [7]:
# explain tokenize to vector & padding
tmp = dm.tokenizer.texts_to_sequences(dm.get_data("train")[0])
tmp

[[39, 797, 456, 81],
 [19131, 3, 140, 11, 33617, 3, 2712, 23797, 10343, 11, 29, 33618],
 [1, 114, 1, 142, 41, 7, 66, 33619, 85, 108, 709, 3, 33620, 10344],
 [1, 59, 7008, 1, 31, 2, 15, 140, 11, 57, 86, 34],
 [19,
  783,
  7,
  188,
  180,
  512,
  98,
  238,
  26,
  2571,
  231,
  114,
  8,
  118,
  25,
  33621,
  103,
  4920],
 [6,
  369,
  34,
  28,
  2713,
  1,
  360,
  77,
  43,
  227,
  30,
  151,
  11,
  6,
  369,
  34,
  22,
  1,
  246,
  648,
  402,
  157,
  151,
  46,
  473],
 [1811,
  10,
  12,
  4,
  1725,
  2085,
  117,
  23,
  1744,
  650,
  101,
  755,
  16,
  6,
  378,
  312,
  29],
 [189, 127, 9, 37, 149, 54, 1, 59, 238, 156, 118, 47, 17],
 [2629,
  23,
  50,
  8716,
  123,
  201,
  784,
  906,
  30,
  53,
  8,
  2742,
  16,
  6,
  326,
  27,
  1909],
 [33622, 9, 2, 82, 47, 99, 2, 13, 436],
 [131, 267, 77, 115, 230, 33623, 157, 3, 18, 202, 2, 15, 203, 18, 352],
 [114,
  151,
  22,
  27,
  63,
  257,
  497,
  157,
  361,
  10,
  464,
  7,
  1,
  338,
  2,
  15,
  5175,
 

In [12]:
dm.get_data("train")[0]

array([[    0,     0,     0, ...,   797,   456,    81],
       [    0,     0,     0, ...,    11,    29, 33618],
       [    0,     0,     0, ...,     3, 33620, 10344],
       ...,
       [    0,     0,     0, ...,    48,    35,    43],
       [    0,     0,     0, ...,     2,    82,    25],
       [    0,     0,     0, ...,   142,   579,    45]], dtype=int32)

## hw4.py

In [6]:
import sys, argparse, os
import keras
import _pickle as pk
import readline
import numpy as np

from keras import regularizers
from keras.models import Model
from keras.layers import Input, GRU, LSTM, Dense, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

import keras.backend.tensorflow_backend as K
import tensorflow as tf

### Build Model

#### 宏毅Ver.

In [14]:
def simpleRNN(max_length, vocab_size, embedding_dim, dropout_rate, cell, hidden_size, loss_function="binary_crossentropy"):
    inputs = Input(shape=(max_length,))

    # Embedding layer
    embedding_inputs = Embedding(vocab_size, 
                                 embedding_dim, 
                                 trainable=True)(inputs)
    # RNN 
    return_sequence = False
    dropout_rate = dropout_rate
    if cell == 'GRU':
        RNN_cell = GRU(hidden_size, 
                       return_sequences=return_sequence, 
                       dropout=dropout_rate)
    elif cell == 'LSTM':
        RNN_cell = LSTM(hidden_size, 
                        return_sequences=return_sequence, 
                        dropout=dropout_rate)

    RNN_output = RNN_cell(embedding_inputs)

    # DNN layer
    outputs = Dense(hidden_size//2, activation='relu', kernel_regularizer=regularizers.l2(0.1))(RNN_output)
    outputs = Dropout(dropout_rate)(outputs)
    outputs = Dense(1, activation='sigmoid')(outputs)
        
    model =  Model(inputs=inputs,outputs=outputs)

    # optimizer
    adam = Adam()
    print ('Compile model...')

    # compile model
    model.compile( loss=loss_function, optimizer=adam, metrics=[ 'accuracy',])
    
    return model

#### DIY Ver.

In [7]:
inputs = Input(shape=(38,), name="inputs")
embedding = Embedding(20000, 128, mask_zero=True, trainable=True, name="embedding")(inputs)    # 將word vector壓到128維
rnn = LSTM(128, return_sequences=False, name="lstm")(embedding)    # 將LSTM的結果投到128維（latent dimension）
fc1 = Dense(64, activation="relu", name="fc1")(rnn)
fc2 = Dense(32, activation="relu", name="fc2")(fc1)
fc3 = Dense(16, activation="relu", name="fc3")(fc2)
outputs = Dense(1, activation="sigmoid", name="outputs")(fc3)

model = Model(inputs, outputs)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 38)                0         
_________________________________________________________________
embedding (Embedding)        (None, 38, 128)           2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
fc1 (Dense)                  (None, 64)                8256      
_________________________________________________________________
fc2 (Dense)                  (None, 32)                2080      
_________________________________________________________________
fc3 (Dense)                  (None, 16)                528       
_________________________________________________________________
outputs (Dense)              (None, 1)                 17        
Total para

In [8]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

### Train Model

In [15]:
history = model.fit(dm.get_data("train")[0], dm.get_data("train")[1], batch_size=512, epochs=50, validation_split=0.3, shuffle=False)

Train on 160000 samples, validate on 40000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/

KeyboardInterrupt: 

### Predict

In [17]:
dm.add_data("train2", "training_label.txt")
tmp = dm.tokenizer.texts_to_sequences(dm.data["train2"][0][:10])

Read data from training_label.txt...


In [18]:
dm.data["train2"][0] = np.array(pad_sequences(tmp, 38))

In [22]:
model.predict(dm.data["train2"][0])

array([[9.9469107e-01],
       [1.0000000e+00],
       [2.4596845e-13],
       [1.0000000e+00],
       [2.9246577e-11],
       [1.1243518e-11],
       [1.0000000e+00],
       [1.0000000e+00],
       [6.1170773e-12],
       [9.9999988e-01]], dtype=float32)

In [25]:
new_txt = dm.tokenizer.texts_to_sequences(["fuck you asshole!"])

In [26]:
new_txt = np.array(pad_sequences(new_txt, 38))

In [28]:
model.predict(new_txt)

array([[0.0004192]], dtype=float32)