# Imports and Parameters

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Convolution1D, MaxPooling1D, Dropout, Flatten
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

'''
Task 3: playing with an NN framework (Keras) and basic sentiment analysis
- use the following model as a baseline and improve it!
- export your metadata (just basic hyperparameters and outcomes for test data!)
- test data = 0.3 (not in this example, change it!)
- random_state = 4222
- no need for cross-validation!
'''

#Parameters
# Notebook-wide settings read by the cells below.
max_features = 500       # vocabulary size; the cells below currently hard-code the same value
batch_size = 16          # batch size for nn.fit and nn.evaluate
epochs = 2               # training epochs for nn.fit (was assigned twice; duplicate removed)
verbose = 2              # Keras verbosity passed to nn.fit
validation_size = 1000   # trailing test rows eval_ScoreAccuracy reserves for per-class accuracy
random_state = 4222      # seed for train_test_split
    


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Functions

In [9]:
#Load and transform data
def get_df(path='dataset_sentiment.csv'):
    """Load the sentiment CSV and return cleaned text with non-neutral labels.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. It must contain a 'text' and a 'sentiment'
        column. Defaults to 'dataset_sentiment.csv' in the working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ['text', 'sentiment'] with every row whose sentiment is
        "Neutral" removed. The original row index is kept. 'text' is
        lower-cased, stand-alone "rt" tokens are replaced by a space and every
        character that is not an ASCII letter, digit or whitespace is dropped.
    """
    df = pd.read_csv(path)
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the raw data (avoids SettingWithCopyWarning).
    df = df.loc[df['sentiment'] != "Neutral", ['text', 'sentiment']].copy()

    def _clean(text):
        text = text.lower()
        # Only strip "rt" when it is a whole word (the retweet marker); a plain
        # substring replace would also mangle words such as "party" or "support".
        text = re.sub(r'\brt\b', ' ', text)
        # A-Z, not A-z: the range A-z also spans the punctuation [ \ ] ^ _ `
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)

    df['text'] = df['text'].apply(_clean)
    return df
def get_TrainTestData(df, max_features=500):
    """Turn the cleaned frame into padded index sequences and a 70/30 split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' and a 'sentiment' column.
    max_features : int, optional
        Passed to the Keras Tokenizer as num_words.

    Returns
    -------
    tuple
        (X_train, X_test, Y_train, Y_test, sequence_length), where
        sequence_length is X.shape[1] of the padded matrix. The split uses
        test_size 0.30 and the notebook-level random_state.
    """
    texts = df['text'].values
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(texts)
    # Encode each text as a sequence of word indices and pad to a common length
    X = pad_sequences(tokenizer.texts_to_sequences(texts))
    # One indicator column per sentiment label serves as the training target
    Y = pd.get_dummies(df['sentiment']).values
    sequence_length = X.shape[1]
    print('X.shape[1]: ', sequence_length, 'X.shape: ', X.shape)
    split = train_test_split(X, Y, test_size = 0.30, random_state = random_state)
    X_train, X_test, Y_train, Y_test = split
    return X_train, X_test, Y_train, Y_test, sequence_length
def eval_ScoreAccuracy(X_test, Y_test, nn, n_validate=None, eval_batch_size=None):
    """Print loss/accuracy and per-class accuracy of a trained model.

    The last n_validate rows of the test data are set aside for the per-class
    report; nn.evaluate runs on the remaining leading rows.

    Parameters
    ----------
    X_test, Y_test : array-like
        Test inputs and one-hot targets. Class index 0 is counted as negative
        and any other index as positive (presumably the alphabetical column
        order produced by pd.get_dummies - verify against the label names).
    nn : model
        Object providing evaluate(X, Y, verbose=, batch_size=) and
        predict(X, batch_size=, verbose=).
    n_validate : int, optional
        Number of trailing rows used for the per-class report. Defaults to the
        notebook-level validation_size.
    eval_batch_size : int, optional
        Batch size for evaluate/predict. Defaults to the notebook-level
        batch_size.

    Raises
    ------
    ValueError
        If n_validate is not strictly between 0 and len(X_test); either part of
        the split would otherwise be empty (note that a[-0:] is the whole array).
    """
    if n_validate is None:
        n_validate = validation_size
    if eval_batch_size is None:
        eval_batch_size = batch_size
    if not 0 < n_validate < len(X_test):
        raise ValueError(
            "n_validate must be between 1 and len(X_test) - 1, got %r for %d test rows"
            % (n_validate, len(X_test)))

    print('evaluate Score and Accuracy on X_test')
    X_validate, Y_validate = X_test[-n_validate:], Y_test[-n_validate:]
    X_eval, Y_eval = X_test[:-n_validate], Y_test[:-n_validate]
    score, accuracy = nn.evaluate(X_eval, Y_eval, verbose = 2, batch_size = eval_batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (accuracy))

    print('evaluate Positive and Negative Accuracy..')
    # One batched predict call instead of one call per sample.
    predicted = np.argmax(nn.predict(X_validate, batch_size=eval_batch_size, verbose=0), axis=1)
    actual = np.argmax(Y_validate, axis=1)
    correct = predicted == actual
    is_negative = actual == 0
    for label, mask in (("pos_acc", ~is_negative), ("neg_acc", is_negative)):
        total = int(mask.sum())
        if total == 0:
            # Avoid ZeroDivisionError when a class is absent from the tail.
            print(label, "n/a (no samples of this class)")
        else:
            print(label, int(correct[mask].sum())/total*100, "%")

In [3]:
#Create Neural Network model
def get_Neural_Network_Model_default(X_shape_1, max_features=500, embed_dim=128, lstm_out=196,
                                     dropout=0.1, dropout_1d=0.4, recurrent_dropout=0.1):
    """Build and compile the baseline Embedding -> SpatialDropout1D -> LSTM -> Dense model.

    Parameters
    ----------
    X_shape_1 : int
        Length of the padded input sequences (input_length of the Embedding).
    max_features : int, optional
        Embedding input dimension; must match the num_words used by the tokenizer.
    embed_dim : int, optional
        Embedding output dimension.
    lstm_out : int, optional
        Number of LSTM units.
    dropout, recurrent_dropout : float, optional
        Dropout rates passed to the LSTM layer.
    dropout_1d : float, optional
        Rate of the SpatialDropout1D layer.

    Returns
    -------
    keras.models.Sequential
        Model with a 2-unit softmax output, compiled with categorical
        cross-entropy, the adam optimizer and the accuracy metric.
    """
    nn = Sequential()
    nn.add(Embedding(max_features, embed_dim, input_length = X_shape_1))
    nn.add(SpatialDropout1D(dropout_1d))
    nn.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=recurrent_dropout))
    nn.add(Dense(2, activation='softmax'))
    nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    nn.summary()
    return nn

In [38]:
def get_Neural_Network_Model_Improved(X_shape_1, max_features=500, embed_dim=128, num_filters=80,
                                      kernel_size=5, pool_size=3, lstm_out=196,
                                      dropout=0.1, recurrent_dropout=0.1):
    """Build and compile the Embedding -> Dropout -> Conv1D -> MaxPooling1D -> LSTM -> Dense model.

    Parameters
    ----------
    X_shape_1 : int
        Length of the padded input sequences (input_length of the Embedding).
    max_features : int, optional
        Embedding input dimension; must match the num_words used by the tokenizer.
    embed_dim : int, optional
        Embedding output dimension.
    num_filters, kernel_size : int, optional
        Filter count and window size of the 1D convolution
        ('valid' padding, stride 1, relu activation).
    pool_size : int, optional
        Window of the max-pooling layer that follows the convolution.
    lstm_out : int, optional
        Number of LSTM units.
    dropout : float, optional
        Rate of the Dropout layer applied to the embeddings.
    recurrent_dropout : float, optional
        Recurrent dropout rate of the LSTM layer.

    Returns
    -------
    keras.models.Sequential
        Model with a 2-unit softmax output, compiled with categorical
        cross-entropy, the adam optimizer and the accuracy metric.
    """
    nn = Sequential()
    nn.add(Embedding(max_features, embed_dim, input_length = X_shape_1))
    nn.add(Dropout(dropout))
    nn.add(Convolution1D(filters=num_filters,kernel_size=kernel_size, padding='valid',activation = 'relu', strides=1))
    nn.add(MaxPooling1D(pool_size=pool_size))
    nn.add(LSTM(lstm_out, recurrent_dropout=recurrent_dropout))
    nn.add(Dense(2, activation='softmax'))
    nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    nn.summary()
    return nn

In [6]:
#see also https://blog.keras.io/category/tutorials.html

# Fit and Test

## Default Model

In [10]:
# Prepare the data once; X_train / X_test / y_train / y_test / X_shape_1 are reused
# by the improved-model cell further down.
# NOTE(review): the literal 500 must equal the max_features the model builders use
# for their Embedding layer - keep the two in sync when changing the vocabulary size.
X_train, X_test, y_train, y_test, X_shape_1 = get_TrainTestData(get_df(), max_features=500)
nn = get_Neural_Network_Model_default(X_shape_1)
# Train the baseline with the epochs / batch_size / verbose values from the parameters cell.
nn.fit(X_train, y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)

X.shape[1]:  26 X.shape:  (10729, 26)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 26, 128)           64000     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 26, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 319,194
Trainable params: 319,194
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
 - 36s - loss: 0.4324 - acc: 0.8204
Epoch 2/2
 - 33s - loss: 0.3670 - acc: 0.8463


<keras.callbacks.History at 0x2314e3b29b0>

In [11]:
# Print loss/accuracy and per-class accuracy of the default model on the test split.
eval_ScoreAccuracy(X_test, y_test, nn) #, X2

evaluate Score and Accuracy on X_test
score: 0.36
acc: 0.85
evaluate Positive and Negative Accuracy..
pos_acc 32.19512195121951 %
neg_acc 97.61006289308176 %


## Improved model

In [39]:
# Build the Conv1D + LSTM variant, train it with the same epochs / batch_size / verbose
# as the default model and evaluate it on the same test split for comparison.
nn2 = get_Neural_Network_Model_Improved(X_shape_1)
nn2.fit(X_train, y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)
eval_ScoreAccuracy(X_test, y_test, nn2) #,X2

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 26, 128)           64000     
_________________________________________________________________
dropout_14 (Dropout)         (None, 26, 128)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 22, 80)            51280     
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 7, 80)             0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 196)               217168    
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 394       
Total params: 332,842
Trainable params: 332,842
Non-trainable params: 0
_________________________________________________________________
None

In [None]:
# #Alternatively could use for Neural Network model:
# #see https://elitedatascience.com/keras-tutorial-deep-learning-in-python
# def get_Neural_Network_Model_ddddd(X_shape_1):
#     # Parameters
#     max_features = 500
#     embed_dim = 128
    
#     num_filters = 100
#     kernel_size = 3
#     dropout = 0.1

#     nn = Sequential()    
#     nn.add(Embedding(max_features, embed_dim, input_length = X_shape_1)) 
#     #nn.add(Convolution1D(nb_filter=32, filter_length=8, border_mode='valid',input_dim=128))  

    
    
#     nn.add(Convolution1D(filters=num_filters,kernel_size=kernel_size, padding='valid',activation = 'relu', strides=1))
#     nn.add(MaxPooling1D(pool_size=5))
#     nn.add(Dropout(0.25)) 
#     nn.add(Flatten())
#     nn.add(Dense(128, activation='relu'))
#     nn.add(Dropout(dropout))
#     nn.add(Dense(2, activation='softmax'))  # two sentiment classes, matching the other models
    
#     nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
#     print(nn.summary())
    
#     return nn

# #    nn.add(Embedding(max_features, embed_dim, input_length = sequence_length))
# #    nn.add(Convolution1D(filters=num_filters, kernel_size=kernel_size, padding="valid", activation="relu", strides=1))
# #    nn.add(MaxPooling1D(pool_size=2))
# #    nn.add(Flatten())
# #    nn.add(Dense(2, activation='softmax'))
# #    nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])



In [None]:
#nn3 = get_Neural_Network_Model_ddddd(X_shape_1)
#nn3.fit(X_train, y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)
#eval_ScoreAccuracy(X_test, y_test, nn3)