In [1]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D,MaxPooling1D,Conv1D,Flatten
from keras.layers.recurrent import GRU
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from keras.models import model_from_json

Using TensorFlow backend.


In [3]:


'''
Task 3: playing with NN framwork/keras and basic sentiment analysis
- use the following model as a baseline and improve it!
- export your metadata (just basic hyperparameters and outcomes for test data!)
- test data = 0.3 (not in this example, change it!)
- random_state = 4222
- no need to cross-validation!
'''

# parameters
max_fatures = 500          # vocabulary size: Tokenizer num_words and Embedding input dimension
embed_dim = 128            # embedding vector size
lstm_out = 196             # number of units in the baseline LSTM layer
dropout = 0.1              # input dropout of the baseline LSTM layer
dropout_1d = 0.4           # SpatialDropout1D rate applied after the embedding
recurrent_dropout = 0.1    # recurrent dropout of the baseline LSTM layer
# NOTE: the task text above asks for random_state = 4222; the baseline split
# uses this value, while the later models pass 4222 directly.
random_state = 1324
validation_size = 1000     # rows sliced off the end of the test split for the per-class report
#validation_size = 200
batch_size = 16
epochs=2
verbose= 2
test_s = 0.3               # fraction of the data held out by train_test_split


def clean_text(text):
    """Normalise one tweet: lowercase, drop the retweet marker, strip punctuation.

    Only the standalone token "rt" is removed (word boundaries on both sides),
    so words that merely contain the letters "rt" are left intact.
    Everything that is not an ASCII letter, digit or whitespace is deleted.
    """
    text = text.lower()
    text = re.sub(r'\brt\b', ' ', text)
    # A-Z (not A-z): the A-z range would also let [ \ ] ^ _ ` through.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


df = pd.read_csv('dataset_sentiment.csv')
df = df[['text','sentiment']]
print(df[0:10])

# Keep only the polar classes; .copy() gives an independent frame so the
# column assignment below does not write into a view of the original.
df = df[df.sentiment != "Neutral"].copy()
df['text'] = df['text'].apply(clean_text)
 
#df = df.sample(2000)
    
# Map each text to a sequence of word indices limited to the max_fatures most
# frequent words, then pad all sequences to a common length (X.shape[1]).
tok = Tokenizer(num_words=max_fatures, split=' ')
tok.fit_on_texts(df['text'].values)
X = tok.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

# Baseline model: embedding -> spatial dropout -> LSTM -> 2-way softmax.
nn = Sequential()
nn.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
nn.add(SpatialDropout1D(dropout_1d))
nn.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=recurrent_dropout))
nn.add(Dense(2, activation='softmax'))

nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
# summary() prints the table itself; wrapping it in print() only adds "None".
nn.summary()

# One-hot targets; the per-class report below treats column 0 as the negative class.
Y = pd.get_dummies(df['sentiment']).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_s, random_state = random_state)
nn.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)

# The last `validation_size` rows of the test split are reserved for the
# per-class report; the remaining rows are used for the overall score.
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

score, accuracy = nn.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (accuracy))

# Per-class accuracy on the validation slice, predicted in one batched call
# instead of one predict() call per row.
predicted_cls = np.argmax(nn.predict(X_validate, batch_size=batch_size, verbose = 2), axis=1)
actual_cls = np.argmax(Y_validate, axis=1)
is_hit = predicted_cls == actual_cls
is_neg = actual_cls == 0

neg_cnt = int(is_neg.sum())
pos_cnt = int((~is_neg).sum())
neg_ok = int((is_hit & is_neg).sum())
pos_ok = int((is_hit & ~is_neg).sum())

# A class may be missing from the slice; report nan instead of dividing by zero.
print("pos_acc", pos_ok/pos_cnt*100 if pos_cnt else float('nan'), "%")
print("neg_acc", neg_ok/neg_cnt*100 if neg_cnt else float('nan'), "%")

#for new model 1
# Split requested by the task text: 30% test data, random_state 4222.
X_train_n1, X_test_n1, Y_train_n1, Y_test_n1 = train_test_split(X, Y, test_size = 0.3, random_state = 4222)
# The last `validation_size` test rows feed the per-class report, the rest the overall score.
X_validate_n1 = X_test_n1[-validation_size:]
Y_validate_n1 = Y_test_n1[-validation_size:]
X_test_n1 = X_test_n1[:-validation_size]
Y_test_n1 = Y_test_n1[:-validation_size]
#end

# Model 1: embedding followed by three stacked GRU layers of shrinking width.
newModel1 = Sequential()
newModel1.add(Embedding(max_fatures, 128, input_length = X.shape[1]))
# return_sequences=True so the next recurrent layer receives the full sequence.
newModel1.add(GRU(units=32, name = "gru_1",return_sequences=True))
newModel1.add(GRU(units=16, name = "gru_2",return_sequences=True ))
newModel1.add(GRU(units=8, name= "gru_3"))
# softmax (not sigmoid): the targets are one-hot over two mutually exclusive
# classes and the loss is categorical_crossentropy, which expects the outputs
# to form a probability distribution.
newModel1.add(Dense(2, activation='softmax'))
newModel1.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

# summary() prints the table itself; wrapping it in print() only adds "None".
newModel1.summary()
newModel1.fit(X_train_n1, Y_train_n1, epochs = epochs, batch_size=16, verbose=verbose)

score, accuracy = newModel1.evaluate(X_test_n1, Y_test_n1, verbose = 2, batch_size = 16)
print("score: %.2f" % (score))
print("acc: %.2f" % (accuracy))

# Per-class accuracy on the validation slice, predicted in one batched call
# instead of one predict() call per row.
predicted_cls = np.argmax(newModel1.predict(X_validate_n1, batch_size=16, verbose = 2), axis=1)
actual_cls = np.argmax(Y_validate_n1, axis=1)
is_hit = predicted_cls == actual_cls
is_neg = actual_cls == 0

neg_cnt = int(is_neg.sum())
pos_cnt = int((~is_neg).sum())
neg_ok = int((is_hit & is_neg).sum())
pos_ok = int((is_hit & ~is_neg).sum())

# A class may be missing from the slice; report nan instead of dividing by zero.
print("pos_acc", pos_ok/pos_cnt*100 if pos_cnt else float('nan'), "%")
print("neg_acc", neg_ok/neg_cnt*100 if neg_cnt else float('nan'), "%")

# Encode one ad-hoc sentence with the fitted tokenizer. It is padded to the
# width the models were built with (X.shape[1]) rather than a hard-coded
# length, so the array always matches the Embedding input shape.
X2 = ['what are u going to say about that? the truth, wassock?!']
print(X2)
X2 = tok.texts_to_sequences(X2)
X2 = pad_sequences(X2, maxlen=X.shape[1], dtype='int32', value=0)
print(X2)


#for new model 2
# Split requested by the task text: 30% test data, random_state 4222.
X_train_n2, X_test_n2, Y_train_n2, Y_test_n2 = train_test_split(X, Y, test_size = 0.3, random_state = 4222)
# The last `validation_size` test rows feed the per-class report, the rest the overall score.
X_validate_n2 = X_test_n2[-validation_size:]
Y_validate_n2 = Y_test_n2[-validation_size:]
X_test_n2 = X_test_n2[:-validation_size]
Y_test_n2 = Y_test_n2[:-validation_size]

# Model 2: embedding -> 1D convolution -> max pooling -> dense classifier.
newModel2 = Sequential()
newModel2.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
newModel2.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
newModel2.add(MaxPooling1D(pool_size=2))
newModel2.add(Flatten())
newModel2.add(Dense(250, activation='relu'))
newModel2.add(Dense(2, activation='softmax'))
newModel2.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

# summary() prints the table itself; wrapping it in print() only adds "None".
newModel2.summary()
newModel2.fit(X_train_n2, Y_train_n2, epochs = epochs, batch_size=16, verbose=verbose)

score, accuracy = newModel2.evaluate(X_test_n2, Y_test_n2, verbose = 2, batch_size = 16)
print("score: %.2f" % (score))
print("acc: %.2f" % (accuracy))

# Per-class accuracy on the validation slice, predicted in one batched call
# instead of one predict() call per row.
predicted_cls = np.argmax(newModel2.predict(X_validate_n2, batch_size=16, verbose = 2), axis=1)
actual_cls = np.argmax(Y_validate_n2, axis=1)
is_hit = predicted_cls == actual_cls
is_neg = actual_cls == 0

neg_cnt = int(is_neg.sum())
pos_cnt = int((~is_neg).sum())
neg_ok = int((is_hit & is_neg).sum())
pos_ok = int((is_hit & ~is_neg).sum())

# A class may be missing from the slice; report nan instead of dividing by zero.
print("pos_acc", pos_ok/pos_cnt*100 if pos_cnt else float('nan'), "%")
print("neg_acc", neg_ok/neg_cnt*100 if neg_cnt else float('nan'), "%")

# Encode one ad-hoc sentence with the fitted tokenizer. It is padded to the
# width the models were built with (X.shape[1]) rather than a hard-coded
# length, so the array always matches the Embedding input shape.
X2 = ['what are u going to say about that? the truth, wassock?!']
X2 = tok.texts_to_sequences(X2)
X2 = pad_sequences(X2, maxlen=X.shape[1], dtype='int32', value=0)
print(X2)


#print(nn.predict(X2, batch_size=1, verbose = 2)[0])
#print(newModel1.predict(X2, batch_size=1, verbose = 2)[0])
#print(newModel2.predict(X2, batch_size=1, verbose = 2)[0])



                                                text sentiment
0  RT @NancyLeeGrahn: How did everyone feel about...   Neutral
1  RT @ScottWalker: Didn't catch the full #GOPdeb...  Positive
2  RT @TJMShow: No mention of Tamir Rice and the ...   Neutral
3  RT @RobGeorge: That Carly Fiorina is trending ...  Positive
4  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  Positive
5  RT @GregAbbott_TX: @TedCruz: "On my first day ...  Positive
6  RT @warriorwoman91: I liked her and was happy ...  Negative
7  Going on #MSNBC Live with @ThomasARoberts arou...   Neutral
8  Deer in the headlights RT @lizzwinstead: Ben C...  Negative
9  RT @NancyOsborne180: Last night's debate prove...  Negative
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 26, 128)           64000     
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 26, 128

ValueError: Error when checking : expected embedding_4_input to have shape (26,) but got array with shape (25,)