In [29]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
import re

'''
Task 3: playing with NN framwork/keras and basic sentiment analysis
- use the following model as a baseline and improve it!
- export your metadata (just basic hyperparameters and outcomes for test data!)
- test data = 0.3 (not in this example, change it!)
- random_state = 4222
- no need to cross-validation!
'''

# parameters
max_features = 500
embed_dim = 256
lstm_out = 196
dropout = 0.2
dropout_1d = 0.4
recurrent_dropout = 0.1
random_state = 1324
validation_size = 1000
batch_size = 16
epochs=2
verbose= 2

df = pd.read_csv('dataset_sentiment.csv')
df = df[['text','sentiment']]
print(df[0:10])

df = df[df.sentiment != "Neutral"]
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply(lambda x: x.replace('rt',' '))
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    
tok = Tokenizer(num_words=max_features, split=' ')
tok.fit_on_texts(df['text'].values)
X = tok.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

                                                text sentiment
0  RT @NancyLeeGrahn: How did everyone feel about...   Neutral
1  RT @ScottWalker: Didn't catch the full #GOPdeb...  Positive
2  RT @TJMShow: No mention of Tamir Rice and the ...   Neutral
3  RT @RobGeorge: That Carly Fiorina is trending ...  Positive
4  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  Positive
5  RT @GregAbbott_TX: @TedCruz: "On my first day ...  Positive
6  RT @warriorwoman91: I liked her and was happy ...  Negative
7  Going on #MSNBC Live with @ThomasARoberts arou...   Neutral
8  Deer in the headlights RT @lizzwinstead: Ben C...  Negative
9  RT @NancyOsborne180: Last night's debate prove...  Negative


In [22]:
def lstm():
    nn = Sequential()
    nn.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
    nn.add(SpatialDropout1D(dropout_1d))
    nn.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=recurrent_dropout))
    nn.add(Dense(2, activation='softmax'))
    nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    print(nn.summary())
    return nn

In [23]:
def conv():
    #https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/
    nn = Sequential()
    nn.add(Embedding(max_features, embed_dim, input_length= X.shape[1]))
    nn.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    nn.add(MaxPooling1D(pool_size=2))
    nn.add(Flatten())
    nn.add(Dense(250, activation='relu'))
    nn.add(Dense(2, activation='softmax'))
    nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(nn.summary())
    return nn

In [24]:
def evaluation(nn):
    nn.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)
    score, accuracy = nn.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (accuracy))

    pos_cnt, neg_cnt, pos_ok, neg_ok = 0, 0, 0, 0
    for x in range(len(X_validate)):
        result = nn.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
        if np.argmax(result) == np.argmax(Y_validate[x]):
            if np.argmax(Y_validate[x]) == 0: neg_ok += 1
            else: pos_ok += 1
        if np.argmax(Y_validate[x]) == 0: neg_cnt += 1
        else: pos_cnt += 1

    print("pos_acc", pos_ok/pos_cnt*100, "%")
    print("neg_acc", neg_ok/neg_cnt*100, "%")

    X2 = ['what are u going to say about that? the truth, wassock?!']
    X2 = tok.texts_to_sequences(X2)
    X2 = pad_sequences(X2, maxlen=26, dtype='int32', value=0)
    print(X2)
    print(nn.predict(X2, batch_size=1, verbose = 2)[0])

In [25]:
Y = pd.get_dummies(df['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = random_state)
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

In [26]:
nn = lstm()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 26, 256)           128000    
_________________________________________________________________
spatial_dropout1d_7 (Spatial (None, 26, 256)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 196)               355152    
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 394       
Total params: 483,546
Trainable params: 483,546
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
evaluation(nn)

Epoch 1/2
 - 38s - loss: 0.4289 - acc: 0.8198
Epoch 2/2
 - 37s - loss: 0.3591 - acc: 0.8531
score: 0.40
acc: 0.83
pos_acc 54.6875 %
neg_acc 91.58415841584159 %
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  48  37
  311 189   4 144  22  16   1 281]]
[0.8772479  0.12275206]


In [30]:
nn2 = conv()
evaluation(nn2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 26, 256)           128000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 26, 32)            24608     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 13, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 416)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 250)               104250    
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 502       
Total params: 257,360
Trainable params: 257,360
Non-trainable params: 0
_________________________________________________________________
None

In [11]:
from rdflib import Namespace, Graph, Literal
from rdflib.namespace import FOAF, OWL, XSD, RDFS, DCTERMS, DOAP, DC


prov = Namespace('http://www.w3.org/ns/prov#')
dcat = Namespace('http://www.w3.org/ns/dcat#')
mexalgo = Namespace('http://mex.aksw.org/mex-algo#')
mexperf = Namespace('http://mex.aksw.org/mex-perf#')
mexcore = Namespace('http://mex.aksw.org/mex-core#')
this = Namespace('http://mex.aksw.org/examples/')

In [24]:
g = Graph()
# Create Binding
g.bind('dct',DCTERMS)
g.bind('owl',OWL)
g.bind('foaf',FOAF)
g.bind('xsd', XSD)
g.bind('rdfs', RDFS)
g.bind('doap', DOAP)
g.bind('dc', DC)
g.bind('prov', prov)
g.bind('dcat', dcat)
g.bind('mexalgo',mexalgo)
g.bind('mexperf',mexperf)
g.bind('mexcore',mexcore)
g.bind('this',this)

In [25]:
g.add((this.nlp_ex3_nilutz,mexcore.Experiment, prov.Entity))
g.add((this.nlp_ex3_nilutz,mexcore.ApplicationContext, prov.Entity))
g.add((this.nlp_ex3_nilutz,DCTERMS.date, Literal('2018-05-25',datatype=XSD.date)))
g.add((this.nlp_ex3_nilutz,FOAF.givenName, Literal('Nico')))

#Configuration-1
g.add((this.configuration1,mexcore.ExperimentConfiguration, prov.Entity))
g.add((this.configuration1,prov.used, this.model1))
g.add((this.configuration1,prov.wasStartedBy, this.nlp_ex3_nilutz))

#Configuration-2
g.add((this.configuration2,mexcore.ExperimentConfiguration, prov.Entity))
g.add((this.configuration2,prov.used, this.model2))
g.add((this.configuration2,prov.wasStartedBy, this.nlp_ex3_nilutz))

#hyperparam1
g.add((this.hyerparameter_model1,mexalgo.HyperParameterCollection,prov.Entity))
g.add((this.hyerparameter1,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter11))


#hyperparam11
g.add((this.hyerparameter1,mexalgo.HyperParameter,prov.Entity))
g.add((this.hyerparameter1,RDFS.label, Literal('LSTM')))
g.add((this.hyerparameter1,DCTERMS.identifier, Literal('LSTM')))
g.add((this.hyerparameter1,prov.value, Literal('196',datatype=XSD.float)))



#hyperparam2
g.add((this.hyerparameter_model2,mexalgo.HyperParameterCollection,prov.Entity))
g.add((this.hyerparameter2,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyerparameter_model2,prov.hadMember,this.hyerparameter2))

g.add((this.hyerparameter2,mexalgo.HyperParameter,prov.Entity))
g.add((this.hyerparameter2,RDFS.label, Literal('conv')))
g.add((this.hyerparameter2,DCTERMS.identifier, Literal('conv')))
g.add((this.hyerparameter2,prov.value, Literal('26,32',datatype=XSD.float)))

#measure1
g.add((this.execution1,mexcore.ExecutionOverall,prov.Entity))
g.add((this.execution1,prov.generated,this.performance_measures1))
g.add((this.execution1,prov.used,this.test))
g.add((this.execution1,prov.used,this.hyerparameter_model1))
g.add((this.execution1,prov.used,this.model1))

g.add((this.performance_measures1,mexcore.PerformanceMeasure,prov.Entity))
g.add((this.performance_measures1,mexperf.score,Literal('0.40',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.accuracy,Literal('0.83',datatype=XSD.float)))
g.add((this.performance_measures1,prov.wasGeneratedBy,this.execution1))

#measure2
g.add((this.execution2,mexcore.ExecutionOverall,prov.Entity))
g.add((this.execution2,prov.generated,this.performance_measures2))
g.add((this.execution2,prov.used,this.test))
g.add((this.execution2,prov.used,this.model2))

g.add((this.performance_measures2,mexcore.PerformanceMeasure,prov.Entity))
g.add((this.performance_measures2,mexperf.score,Literal('0.40',datatype=XSD.float)))
g.add((this.performance_measures2,mexperf.accuracy,Literal('0.84',datatype=XSD.float)))
g.add((this.performance_measures2,prov.wasGeneratedBy,this.execution2))

#Model
g.add((this.model1,mexalgo.Algorithm,prov.Entity))
g.add((this.model1,RDFS.label,Literal('Keras-LSTM-NET')))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter1))

g.add((this.model2,mexalgo.Algorithm,prov.Entity))
g.add((this.model2,RDFS.label,Literal('Keras-CONV-NET')))
g.add((this.model2,mexalgo.hasHyperParameter,this.hyerparameter2))

In [26]:
g.serialize('task3.ttl')