In [21]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [22]:
# parameters
max_fatures = 500
embed_dim = 128
lstm_out = 196
dropout = 0.1
dropout_1d = 0.4
recurrent_dropout = 0.1
random_state = 4222
validation_size = 1000
batch_size = 16
epochs=2
verbose= 2

In [23]:
df = pd.read_csv('dataset_sentiment.csv')
df = df[['text','sentiment']]
print(df[0:10])

df = df[df.sentiment != "Neutral"]
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply(lambda x: x.replace('rt',' '))
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

                                                text sentiment
0  RT @NancyLeeGrahn: How did everyone feel about...   Neutral
1  RT @ScottWalker: Didn't catch the full #GOPdeb...  Positive
2  RT @TJMShow: No mention of Tamir Rice and the ...   Neutral
3  RT @RobGeorge: That Carly Fiorina is trending ...  Positive
4  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  Positive
5  RT @GregAbbott_TX: @TedCruz: "On my first day ...  Positive
6  RT @warriorwoman91: I liked her and was happy ...  Negative
7  Going on #MSNBC Live with @ThomasARoberts arou...   Neutral
8  Deer in the headlights RT @lizzwinstead: Ben C...  Negative
9  RT @NancyOsborne180: Last night's debate prove...  Negative


In [24]:
tok = Tokenizer(num_words=max_fatures, split=' ')
tok.fit_on_texts(df['text'].values)
X = tok.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

<h4>Baseline NN classifier</h4>

In [51]:
nn = Sequential()
nn.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
nn.add(SpatialDropout1D(dropout_1d))
nn.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=recurrent_dropout))
nn.add(Dense(2, activation='softmax'))
nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(nn.summary())

Y = pd.get_dummies(df['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = random_state)
nn.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score, accuracy = nn.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (accuracy))

pos_cnt, neg_cnt, pos_ok, neg_ok = 0, 0, 0, 0
for x in range(len(X_validate)):
    result = nn.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0: neg_ok += 1
        else: pos_ok += 1
    if np.argmax(Y_validate[x]) == 0: neg_cnt += 1
    else: pos_cnt += 1

print("pos_acc", pos_ok/pos_cnt*100, "%")
print("neg_acc", neg_ok/neg_cnt*100, "%")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 26, 128)           64000     
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 26, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 394       
Total params: 319,194
Trainable params: 319,194
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
 - 25s - loss: 0.4398 - acc: 0.8144
Epoch 2/2
 - 23s - loss: 0.3636 - acc: 0.8446
score: 0.35
acc: 0.86
pos_acc 37.073170731707314 %
neg_acc 97.48427672955975 %


<h4>Improve the baseline by inserting RNN cells</h4>

In [60]:
nn = Sequential()
nn.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
nn.add(SimpleRNN(units=3))
nn.add(Dense(2, activation='softmax'))
nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(nn.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 26, 128)           64000     
_________________________________________________________________
simple_rnn_22 (SimpleRNN)    (None, 3)                 396       
_________________________________________________________________
dense_16 (Dense)             (None, 2)                 8         
Total params: 64,404
Trainable params: 64,404
Non-trainable params: 0
_________________________________________________________________
None


<h4>Train and test improved Neural Network</h4>

In [61]:
Y = pd.get_dummies(df['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = random_state)
nn.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score, accuracy = nn.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (accuracy))

pos_cnt, neg_cnt, pos_ok, neg_ok = 0, 0, 0, 0
for x in range(len(X_validate)):
    result = nn.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0: neg_ok += 1
        else: pos_ok += 1
    if np.argmax(Y_validate[x]) == 0: neg_cnt += 1
    else: pos_cnt += 1

print("pos_acc", pos_ok/pos_cnt*100, "%")
print("neg_acc", neg_ok/neg_cnt*100, "%")

Epoch 1/2
 - 5s - loss: 0.4602 - acc: 0.8156
Epoch 2/2
 - 4s - loss: 0.3774 - acc: 0.8525
score: 0.39
acc: 0.85
pos_acc 39.51219512195122 %
neg_acc 95.22012578616352 %


<h4>Export metadata</h4>

In [64]:
from rdflib import Namespace, Graph, Literal
from rdflib.namespace import FOAF, OWL, XSD, RDFS, DCTERMS, DOAP, DC, RDF


prov = Namespace('http://www.w3.org/ns/prov#')
dcat = Namespace('http://www.w3.org/ns/dcat#')
mexalgo = Namespace('http://mex.aksw.org/mex-algo#')
mexperf = Namespace('http://mex.aksw.org/mex-perf#')
mexcore = Namespace('http://mex.aksw.org/mex-core#')
this = Namespace('http://mex.aksw.org/examples/')

g = Graph()
# Create Binding
g.bind('dct',DCTERMS)
g.bind('owl',OWL)
g.bind('foaf',FOAF)
g.bind('xsd', XSD)
g.bind('rdfs', RDFS)
g.bind('doap', DOAP)
g.bind('dc', DC)
g.bind('prov', prov)
g.bind('dcat', dcat)
g.bind('mexalgo',mexalgo)
g.bind('mexperf',mexperf)
g.bind('mexcore',mexcore)
g.bind('this',this)

g.add((this.pielka_task3,RDF.type,mexcore.Experiment))
g.add((this.pielka_task3,RDF.type,mexcore.ApplicationContext))
g.add((this.pielka_task3,RDFS.label, Literal('2468882')))
g.add((this.pielka_task3,DCTERMS.date, Literal('2018-05-30',datatype=XSD.date)))
g.add((this.pielka_task3,FOAF.givenName, Literal('Maren')))
g.add((this.pielka_task3,FOAF.mbox, Literal('maren.pielka@gmx.de')))

#Model-1: Baseline
g.add((this.configuration1,RDF.type,mexcore.ExperimentConfiguration))
g.add((this.configuration1,prov.used, this.model1))
g.add((this.configuration1,prov.wasStartedBy, this.pielka_task3))

#Model-2: Improved model
g.add((this.configuration2,RDF.type,mexcore.ExperimentConfiguration))
g.add((this.configuration2,prov.used, this.model2))
g.add((this.configuration2,prov.wasStartedBy, this.pielka_task3))

g.add((this.test,RDF.type,mexcore.Test))
g.add((this.test,RDFS.label,Literal('Test')))

g.add((this.hyerparameter_model1,RDF.type,mexalgo.HyperParameterCollection))
g.add((this.hyerparameter1,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter1))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter2))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter3))


g.add((this.hyerparameter_model2,RDF.type,mexalgo.HyperParameterCollection))
g.add((this.hyerparameter_model2,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyerparameter_model2,prov.hadMember,this.hyerparameter4))


g.add((this.hyerparameter1,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter1,RDFS.label, Literal('embedding_dimension')))
g.add((this.hyerparameter1,DCTERMS.identifier, Literal('embedding_dimension')))
g.add((this.hyerparameter1,prov.value, Literal('128',datatype=XSD.integer)))

g.add((this.hyerparameter2,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter2,RDFS.label, Literal('dropout')))
g.add((this.hyerparameter2,DCTERMS.identifier, Literal('dropout')))
g.add((this.hyerparameter2,prov.value, Literal('0.1',datatype=XSD.float)))

g.add((this.hyerparameter3,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter3,RDFS.label, Literal('recurrent_dropout')))
g.add((this.hyerparameter3,DCTERMS.identifier, Literal('recurrent_dropout')))
g.add((this.hyerparameter3,prov.value, Literal('0.1',datatype=XSD.float)))


g.add((this.hyerparameter4,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter4,RDFS.label, Literal('rnn_units')))
g.add((this.hyerparameter4,DCTERMS.identifier, Literal('rnn_units')))
g.add((this.hyerparameter4,prov.value, Literal('2', datatype=XSD.integer)))


g.add((this.dataset,RDF.type,mexcore.Dataset))
g.add((this.dataset,RDFS.label,Literal('Sentiment-Classification')))
g.add((this.dataset,DCTERMS.landingPage,Literal('https://github.com/SmartDataAnalytics/MA-INF-4222-NLP-Lab/blob/master/2018_SoSe/exercises/dataset_sentiment.csv')))


g.add((this.execution1,RDF.type,mexcore.ExecutionOverall))
g.add((this.execution1,prov.generated,this.performance_measures1))
g.add((this.execution1,prov.used,this.test))
g.add((this.execution1,prov.used,this.hyerparameter_model1))
g.add((this.execution1,prov.used,this.model1))

g.add((this.execution2,RDF.type,mexcore.ExecutionOverall))
g.add((this.execution2,prov.generated,this.performance_measures2))
g.add((this.execution2,prov.used,this.test))
g.add((this.execution2,prov.used,this.hyerparameter_model2))
g.add((this.execution2,prov.used,this.model2))


g.add((this.performance_measures1,RDF.type,mexcore.PerformanceMeasure))
g.add((this.performance_measures1,mexperf.score,Literal('0.35',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.accuracy,Literal('0.86',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.pos_accuracy,Literal('0.37',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.neg_accuracy,Literal('0.97',datatype=XSD.float)))
g.add((this.performance_measures1,prov.wasGeneratedBy,this.execution1))

g.add((this.performance_measures2,RDF.type,mexcore.PerformanceMeasure))
g.add((this.performance_measures2,mexperf.score,Literal('0.39',datatype=XSD.float)))
g.add((this.performance_measures2,mexperf.accuracy,Literal('0.85',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.pos_accuracy,Literal('0.40',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.neg_accuracy,Literal('0.95',datatype=XSD.float)))
g.add((this.performance_measures2,prov.wasGeneratedBy,this.execution2))


g.add((this.model1,RDF.type,mexalgo.Algorithm))
g.add((this.model1,RDFS.label,Literal('LSTM')))
g.add((this.model1,DCTERMS.identifier,Literal('LSTM')))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter1))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter2))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter3))

g.add((this.model2,RDF.type,mexalgo.Algorithm))
g.add((this.model2,RDFS.label,Literal('TemporalConvolution')))
g.add((this.model2,DCTERMS.identifier,Literal('TemporalConvolution')))
g.add((this.model2,mexalgo.hasHyperParameter,this.hyerparameter4))

with open('task3_metadata.ttl','wb') as f:
    f.write(g.serialize(format='turtle'))