## Sentiment Analysis using Keras

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Convolution1D, MaxPooling1D, Flatten
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


In [2]:
def lstm_model(max_features, embed_dim, sequence_length, recurrent_dropout):
    
    nn = Sequential()
    nn.add(Embedding(max_features, embed_dim, input_length = sequence_length))
    nn.add(SpatialDropout1D(dropout_1d))
    nn.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=recurrent_dropout))
    nn.add(Dense(2, activation='softmax'))
    nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    
    return nn

def conv_model(max_features, embed_dim, sequence_length, num_filters, kernel_size):
    
    nn = Sequential()
    nn.add(Embedding(max_features, embed_dim, input_length = sequence_length))
    nn.add(Convolution1D(filters=num_filters, kernel_size=kernel_size, padding="valid", activation="relu", strides=1))
    nn.add(MaxPooling1D(pool_size=2))
    nn.add(Flatten())
    nn.add(Dense(2, activation='softmax'))
    nn.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    
    return nn

def evaluate_model(nn, X_test, Y_test, X_validate, Y_validate, batch_size):
    score, accuracy = nn.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
    print("score: %.2f" % (score))
    print("acc: %.2f" % (accuracy))    
    pos_cnt, neg_cnt, pos_ok, neg_ok = 0, 0, 0, 0
    for x in range(len(X_validate)):
        result = nn.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
        if np.argmax(result) == np.argmax(Y_validate[x]):
            if np.argmax(Y_validate[x]) == 0: neg_ok += 1
            else: pos_ok += 1
        if np.argmax(Y_validate[x]) == 0: neg_cnt += 1
        else: pos_cnt += 1

    print("pos_acc", pos_ok/pos_cnt*100, "%")
    print("neg_acc", neg_ok/neg_cnt*100, "%")
    X2 = ['what are u going to say about that? the truth, wassock?!']
    X2 = tok.texts_to_sequences(X2)
    print('Sentence to ID')
    X2 = pad_sequences(X2, maxlen=26, dtype='int32', value=0)
    print(X2)
    print('Model-Prediction')
    print(nn.predict(X2, batch_size=1, verbose = 2)[0])

In [3]:
# parameters 
max_features = 500
embed_dim = 128
lstm_out = 196
dropout = 0.1
dropout_1d = 0.4
recurrent_dropout = 0.1
random_state = 4222
validation_size = 1000
batch_size = 16
epochs=2
verbose= 2

In [5]:
# Read and preprocess data

df = pd.read_csv('dataset_sentiment.csv')
df = df[['text','sentiment']]

df = df[df.sentiment != "Neutral"]
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply(lambda x: x.replace('rt',' '))
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    
tok = Tokenizer(num_words=max_features, split=' ')
tok.fit_on_texts(df['text'].values)
X = tok.texts_to_sequences(df['text'].values)
X = pad_sequences(X)
Y = pd.get_dummies(df['sentiment']).values

In [12]:
# Split Dataset

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = random_state)

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

In [13]:
sequence_length = X.shape[1]
nn = lstm_model(max_features, embed_dim, sequence_length, recurrent_dropout)
nn.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)
evaluate_model(nn, X_test, Y_test, X_validate, Y_validate, batch_size)

Epoch 1/2
13s - loss: 0.4374 - acc: 0.8148
Epoch 2/2
13s - loss: 0.3659 - acc: 0.8461
score: 0.36
acc: 0.85
pos_acc 40.0 %
neg_acc 97.10691823899371 %
Sentence to ID
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  48  37
  311 189   4 144  22  16   1 281]]
Model-Prediction
[ 0.95043731  0.04956275]


In [14]:
# Improvement in terms of computation as well as performance, achieved by using Convolution layer
nn1 = conv_model(max_features, embed_dim, sequence_length, num_filters=12, kernel_size=3)
nn1.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)
evaluate_model(nn1, X_test, Y_test, X_validate, batch_size)

Epoch 1/2
1s - loss: 0.4342 - acc: 0.8140
Epoch 2/2
1s - loss: 0.3289 - acc: 0.8579
score: 0.36
acc: 0.85
pos_acc 45.36585365853659 %
neg_acc 95.34591194968553 %
Sentence to ID
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  48  37
  311 189   4 144  22  16   1 281]]
Model-Prediction
[ 0.92360032  0.07639971]


In [2]:
from rdflib import Namespace, Graph, Literal
from rdflib.namespace import FOAF, OWL, XSD, RDFS, DCTERMS, DOAP, DC, RDF


prov = Namespace('http://www.w3.org/ns/prov#')
dcat = Namespace('http://www.w3.org/ns/dcat#')
mexalgo = Namespace('http://mex.aksw.org/mex-algo#')
mexperf = Namespace('http://mex.aksw.org/mex-perf#')
mexcore = Namespace('http://mex.aksw.org/mex-core#')
this = Namespace('http://mex.aksw.org/examples/')

g = Graph()
# Create Binding
g.bind('dct',DCTERMS)
g.bind('owl',OWL)
g.bind('foaf',FOAF)
g.bind('xsd', XSD)
g.bind('rdfs', RDFS)
g.bind('doap', DOAP)
g.bind('dc', DC)
g.bind('prov', prov)
g.bind('dcat', dcat)
g.bind('mexalgo',mexalgo)
g.bind('mexperf',mexperf)
g.bind('mexcore',mexcore)
g.bind('this',this)

g.add((this.khan_task3,RDF.type,mexcore.Experiment))
g.add((this.khan_task3,RDF.type,mexcore.ApplicationContext))
g.add((this.khan_task3,RDFS.label, Literal('2255383')))
g.add((this.khan_task3,DCTERMS.date, Literal('2018-05-15',datatype=XSD.date)))
g.add((this.khan_task3,FOAF.givenName, Literal('Asif')))
g.add((this.khan_task3,FOAF.mbox, Literal('mak4086@gmail.com')))

#Model-1
g.add((this.configuration1,RDF.type,mexcore.ExperimentConfiguration))
g.add((this.configuration1,prov.used, this.model1))
g.add((this.configuration1,prov.wasStartedBy, this.khan_task3))

#Model-2
g.add((this.configuration2,RDF.type,mexcore.ExperimentConfiguration))
g.add((this.configuration2,prov.used, this.model2))
g.add((this.configuration2,prov.wasStartedBy, this.khan_task3))

g.add((this.test,RDF.type,mexcore.Test))
g.add((this.test,RDFS.label,Literal('Test')))

g.add((this.hyerparameter_model1,RDF.type,mexalgo.HyperParameterCollection))
g.add((this.hyerparameter1,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter1))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter2))
g.add((this.hyerparameter_model1,prov.hadMember,this.hyerparameter3))


g.add((this.hyerparameter_model2,RDF.type,mexalgo.HyperParameterCollection))
g.add((this.hyerparameter_model2,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyerparameter_model2,prov.hadMember,this.hyerparameter4))
g.add((this.hyerparameter_model2,prov.hadMember,this.hyerparameter5))
g.add((this.hyerparameter_model2,prov.hadMember,this.hyerparameter6))


g.add((this.hyerparameter1,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter1,RDFS.label, Literal('embedding_dimension')))
g.add((this.hyerparameter1,DCTERMS.identifier, Literal('embedding_dimension')))
g.add((this.hyerparameter1,prov.value, Literal('128',datatype=XSD.integer)))

g.add((this.hyerparameter2,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter2,RDFS.label, Literal('dropout')))
g.add((this.hyerparameter2,DCTERMS.identifier, Literal('dropout')))
g.add((this.hyerparameter2,prov.value, Literal('0.1',datatype=XSD.float)))

g.add((this.hyerparameter3,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter3,RDFS.label, Literal('recurrent_dropout')))
g.add((this.hyerparameter3,DCTERMS.identifier, Literal('recurrent_dropout')))
g.add((this.hyerparameter3,prov.value, Literal('0.1',datatype=XSD.float)))


g.add((this.hyerparameter4,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter4,RDFS.label, Literal('embedding_dimension')))
g.add((this.hyerparameter4,DCTERMS.identifier, Literal('embedding_dimension')))
g.add((this.hyerparameter4,prov.value, Literal('128', datatype=XSD.integer)))

g.add((this.hyerparameter5,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter5,RDFS.label, Literal('convolution_filters')))
g.add((this.hyerparameter5,DCTERMS.identifier, Literal('convolution_filters')))
g.add((this.hyerparameter5,prov.value, Literal('12',datatype=XSD.integer)))

g.add((this.hyerparameter6,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter6,RDFS.label, Literal('filter_size')))
g.add((this.hyerparameter6,DCTERMS.identifier, Literal('filter_size')))
g.add((this.hyerparameter6,prov.value, Literal('3',datatype=XSD.integer)))


g.add((this.dataset,RDF.type,mexcore.Dataset))
g.add((this.dataset,RDFS.label,Literal('Sentiment-Classification')))
g.add((this.dataset,DCTERMS.landingPage,Literal('https://github.com/SmartDataAnalytics/MA-INF-4222-NLP-Lab/blob/master/2018_SoSe/exercises/dataset_sentiment.csv')))


g.add((this.execution1,RDF.type,mexcore.ExecutionOverall))
g.add((this.execution1,prov.generated,this.performance_measures1))
g.add((this.execution1,prov.used,this.test))
g.add((this.execution1,prov.used,this.hyerparameter_model1))
g.add((this.execution1,prov.used,this.model1))

g.add((this.execution2,RDF.type,mexcore.ExecutionOverall))
g.add((this.execution2,prov.generated,this.performance_measures2))
g.add((this.execution2,prov.used,this.test))
g.add((this.execution2,prov.used,this.hyerparameter_model2))
g.add((this.execution2,prov.used,this.model2))


g.add((this.performance_measures1,RDF.type,mexcore.PerformanceMeasure))
g.add((this.performance_measures1,mexperf.score,Literal('0.36',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.accuracy,Literal('0.85',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.pos_accuracy,Literal('0.40',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.neg_accuracy,Literal('0.97',datatype=XSD.float)))
g.add((this.performance_measures1,prov.wasGeneratedBy,this.execution1))

g.add((this.performance_measures2,RDF.type,mexcore.PerformanceMeasure))
g.add((this.performance_measures2,mexperf.score,Literal('0.36',datatype=XSD.float)))
g.add((this.performance_measures2,mexperf.accuracy,Literal('0.85',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.pos_accuracy,Literal('0.45',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.neg_accuracy,Literal('0.95',datatype=XSD.float)))
g.add((this.performance_measures2,prov.wasGeneratedBy,this.execution2))


g.add((this.model1,RDF.type,mexalgo.Algorithm))
g.add((this.model1,RDFS.label,Literal('LSTM')))
g.add((this.model1,DCTERMS.identifier,Literal('LSTM')))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter1))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter2))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyerparameter3))

g.add((this.model2,RDF.type,mexalgo.Algorithm))
g.add((this.model2,RDFS.label,Literal('TemporalConvolution')))
g.add((this.model2,DCTERMS.identifier,Literal('TemporalConvolution')))
g.add((this.model2,mexalgo.hasHyperParameter,this.hyerparameter4))
g.add((this.model2,mexalgo.hasHyperParameter,this.hyerparameter5))
g.add((this.model2,mexalgo.hasHyperParameter,this.hyerparameter6))

with open('task3_metadata.ttl','wb') as f:
    f.write(g.serialize(format='turtle'))