# Sentiment Analysis

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, MaxPooling1D
from keras.layers import Conv1D, GlobalMaxPooling1D, Activation, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from IPython import display
import re

### Load data

In [3]:
df = pd.read_csv('data/dataset_sentiment.csv')
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [4]:
df = df[['text', 'sentiment']]

display.display(df['text'].describe())
display.display(df['sentiment'].describe())

count                                                 13871
unique                                                10402
top       RT @RWSurferGirl: Jeb Bush reminds me of eleva...
freq                                                    161
Name: text, dtype: object

count        13871
unique           3
top       Negative
freq          8493
Name: sentiment, dtype: object

In [5]:
df['sentiment'].value_counts()

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [6]:
# Preprocess the data

# take only negative and positive sentiments
df = df[df.sentiment != 'Neutral']

# make lowercase text
df['text'] = df['text'].apply(lambda x: x.lower())

# remove RT from text
df['text'] = df['text'].apply(lambda x: x.replace('rt', ' '))

# remove everything except letters, numbers and space
df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))

In [7]:
df.head()

Unnamed: 0,text,sentiment
1,scottwalker didnt catch the full gopdebate l...,Positive
3,robgeorge that carly fiorina is trending ho...,Positive
4,danscavino gopdebate w realdonaldtrump deliv...,Positive
5,gregabbott_tx tedcruz on my first day i will...,Positive
6,warriorwoman91 i liked her and was happy whe...,Negative


In [8]:
df['text'].values

array(['  scottwalker didnt catch the full gopdebate last night here are some of scotts best lines in 90 seconds walker16 httptcozsff',
       '  robgeorge that carly fiorina is trending  hours after her debate  above any of the men in justcompleted gopdebate says shes on ',
       '  danscavino gopdebate w realdonaldtrump delivered the highest ratings in the history of presidential debates trump2016 httptco',
       ...,
       '  lrihendry tedcruz as president i will always tell the truth and do what i said i would do  gopdebates',
       '  jrehling gopdebate donald trump says that he doesnt have time for political correctness how does calling women fat pigs save him ',
       '  lrihendry tedcruz headed into the presidential debates go ted \n\ngopdebates httptco8s67pz8a4a'],
      dtype=object)

### Build a model

In [9]:
def train_and_evaluate_model(model, 
                             X,
                             Y,
                             epochs, 
                             batch_size,
                             tok):
    """Trains model and prints results.
    
    Args:
        model: Keras `Sequential` model,
        X:
        Y:
        epochs: `int` number of epochs
        batch_size: `int` batch size,
        max_features: `int` maximum features
    """
    
    validation_size = 1000
    # split data set
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                        test_size=0.3,
                                                        random_state=4222)
    X_validate = X_test[-validation_size:]
    Y_validate = Y_test[-validation_size:]
    X_test = X_test[:-validation_size]
    Y_test = Y_test[:-validation_size]
    
    model.fit(X_train, Y_train, epochs=epochs, 
              batch_size=batch_size, verbose=2)
    
    score, accuracy = model.evaluate(X_test, Y_test, 
                                 batch_size=batch_size,
                                 verbose=2)
    print("Score: %.2f" % score)
    print("Accuracy: %.2f" % accuracy)  
    
    pos_cnt, neg_cnt, pos_ok, neg_ok = 0, 0, 0, 0
    for x in range(len(X_validate)):
        result = model.predict(
            X_validate[x].reshape(1, X_test.shape[1]),
            batch_size=1,
            verbose=2
        )[0]

        if np.argmax(result) == np.argmax(Y_validate[x]):
            if np.argmax(Y_validate[x]) == 0: 
                neg_ok += 1
            else:
                pos_ok += 1

        if np.argmax(Y_validate[x]) == 0:
            neg_cnt += 1
        else:
            pos_cnt += 1

    print("pos_acc", pos_ok/pos_cnt*100, '%')
    print("neg_acc", neg_ok/neg_cnt*100, '%')
    
#     X2 = ['what are u going to say about that? the truth, wassock?!']
#     X2 = tok.texts_to_sequences(X2)
#     X2 = pad_sequences(X2, maxlen=maxlen, dtype='int32', value=0)
#     print(X2)
#     print(model.predict(X2, batch_size=1, verbose=2)[0])

### Default Model

In [10]:
def default_model(df):
    """Trains model.
    Args:
        df: A Pandas `DataFrame`.
    """
    
    maxlen = 100
    embed_dim = 128
    lstm_out = 196
    dropout = 0.1
    dropout_1d = 0.4
    recurrent_dropout = 0.1
    batch_size = 16
    epochs = 2
    verbose = 2
    max_features = 500
    
    tok = Tokenizer(num_words=max_features, split=' ')
    tok.fit_on_texts(df['text'].values)
    X = tok.texts_to_sequences(df['text'].values)
    X = pad_sequences(X)
    
    # convert categorical variable into indicator variables
    Y = pd.get_dummies(df['sentiment']).values
    
    # Build a model
    model = Sequential()

    # Add embedding
    model.add(Embedding(max_features, embed_dim, 
                    input_length=X.shape[1]))

    # Add Spatial Dropout
    model.add(SpatialDropout1D(dropout_1d))

    # Add LSTM
    model.add(LSTM(lstm_out, 
               dropout=dropout, 
               recurrent_dropout=recurrent_dropout))

    # Add output
    model.add(Dense(2, activation='softmax'))

    # compile model
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    print(model.summary())
    
    train_and_evaluate_model(model, 
                             X,
                             Y,
                             epochs,
                             batch_size,
                             tok)

In [11]:
default_model(df)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 26, 128)           64000     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 26, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 319,194
Trainable params: 319,194
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
 - 44s - loss: 0.4363 - acc: 0.8168
Epoch 2/2
 - 45s - loss: 0.3684 - acc: 0.8437
Score: 0.36
Accuracy: 0.86
pos_acc 35.12195121951219 %
neg_acc 97.86163522012579 %


### Recurrent Convolutional Neural Network Model

In [12]:
def rcnn(df):
    """Trains model.
    Args:
        df: A Pandas `DataFrame`.
    """
    
    # Embedding
    max_features = 5000
    maxlen = 100
    embedding_size = 256
    
    # Convolution
    kernel_size = 5
    filters = 64
    pool_size = 4
    
    # LSTM
    lstm_output_size = 70
    
    # Training
    batch_size = 32
    epochs = 2
    
    tok = Tokenizer(num_words=max_features, split=' ')
    tok.fit_on_texts(df['text'].values)
    X = tok.texts_to_sequences(df['text'].values)
    X = pad_sequences(X)
    
    # convert categorical variable into indicator variables
    Y = pd.get_dummies(df['sentiment']).values
    
    # Build a model
    model = Sequential()

    # Add embedding
    model.add(Embedding(max_features, embedding_size, 
                    input_length=X.shape[1]))

    # Add Dropout
    model.add(Dropout(0.1))
    
    # Add Convolution
    model.add(Conv1D(
        filters,
        kernel_size,
        padding='valid',
        activation='relu',
        strides=1
    ))
    
    # Add max pooling
    model.add(MaxPooling1D(pool_size=pool_size))

    # Add LSTM
    model.add(LSTM(lstm_output_size))

    # Add output
    model.add(Dense(2, activation='softmax'))

    # compile model
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    print(model.summary())
    
    train_and_evaluate_model(model, 
                             X,
                             Y,
                             epochs,
                             batch_size,
                             tok)

In [13]:
rcnn(df)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 29, 256)           1280000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 29, 256)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 25, 64)            81984     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 6, 64)             0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 70)                37800     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 142       
Total params: 1,399,926
Trainable params: 1,399,926
Non-trainable params: 0
_________________________________________________________________


##### Exporting mex

In [14]:
from rdflib import Namespace, Graph, Literal
from rdflib.namespace import FOAF, OWL, XSD, RDFS, DCTERMS, DOAP, DC, RDF

prov = Namespace('http://www.w3.org/ns/prov#')
dcat = Namespace('http://www.w3.org/ns/dcat#')
mexalgo = Namespace('http://mex.aksw.org/mex-algo#')
mexperf = Namespace('http://mex.aksw.org/mex-perf#')
mexcore = Namespace('http://mex.aksw.org/mex-core#')
this = Namespace('http://mex.aksw.org/examples/')

In [15]:
g = Graph()
g.bind('dct',DCTERMS)
g.bind('owl',OWL)
g.bind('foaf',FOAF)
g.bind('xsd', XSD)
g.bind('rdfs', RDFS)
g.bind('doap', DOAP)
g.bind('dc', DC)
g.bind('prov', prov)
g.bind('dcat', dcat)
g.bind('mexalgo',mexalgo)
g.bind('mexperf',mexperf)
g.bind('mexcore',mexcore)
g.bind('this',this)

g.add((this.torayeff_exp_rcnn,RDF.type,mexcore.Experiment))
g.add((this.torayeff_exp_rcnn,RDF.type,mexcore.ApplicationContext))
g.add((this.torayeff_exp_rcnn,RDFS.label, Literal('3067341')))
g.add((this.torayeff_exp_rcnn,DCTERMS.date, Literal('2018-05-28',datatype=XSD.date)))
g.add((this.torayeff_exp_rcnn,FOAF.givenName, Literal('Agajan')))
g.add((this.torayeff_exp_rcnn,FOAF.mbox, Literal('torayevagajan@gmail.com')))

# RCNN model
g.add((this.configuration_rcnn,RDF.type,mexcore.ExperimentConfiguration))
g.add((this.configuration_rcnn,prov.used, this.model2))
g.add((this.configuration_rcnn,prov.wasStartedBy, this.torayeff_exp_rcnn))

g.add((this.test,RDF.type,mexcore.Test))
g.add((this.test,RDFS.label,Literal('Test')))

# hyperparameters
g.add((this.hyperparameter_model_rcnn,RDF.type,mexalgo.HyperParameterCollection))
g.add((this.hyperparameter_model_rcnn,RDFS.label,Literal('HyperParameterCollection')))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter1))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter2))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter3))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter4))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter5))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter6))
g.add((this.hyperparameter_model_rcnn,prov.hadMember,this.hyerparameter7))

g.add((this.hyerparameter1,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter1,RDFS.label, Literal('max_features')))
g.add((this.hyerparameter1,DCTERMS.identifier, Literal('max_features')))
g.add((this.hyerparameter1,prov.value, Literal('5000',datatype=XSD.integer)))

g.add((this.hyerparameter2,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter2,RDFS.label, Literal('max_len')))
g.add((this.hyerparameter2,DCTERMS.identifier, Literal('max_len')))
g.add((this.hyerparameter2,prov.value, Literal('100',datatype=XSD.integer)))

g.add((this.hyerparameter3,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter3,RDFS.label, Literal('embedding_size')))
g.add((this.hyerparameter3,DCTERMS.identifier, Literal('embedding_size')))
g.add((this.hyerparameter3,prov.value, Literal('256',datatype=XSD.integer)))

g.add((this.hyerparameter4,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter4,RDFS.label, Literal('kernel_size')))
g.add((this.hyerparameter4,DCTERMS.identifier, Literal('kernel_size')))
g.add((this.hyerparameter4,prov.value, Literal('5',datatype=XSD.integer)))

g.add((this.hyerparameter5,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter5,RDFS.label, Literal('filters')))
g.add((this.hyerparameter5,DCTERMS.identifier, Literal('filters')))
g.add((this.hyerparameter5,prov.value, Literal('64',datatype=XSD.integer)))

g.add((this.hyerparameter6,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter6,RDFS.label, Literal('pool_size')))
g.add((this.hyerparameter6,DCTERMS.identifier, Literal('pool_size')))
g.add((this.hyerparameter6,prov.value, Literal('4',datatype=XSD.integer)))

g.add((this.hyerparameter7,RDF.type,mexalgo.HyperParameter))
g.add((this.hyerparameter7,RDFS.label, Literal('lstm_output_size')))
g.add((this.hyerparameter7,DCTERMS.identifier, Literal('lstm_output_size')))
g.add((this.hyerparameter7,prov.value, Literal('70',datatype=XSD.integer)))

# Dataset
g.add((this.dataset,RDF.type,mexcore.Dataset))
g.add((this.dataset,RDFS.label,Literal('Sentiment-Classification')))
g.add((this.dataset,DCTERMS.landingPage,Literal('https://github.com/SmartDataAnalytics/MA-INF-4222-NLP-Lab/blob/master/2018_SoSe/exercises/dataset_sentiment.csv')))

# Execution
g.add((this.execution1,RDF.type,mexcore.ExecutionOverall))
g.add((this.execution1,prov.generated,this.performance_measures1))
g.add((this.execution1,prov.used,this.test))
g.add((this.execution1,prov.used,this.hyerparameter_model_rcnn))
g.add((this.execution1,prov.used,this.model_rcnn))

g.add((this.performance_measures1,RDF.type,mexcore.PerformanceMeasure))
g.add((this.performance_measures1,mexperf.score,Literal('0.35',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.accuracy,Literal('0.85',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.pos_accuracy,Literal('0.51',datatype=XSD.float)))
g.add((this.performance_measures1,mexperf.neg_accuracy,Literal('0.94',datatype=XSD.float)))
g.add((this.performance_measures1,prov.wasGeneratedBy,this.execution1))

g.add((this.model1,RDF.type,mexalgo.Algorithm))
g.add((this.model1,RDFS.label,Literal('Embedding')))
g.add((this.model1,DCTERMS.identifier,Literal('Embedding')))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter1))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter2))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter3))

g.add((this.model1,RDF.type,mexalgo.Algorithm))
g.add((this.model1,RDFS.label,Literal('Convolution1D')))
g.add((this.model1,DCTERMS.identifier,Literal('Convolution1D')))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter4))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter5))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter6))

g.add((this.model1,RDF.type,mexalgo.Algorithm))
g.add((this.model1,RDFS.label,Literal('LSTM')))
g.add((this.model1,DCTERMS.identifier,Literal('LSTM')))
g.add((this.model1,mexalgo.hasHyperParameter,this.hyperparameter7))

with open('sentiment_exp.ttl','wb') as f:
    f.write(g.serialize(format='turtle'))