In [1]:
import sys,os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
pd.set_option('display.max_colwidth', 300)



In [2]:
TWITTER_FILE = 'training-Obama-Romney-tweets.xlsx'

# read the data
obama_data = pd.read_excel(TWITTER_FILE,names = ['date','time','text','sentiment'],parse_cols = 4,sheetname = 'Obama')
romney_data = pd.read_excel(TWITTER_FILE,names = ['date','time','text','sentiment'],parse_cols = 4,sheetname = 'Romney')

def get_data(data):
    """ get and clean the data """
    data = data.iloc[1:]
    data['text'] = data['text'].values.astype('unicode')
    data['date'] = data['date'].values.astype('str')
    data['time'] = data['time'].values.astype('unicode')
    # remove rows with mixed sentiment
    data = data[data['sentiment'] < 2]
    data.index = range(len(data))
    
    return data

obama_data = get_data(obama_data)
romney_data = get_data(romney_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Emoticon Converter

In [3]:
import re

emoticon_dictionary = {':)':' smileyface ','(:':' smileyface ','XD': ' happyface ',':D': ' smileyface ','>.<':' smileyface ',':-)':' smileyface ',';)':' winkface ',';D':' winkface ',':\'(':' cryingface '}

emoticons = [':\)','\(:','XD',':D','>\.<',':-\)',';\)',';D',':\'\(']

emoticon_pattern = re.compile(r'(' + '\s*|\s*'.join(emoticons) + r')')

# convert emoticons to words
def emoticon_converter(x):
    x = emoticon_pattern.sub(lambda i : emoticon_dictionary[i.group().replace(' ','')],x)   
    return x

obama_data['text'] = obama_data['text'].apply(emoticon_converter)
romney_data['text'] = romney_data['text'].apply(emoticon_converter)


### Hashtag Separator

In [4]:
def separate_hashtag(x):
    x = x.split()
    temp = []
    for i,word in enumerate(x):
        if '#' in word:
            if any(w.isupper for w in word):
                temp += re.findall('[A-Z][^A-Z]*',word)
            else:
                # Should add code
                pass
        else:
            temp.append(word)
    
    return ' '.join(temp)

obama_data['text'] = obama_data['text'].apply(separate_hashtag)
romney_data['text'] = romney_data['text'].apply(separate_hashtag)

### Clean data

In [5]:
# remove punctuations
punc = ['\:','\;','\?','\$','\.','\(','\)','\#','\=','\%','\-','\>','\<','\,','\"','\\','\&']
cond_1 = re.compile('|'.join(punc))
# remove tags
tags = ['<a>','</a>','<e>','</e>']
cond_2 = re.compile("|".join(tags))

def preprocess(data):
    """ preprocess the data"""
     # remove users
    data = data.apply(lambda x : re.sub(r'\@\s?\w+','',x))
    # remove hypertext 
    data = data.apply(lambda x : re.sub(r'http://\S+','',x))
    # remove tags
    data = data.apply(lambda x : re.sub(cond_2,'',x))
    # remove punctuations
    data = data.apply(lambda x : re.sub(cond_1,'',x))
    # remove digits
    data = data.apply(lambda x : re.sub(r'[0-9]+','',x))
    # convert to ascii
    data = data.apply(lambda x: x.encode('utf-8'))
    
    return data

obama_data['text'] = preprocess(obama_data['text'])
romney_data['text'] = preprocess(romney_data['text'])

### Stopwords Removal

In [6]:
import nltk

from nltk.corpus import stopwords

manual_stopwords_list = ['RT','MT']
stopwords_list = stopwords.words('english') + manual_stopwords_list


# stopwords list based on pos tags

remove_tags_nltkpos = ['IN','DT','PRP','CC']


def pos_tag_filter(x):
    x = x.split()
    s = nltk.pos_tag(x)
    for i,(_,tag) in enumerate(s):
        if tag in remove_tags_nltkpos:
            x[i] = ''
    return ' '.join(x)
    

# obama_data['text'] = obama_data['text'].apply(pos_tag_filter)
romney_data['text'] = romney_data['text'].apply(pos_tag_filter)

In [7]:
# to be checked 
obama_data['text'] = obama_data['text'].apply(lambda x : x.lower())
romney_data['text'] = romney_data['text'].apply(lambda x : x.lower())

### Tokenizer

In [8]:
# stemming
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

class WordTokenizer(object):
    def __init__(self,stemmer='porter'):
        self.stemmer = stemmer
        if stemmer == 'wordnet':
            self.wnl = WordNetLemmatizer()
        if stemmer == 'porter':
            self.wnl = PorterStemmer()
        if stemmer == 'snowball':
            self.wnl = SnowballStemmer('english')
    def __call__(self,doc):
        if self.stemmer == 'wordnet':
            return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        else:
            return [self.wnl.stem(t) for t in word_tokenize(doc)]

### glove embedding

In [9]:
GLOVE_FILE = 'glove.twitter.27B/glove.twitter.27B.200d.txt'
EMBEDDING_DIM = 200 #size of word vector 

embeddings_index = {}
f = open(GLOVE_FILE)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


## Naive Bayes and SVM

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,libsvm,SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.decomposition import TruncatedSVD


from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold,StratifiedKFold

def get_X_y(data):
    return data['text'],data['sentiment'].astype(int)

In [11]:
# create a pipeline

def model_pipeline(X):
    
    global WordTokenizer

    text_vector = Pipeline([('vect', CountVectorizer(tokenizer = WordTokenizer('wordnet'),stop_words = [],ngram_range = (1,2),max_features=10000)),
                    ('tfidf',TfidfTransformer())])
    svd_transform = TruncatedSVD(n_components = 1000,n_iter = 5)
    
    # transform the data
    X = text_vector.fit_transform(X)
    X_reduced = svd_transform.fit_transform(X)
    
    return X,X_reduced

In [12]:
from sklearn.metrics import classification_report,accuracy_score,f1_score

def classifiers_validate(X,X_reduced,y):

    classifier_scores = dict()

    def naive_classifier():
        return 'Naive_Bayes',MultinomialNB()

    def svm_classifier():
        return 'Linear_SVM',SVC(kernel = 'linear')

    classifiers_list = [naive_classifier(),svm_classifier()]

    for clf_name,clf in classifiers_list:
        # dont use reduced matrix for naive bayes
        if clf_name != 'Naive_Bayes':
                X = X_reduced
        classifier_scores[clf_name] = dict()
        classifier_scores[clf_name]['classification_pred'] = cross_val_predict(clf,X,y,cv = 10)
        
    for clf_name,_ in classifiers_list:
        print 'Classifier - {}'.format(clf_name)
        print 'accuracy is {}'.format(accuracy_score(y,classifier_scores[clf_name]['classification_pred']))
        print classification_report(y,classifier_scores[clf_name]['classification_pred'])

In [13]:
def classifier_classify(X,y,clfname = 'NaiveBayes'):
    if clfname == 'NaiveBayes':
        clf = MultinomialNB()
    else:
        clf = SVC(kernel = 'linear',probability=True)
    clf = clf.fit(X,y)
    return clf

def classifier_predict(clf,X):
    return clf.predict_proba(X)  

## GRU

In [14]:
import keras
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM, GRU
from keras.preprocessing.text import Tokenizer
from keras import optimizers
from keras import regularizers

Using Theano backend.


In [15]:
MAX_SEQUENCE_LENGTH = 30 #max number of sentences in a message
MAX_NB_WORDS = 20000 #cap vocabulary
TOKENIZER = 'keras' #or use nltk
STEMMER = 'wordnet'

In [16]:
def get_Ytrue_Ypred(model,x,y):
    #Y matrix is [1,0,0] for class 0, [0,1,0] for class 1, [0,0,1] for class -1
    convert_to_label ={0:0,1:1,2:-1}
    model_predictions = model.predict(x)
    y_pred = np.zeros(len(y))
    y_true = np.zeros(len(y))

    for i in range(len(y)):
        y_pred[i] = convert_to_label[np.argmax(model_predictions[i])]
        y_true[i] = convert_to_label[np.argmax(y[i])]

    return y_true,y_pred

In [17]:
class weighted_categorical_crossentropy(object):
    """
    A weighted version of keras.objectives.categorical_crossentropy
    
    Variables:
        weights: numpy array of shape (C,) where C is the number of classes
    
    Usage:
        loss = weighted_categorical_crossentropy(weights).loss
        model.compile(loss=loss,optimizer='adam')
    """
    
    def __init__(self,weights):
        self.weights = K.variable(weights)
        
    def loss(self,y_true, y_pred):
        # scale preds so that the class probas of each sample sum to 1
        y_pred /= y_pred.sum(axis=-1, keepdims=True)
        # clip
        y_pred = K.clip(y_pred, K.epsilon(), 1)
        # calc
        loss = y_true*K.log(y_pred)*self.weights
        loss =-K.sum(loss,-1)
        return loss

In [18]:
def kerasprocess_data(texts,labels):

    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts) #list of lists, basically replaces each word with number

    tokens = []
  
    myTokenizer = WordTokenizer(STEMMER)
        
    for i in range(0,len(texts)):
        try:
            tokens.append(myTokenizer.__call__(texts[i]))
        except UnicodeDecodeError:
            pass
    word_dict = {}
    winx = 1
    mysequences = []
    tsq = []
    for i in range(0,len(tokens)):
        for token in tokens[i]:
            if token not in word_dict:
                word_dict[token] = winx
                winx += 1
            tsq.append(word_dict[token])
        mysequences.append(tsq)
        tsq = []

    word_index = tokenizer.word_index
    
    if TOKENIZER == 'nltk':
        word_index = word_dict
        sequences = mysequences

    #pad the data 
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    Y = labels
    
    
    #prepare embedding matrix

    num_words = len(word_index)+1
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    
    return data,Y,embedding_matrix

In [19]:
def GRU_validate(data,Y,embedding_matrix,data_name = 'Obama'):

    #k fold cross validaiton
    avg_acc = []
    avg_f1 = []
    f_pos = []
    f_neg = []
    precision_pos = []
    precision_neg = []
    recall_pos = []
    recall_neg = []

    kf = StratifiedKFold(n_splits=2)
    labels = keras.utils.np_utils.to_categorical(Y,nb_classes=3)
    
    
    
    for train,test in kf.split(data,Y): #do the cross validation
        np.random.seed(1)
        x_train, x_val, y_train, y_val = data[train], data[test], labels[train], labels[test]

        if data_name == 'Obama':
            model = obama_build_model(embedding_matrix,len(labels[0]))
        else:
            model = romney_build_model(embedding_matrix,len(labels[0]))
        
        model.fit(x_train, y_train, nb_epoch=15, batch_size=64,verbose=0) #ep = 20 .5979
        y_true,y_pred = get_Ytrue_Ypred(model,x_val,y_val)
        avg_acc.append(accuracy_score(y_true,y_pred))
        avg_f1.append(f1_score(y_true,y_pred,average='macro'))      
        print classification_report(y_true,y_pred)
        precision, recall, fscore, support = score(y_true, y_pred)
        f_pos.append(fscore[2])
        f_neg.append(fscore[0])
        precision_pos.append(precision[2])
        precision_neg.append(precision[0])
        recall_pos.append(recall[2])
        recall_neg.append(recall[0])
    
    #print classification_report(y_true,y_pred)
    print 'Average f1-score = ', np.mean(np.array(avg_f1))
    print 'Overall Accuracy = ',100.0*np.mean(np.array(avg_acc)),'%'
    print 'positive f1-score = ', np.mean(np.array(f_pos))
    print 'negative f1-score = ', np.mean(np.array(f_neg))
    print 'positive precision = ', np.mean(np.array(precision_pos))
    print 'negative precision = ', np.mean(np.array(precision_neg))
    print 'positive recall = ', np.mean(np.array(recall_pos))
    print 'negative recall = ', np.mean(np.array(recall_neg))

In [20]:
# implement this
def GRU_classify(data,labels,embedding_matrix):
    pass

def GRU_predict(clf,data,embedding_matrix):
    pass

# Obama data

In [21]:
def obama_build_model(embedding_matrix,labels_len):
    np.random.seed(1)
    num_words = embedding_matrix.shape[0]
    l2 = regularizers.l2(0.01)
    l22 = regularizers.l2(0.01)
    model = Sequential()
    embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=0)
    model.add(embedding_layer)
    model.add(GRU(10,return_sequences=False,dropout_W=0.6,dropout_U=0.5))
    weights = np.array([1,2,1]) #index 0 for class 0, index 1 for class 1, index 2 for class -1
    mloss = weighted_categorical_crossentropy(weights).loss
    sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.8, nesterov=True)
    model.add(Dense(labels_len, activation='softmax'))
    model.compile(loss=mloss, optimizer='rmsprop')
    
    return model

In [22]:
# Naive Bayes and Linear SVM
X,y = get_X_y(obama_data)
X,X_reduced = model_pipeline(X)
classifiers_validate(X,X_reduced,y)

Classifier - Naive_Bayes
accuracy is 0.592761835131
             precision    recall  f1-score   support

         -1       0.56      0.72      0.63      1922
          0       0.57      0.49      0.53      1896
          1       0.68      0.56      0.61      1653

avg / total       0.60      0.59      0.59      5471

Classifier - Linear_SVM
accuracy is 0.583805520015
             precision    recall  f1-score   support

         -1       0.58      0.66      0.62      1922
          0       0.54      0.55      0.54      1896
          1       0.66      0.54      0.59      1653

avg / total       0.59      0.58      0.58      5471



In [24]:
# GRU
texts = obama_data['text']
labels = np.array(obama_data['sentiment'])

data,labels,embedding_matrix = kerasprocess_data(texts,labels)
GRU_validate(data,labels,embedding_matrix,data_name = 'Obama')

             precision    recall  f1-score   support

       -1.0       0.53      0.47      0.50       961
        0.0       0.47      0.22      0.30       948
        1.0       0.42      0.72      0.53       827

avg / total       0.48      0.46      0.44      2736

             precision    recall  f1-score   support

       -1.0       0.60      0.38      0.46       961
        0.0       0.57      0.16      0.25       948
        1.0       0.39      0.87      0.53       826

avg / total       0.52      0.45      0.41      2735

Average f1-score =  0.429889234747
Overall Accuracy =  45.6039922704 %
positive f1-score =  0.531779361716
negative f1-score =  0.480238317736
positive precision =  0.401476947068
negative precision =  0.564364219441
positive recall =  0.796172460335
negative recall =  0.422996878252


# Romney data

In [None]:
# change the hyperparameters
def romney_build_model(embedding_matrix,labels_len):
    np.random.seed(1)
    num_words = embedding_matrix.shape[0]
    l2 = regularizers.l2(0.01)
    l22 = regularizers.l2(0.01)
    model = Sequential()
    embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=0)
    model.add(embedding_layer)
    model.add(GRU(10,return_sequences=False,dropout_W=0.6,dropout_U=0.5))
    weights = np.array([1,2,1]) #index 0 for class 0, index 1 for class 1, index 2 for class -1
    mloss = weighted_categorical_crossentropy(weights).loss
    sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.8, nesterov=True)
    model.add(Dense(labels_len, activation='softmax'))
    model.compile(loss=mloss, optimizer='rmsprop')
    
    return model

In [None]:
# Naive bayes and SVM
X,y = get_X_y(romney_data)
X,X_reduced = model_pipeline(X)
classifiers_validate(X,X_reduced,y)

In [None]:
# GRU
texts = romney_data['text']
labels = np.array(romney_data['sentiment'])

data,labels,embedding_matrix = kerasprocess_data(texts,labels)
GRU_validate(data,labels,embedding_matrix,data_name = 'Romney')