In [1]:
import sys,os,string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
pd.set_option('display.max_colwidth', 300)



In [2]:
def get_data(data,mode = 'train'):
    """ get and clean the data """
    data = data.iloc[1:]
    data['text'] = data['text'].values.astype('unicode')
    # remove rows with mixed sentiment
    data = data[data['sentiment'] < 2]
    data.index = range(len(data))
    
    return data

### Emoticon Converter

In [3]:
import re

emoticon_dictionary = {':)':' smileyface ','(:':' smileyface ','XD': ' happyface ',':D': ' smileyface ','>.<':' smileyface ',':-)':' smileyface ',';)':' winkface ',';D':' winkface ',':\'(':' cryingface '}

emoticons = [':\)','\(:','XD',':D','>\.<',':-\)',';\)',';D',':\'\(']

emoticon_pattern = re.compile(r'(' + '\s*|\s*'.join(emoticons) + r')')

# convert emoticons to words
def emoticon_converter(x):
    x = emoticon_pattern.sub(lambda i : emoticon_dictionary[i.group().replace(' ','')],x)   
    return x


### Hashtag Separator

In [4]:
from hashTagSplit import *

def separate_hashtag(x):
    x = x.split()
    temp = []
    for i,word in enumerate(x):
        if '#' in word:
            if any(w.isupper() for w in word):
                temp += re.findall('[A-Z][^A-Z]*',word)
            else:
                if len(word) > 1:
                    temp += [split_hashtag(word[1:])]
        else:
            temp.append(word)
    
    return ' '.join(temp)

### Clean data

In [5]:
# remove punctuations
punc = ['\:','\;','\?','\$','\.','\(','\)','\=','\%','\-','\>','\<','\,','\"','\\','\&','\+']
cond_1 = re.compile('|'.join(punc))
# remove tags
tags = ['<a>','</a>','<e>','</e>']
cond_2 = re.compile("|".join(tags))

def preprocess(data):
    """ preprocess the data"""
     # remove users
    data = data.apply(lambda x : re.sub(r'\@\s?\w+','',x))
    # remove hypertext 
    data = data.apply(lambda x : re.sub(r'http://\S+','',x))
    # remove tags
    data = data.apply(lambda x : re.sub(cond_2,'',x))
    # remove punctuations
    data = data.apply(lambda x : re.sub(cond_1,'',x))
    # remove digits
    data = data.apply(lambda x : re.sub(r'[0-9]+','',x))
    # convert to ascii
    data = data.apply(lambda x: x.encode('utf-8'))
    printable = set(string.printable)
    for i in range(len(data)):
        data[i] = filter(lambda x: x in printable, data[i])
    
    return data

### Stopwords Removal

In [6]:
import nltk

from nltk.corpus import stopwords

manual_stopwords_list = ['RT','MT']
stopwords_list = stopwords.words('english') + manual_stopwords_list


# stopwords list based on pos tags
remove_tags_nltkpos = ['IN','DT','PRP','CC']


def pos_tag_filter(x):
    x = x.split()
    s = nltk.pos_tag(x)
    for i,(_,tag) in enumerate(s):
        if tag in remove_tags_nltkpos:
            x[i] = ''
    return ' '.join(x)

### Tokenizer

In [7]:
# stemming
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

class WordTokenizer(object):
    def __init__(self,stemmer='porter'):
        self.stemmer = stemmer
        if stemmer == 'wordnet':
            self.wnl = WordNetLemmatizer()
        if stemmer == 'porter':
            self.wnl = PorterStemmer()
        if stemmer == 'snowball':
            self.wnl = SnowballStemmer('english')
    def __call__(self,doc):
        if self.stemmer == 'wordnet':
            return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        else:
            return [self.wnl.stem(t) for t in word_tokenize(doc)]

### glove embedding

In [8]:
GLOVE_FILE = 'glove.twitter.27B/glove.twitter.27B.200d.txt'
EMBEDDING_DIM = 200 #size of word vector 

embeddings_index = {}
f = open(GLOVE_FILE)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


## Naive Bayes and SVM

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,libsvm,SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.decomposition import TruncatedSVD


from sklearn.metrics import precision_recall_fscore_support as score,classification_report,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold

def get_X_y(data):
    return data['text'],data['sentiment'].astype(int)

In [10]:
# create a pipeline

def model_pipeline(X,WordTokenizer,text_vector = None, svd_transform = None,mode = 'train'):

    if mode == 'train':
        text_vector = Pipeline([('vect', CountVectorizer(tokenizer = WordTokenizer('wordnet'),stop_words = [],ngram_range = (1,2),max_features=10000)),
                    ('tfidf',TfidfTransformer())])
        svd_transform = TruncatedSVD(n_components = 1000,n_iter = 5)
        # transform the data
        X = text_vector.fit_transform(X)
        X_reduced = svd_transform.fit_transform(X)
        return X,X_reduced,text_vector,svd_transform
    else:
        X = text_vector.transform(X)
        X_reduced = svd_transform.transform(X)
        return X,X_reduced  

In [11]:
def classifier_train(X,y,clfname = 'NaiveBayes'):
    if clfname == 'NaiveBayes':
        clf = MultinomialNB()
    else:
        clf = SVC(kernel = 'linear',probability=True)
    clf = clf.fit(X,y)
    return clf

def classifier_predict(clf,X):
    return clf.predict_proba(X)  

## GRU

In [12]:
import keras
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM, GRU
from keras.preprocessing.text import Tokenizer
from keras import optimizers
from keras import regularizers

Using Theano backend.


In [13]:
MAX_SEQUENCE_LENGTH = 30 #max number of sentences in a message
MAX_NB_WORDS = 20000 #cap vocabulary
TOKENIZER = 'keras' #or use nltk
STEMMER = 'wordnet'

In [14]:
def get_Ytrue_Ypred(model,x,y):
    #Y matrix is [1,0,0] for class 0, [0,1,0] for class 1, [0,0,1] for class -1
    convert_to_label ={0:0,1:1,2:-1}
    model_predictions = model.predict(x)
    y_pred = np.zeros(len(y))
    y_true = np.zeros(len(y))

    for i in range(len(y)):
        y_pred[i] = convert_to_label[np.argmax(model_predictions[i])]
        y_true[i] = convert_to_label[np.argmax(y[i])]

    return y_true,y_pred

In [15]:
class weighted_categorical_crossentropy(object):
    """
    A weighted version of keras.objectives.categorical_crossentropy
    
    Variables:
        weights: numpy array of shape (C,) where C is the number of classes
    
    Usage:
        loss = weighted_categorical_crossentropy(weights).loss
        model.compile(loss=loss,optimizer='adam')
    """
    
    def __init__(self,weights):
        self.weights = K.variable(weights)
        
    def loss(self,y_true, y_pred):
        # scale preds so that the class probas of each sample sum to 1
        y_pred /= y_pred.sum(axis=-1, keepdims=True)
        # clip
        y_pred = K.clip(y_pred, K.epsilon(), 1)
        # calc
        loss = y_true*K.log(y_pred)*self.weights
        loss =-K.sum(loss,-1)
        return loss

In [16]:
def kerasprocess_data(texts,labels = None,tokenizer = None,mode = 'train'):
    if mode == 'train':
        tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
        word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(texts) #list of lists, basically replaces each word with number

    #pad the data 
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    if mode == 'train':
        #prepare embedding matrix
        num_words = len(word_index)+1
        embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
    
        return data,labels,embedding_matrix,tokenizer

    return data

In [17]:
def GRU_train(data,labels,embedding_matrix,data_name='Obama'):
    labels = keras.utils.np_utils.to_categorical(labels,nb_classes=3)
    
    if data_name == 'Obama':
        clf = obama_build_model(embedding_matrix,3)
    else:
        clf = romney_build_model(embedding_matrix,3)
    clf.fit(data,labels, nb_epoch=50, batch_size=64,verbose=0)
    return clf
    

def GRU_predict(clf,data):
    predict_probs = clf.predict(data)
    # keras predicts probabilites on 0,1,-1 should be -1,0,1
    predict_probs[:,[0,1,2]] = predict_probs[:,[2,0,1]]
    return predict_probs

# Obama data

In [18]:
def obama_build_model(embedding_matrix,labels_len):
    np.random.seed(1)
    num_words = embedding_matrix.shape[0]
    l2 = regularizers.l2(0.01)
    l22 = regularizers.l2(0.01)
    model = Sequential()
    embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=0)
    model.add(embedding_layer)
    model.add(GRU(100,return_sequences=False,dropout_W=0.6,dropout_U=0.5))
    weights = np.array([1,2,1]) #index 0 for class 0, index 1 for class 1, index 2 for class -1
    mloss = weighted_categorical_crossentropy(weights).loss
    sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.8, nesterov=True)
    model.add(Dense(labels_len, activation='softmax'))
    model.compile(loss=mloss, optimizer='rmsprop')
    
    return model

# Romney data

In [19]:
# change the hyperparameters
def romney_build_model(embedding_matrix,labels_len):
    np.random.seed(1)
    num_words = embedding_matrix.shape[0]
    l2 = regularizers.l2(0.01)
    l22 = regularizers.l2(0.01)
    model = Sequential()
    embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=0)
    model.add(embedding_layer)
    model.add(GRU(100,return_sequences=False,dropout_W=0.6,dropout_U=0.5))
    weights = np.array([1,2,1]) #index 0 for class 0, index 1 for class 1, index 2 for class -1
    mloss = weighted_categorical_crossentropy(weights).loss
    sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.8, nesterov=True)
    model.add(Dense(labels_len, activation='softmax'))
    model.compile(loss=mloss, optimizer='rmsprop')
    
    return model

# Final prediction

In [20]:
def obama_fullcommonpipeline(filename,mode = 'train'):
    if mode == 'train':
        obama_data = pd.read_excel(filename,names = ['date','time','text','sentiment'],parse_cols = 4,sheetname = 'Obama')
    else:
        obama_data = pd.read_excel(testfilename,sheetname = 'Obama')
        obama_data['text'] = obama_data['Anootated tweet']
        obama_data['sentiment'] = obama_data['Unnamed: 4']

    obama_data = get_data(obama_data,mode)
    obama_data['text'] = obama_data['text'].apply(emoticon_converter)
    obama_data['text'] = obama_data['text'].apply(separate_hashtag)
    obama_data['text'] = preprocess(obama_data['text'])
    obama_data['text'] = obama_data['text'].apply(pos_tag_filter)
    obama_data['text'] = obama_data['text'].apply(lambda x : x.lower())
    return obama_data

def obama_fulltrainpipeline(trainfilename):
    obama_data = obama_fullcommonpipeline(trainfilename)
    X,y = get_X_y(obama_data)
    X,X_reduced,text_vector,svd_transform = model_pipeline(X,WordTokenizer)
    bayes_clf = classifier_train(X,y)
    svm_clf = classifier_train(X_reduced,y,clfname = 'LinearSVM')
    
    texts = obama_data['text']
    labels = np.array(obama_data['sentiment'])

    data,labels,embedding_matrix,tokenizer = kerasprocess_data(texts,labels)
    gru_clf = GRU_train(data,labels,embedding_matrix)
    
    bookkeep = dict()
    bookkeep['text_vector'] = text_vector
    bookkeep['svd_transform'] = svd_transform
    bookkeep['tokenizer'] = tokenizer
    
    return bayes_clf,svm_clf,gru_clf,bookkeep

def obama_fullpredictpipeline(trainfilename,testfilename,q = None):
    obama_data = obama_fullcommonpipeline(testfilename,mode = 'test')
    bayes_clf,svm_clf,gru_clf,bookkeep = obama_fulltrainpipeline(trainfilename)
    X = obama_data['text']
    X,X_reduced = model_pipeline(X,WordTokenizer,text_vector = bookkeep['text_vector'],svd_transform = bookkeep['svd_transform'],mode = 'test')
    bayes_pred = classifier_predict(bayes_clf,X)
    svm_pred = classifier_predict(svm_clf,X_reduced)
    
    texts = obama_data['text']

    data = kerasprocess_data(texts,tokenizer = bookkeep['tokenizer'],mode = 'test')
    gru_pred = GRU_predict(gru_clf,data)

    ensemble_pred = (bayes_pred + svm_pred + gru_pred)/3
    
    if q:
        q.put(('obama',np.argmax(ensemble_pred,axis = 1) - 1))
        return

    return np.argmax(ensemble_pred,axis = 1) - 1

In [21]:
def romney_fullcommonpipeline(filename,mode = 'train'):
    if mode == 'train':
        romney_data = pd.read_excel(filename,names = ['date','time','text','sentiment'],parse_cols = 4,sheetname = 'Romney')
    else:
        romney_data = pd.read_excel(testfilename,sheetname = 'Romney')
        romney_data['text'] = romney_data['Anootated tweet']
        romney_data['sentiment'] = romney_data['Unnamed: 4']
    romney_data = get_data(romney_data,mode)
    romney_data['text'] = romney_data['text'].apply(emoticon_converter)
    romney_data['text'] = romney_data['text'].apply(separate_hashtag)
    romney_data['text'] = preprocess(romney_data['text'])
    romney_data['text'] = romney_data['text'].apply(pos_tag_filter)
    romney_data['text'] = romney_data['text'].apply(lambda x : x.lower())
    return romney_data

def romney_fulltrainpipeline(trainfilename):
    romney_data = romney_fullcommonpipeline(trainfilename)
    X,y = get_X_y(romney_data)
    _,X_reduced,text_vector,svd_transform = model_pipeline(X,WordTokenizer)
    svm_clf = classifier_train(X_reduced,y,clfname = 'LinearSVM')
    
    texts = romney_data['text']
    labels = np.array(romney_data['sentiment'])

    data,labels,embedding_matrix,tokenizer = kerasprocess_data(texts,labels)
    gru_clf = GRU_train(data,labels,embedding_matrix,data_name = 'Romney')
    
    bookkeep = dict()
    bookkeep['text_vector'] = text_vector
    bookkeep['svd_transform'] = svd_transform
    bookkeep['tokenizer'] = tokenizer
    
    return svm_clf,gru_clf,bookkeep

def romney_fullpredictpipeline(trainfilename,testfilename,q = None):
    romney_data = romney_fullcommonpipeline(testfilename,mode = 'test')
    svm_clf,gru_clf,bookkeep = romney_fulltrainpipeline(trainfilename)
    X = romney_data['text']
    X,X_reduced = model_pipeline(X,WordTokenizer,text_vector = bookkeep['text_vector'],svd_transform = bookkeep['svd_transform'],mode = 'test')
    svm_pred = classifier_predict(svm_clf,X_reduced)
    
    texts = romney_data['text']

    data = kerasprocess_data(texts,tokenizer = bookkeep['tokenizer'],mode = 'test')
    gru_pred = GRU_predict(gru_clf,data)
    
    ensemble_pred = (gru_pred + svm_pred)/2
    
    if q:
        q.put(('romney',np.argmax(ensemble_pred,axis = 1) - 1))
        return
    
    
    return np.argmax(ensemble_pred,axis = 1) - 1

In [22]:
trainfilename = 'training-Obama-Romney-tweets.xlsx'
testfilename = 'testing-Obama-Romney-tweets.xlsx'

In [23]:
from multiprocessing import Process,Queue

q = Queue()
p1 = Process(target = obama_fullpredictpipeline, args = (trainfilename,testfilename,q))
p1.start()
p2 = Process(target = romney_fullpredictpipeline, args = (trainfilename,testfilename,q))
p2.start()
p1.join()
p2.join()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
pred_val = dict()
while not q.empty():
    i = q.get()
    if i[0] == 'obama':
        pred_val['obama'] = i[1]
    else:
        pred_val['romney'] = i[1]

In [25]:
obama_data = pd.read_excel(testfilename,sheetname = 'Obama')
obama_data['text'] = obama_data['Anootated tweet']
obama_data['sentiment'] = obama_data['Unnamed: 4']
obama_data = get_data(obama_data,mode = 'test')
obama_true_val = obama_data['sentiment'].astype(int).as_matrix()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [26]:
romney_data = pd.read_excel(testfilename,sheetname = 'Romney')
romney_data['text'] = romney_data['Anootated tweet']
romney_data['sentiment'] = romney_data['Unnamed: 4']
romney_data = get_data(romney_data,mode = 'test')
romney_true_val = romney_data['sentiment'].astype(int).as_matrix()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
print 'Obama report'
print 'Overall Accuracy is {}'.format(accuracy_score(obama_true_val,pred_val['obama']))
print classification_report(obama_true_val,pred_val['obama'],digits = 5)

Obama report
Overall Accuracy is 0.594566888775
             precision    recall  f1-score   support

         -1    0.58343   0.72674   0.64725       688
          0    0.58748   0.46843   0.52124       681
          1    0.61887   0.58591   0.60194       582

avg / total    0.59542   0.59457   0.58975      1951



In [28]:
print 'Romney report'
print 'Overall Accuracy is {}'.format(accuracy_score(romney_true_val,pred_val['romney']))
print classification_report(romney_true_val,pred_val['romney'],digits = 5)

Romney report
Overall Accuracy is 0.635789473684
             precision    recall  f1-score   support

         -1    0.66723   0.82917   0.73943       960
          0    0.57803   0.36036   0.44395       555
          1    0.58726   0.55065   0.56836       385

avg / total    0.62497   0.63579   0.61846      1900

