In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
import re
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neural_network import MLPClassifier as cnn

In [4]:
#Object to save words and their embeddings
class WordEmbedding:
    
    def __init__(self, n): #initialize empty, with a dimension size variable
        self.dimensions = n
        self.wordDict = {}
        
    def __init__(self, fileLocation): #initialize from file
        self.wordDict = {}
        with open(fileLocation, encoding="utf-8") as f:
            word_n, self.dimensions = [int(x) for x in f.readline().rstrip().split(" ")]
            for line in f:
                inputWord = line.rstrip().split(" ")
                floatArr = [float(x) for x in inputWord[1:]]
                self.wordDict[inputWord[0]] = np.array(floatArr)
        
    def addWord(self, word, vector): #vector must be Numpy float array
        if len(vector) == self.dimensions and word not in self.wordDict:
            self.wordDict[word] = vector
        else:
            return False #turn into a real error message
        
    def getWordVector(self, word):
        if word in self.wordDict:
            return self.wordDict[word]
        else:
            return False #make a real error message
    
    def cosine_similarity(self, v_1, v_2):
        upper = np.dot(v_1, v_2)
        lower = math.sqrt(np.dot(v_1,v_1)) * math.sqrt(np.dot(v_2,v_2))
        sim = upper / lower
        return sim
    
    def wordSim(self, word1, word2):
        return self.cosine_similarity(self.wordDict[word1],self.wordDict[word2])
    
    #subclass
    class OrderedListTuple:
        def __init__(self, max_size):
            self.content = []
            self.max_size = max_size

        def get (self, LIST, index):
            return LIST[index]
    
        def get_value(self, el):
            return el[1]

        def find_pos (self, element):
            index = 0
            while (index <= len(self.content)-1) and self.get_value(self.get(self.content, index)) > self.get_value(element):
                index += 1
            return index

        def insert_element (self, element):
            pos = self.find_pos (element)
            self.content.insert (pos, element)
            if len(self.content) > self.max_size:
                self.content.pop()
                
    def mostSimilar(self, word, listSize=30):
        outputList = self.OrderedListTuple(listSize)
        v1 = self.wordDict[word]
        for w in self.wordDict:
            if w != word:
                v2 = self.wordDict[w]
                sim = self.cosine_similarity(v1,v2)
                newTuple = (w,sim)
                outputList.insert_element(newTuple)

        return outputList.content
    
    def embedAlgebra(self, w1,w2,w3, n=1):
        searchVector = self.wordDict[w1] + self.wordDict[w2] - self.wordDict[w3]
        
        outputList = self.OrderedListTuple(n)
        for w in self.wordDict:
            v = self.wordDict[w]
            sim = self.cosine_similarity(searchVector,v)
            newTuple = (w,sim)
            outputList.insert_element(newTuple)

        return outputList.content
    
embeddings = WordEmbedding("wiki.en.vec.short")

In [5]:
data_path1 = '../dataset/Tweets-airline-sentiment.csv'
#data_path2 = '../dataset/labeledTrainData_head12000.tsv'

In [6]:
data = pd.read_csv(data_path1)

In [7]:
#data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [8]:
text = data['text']

In [9]:
label = data['airline_sentiment']

In [10]:
label_tags = label.unique()
print(label_tags)

['neutral' 'positive' 'negative']


In [11]:
#replace text label with one-hot-labels
new_label = []

In [12]:
for l in label:
    if l == label_tags[0]:
        new_label.append(0)
    elif l == label_tags[1]:
        new_label.append(1)
    else:
        new_label.append(2)

In [13]:
new_label = np.asarray(new_label)

In [14]:
#get rid of '@airline_company_name
new_text = []

In [15]:
for t in text:
    new_text.append(re.sub('^@\w+ *','', t))

In [16]:
new_text = np.asarray(new_text)

In [17]:
new_text.shape, new_label.shape

((14640,), (14640,))

# Word Embeds Data

In [57]:
max_tweet_len = 5

In [58]:
embed_text = []

for t in new_text:
    words = t.split()
    embeds = []
    
    for w in words:
        w = w.casefold()
        #print(w)
        w = w.strip(",.:;_-@#!")
        #print(w)
        if w in embeddings.wordDict:
            embeds.append(embeddings.getWordVector(w))
            
    vec_embed = np.asarray(embeds)
    
    #print(t)
    
    if vec_embed.shape[0] > max_tweet_len:
        vec_embed = vec_embed[:max_tweet_len, :]
    else:
        #print(vec_embed.shape)
        temp_vec = np.zeros((max_tweet_len, 300))
        if vec_embed.shape[0] > 0:
            temp_vec[max_tweet_len - vec_embed.shape[0]:, :] = vec_embed[:,:]
        vec_embed = temp_vec
    
    embed_text.append(vec_embed)

embed_text = np.asarray(embed_text)

print(embed_text.shape)

(14640, 5, 300)


In [59]:
flat_embeds = np.reshape(embed_text, (embed_text.shape[0], -1))
print(flat_embeds.shape)

(14640, 1500)


# model

In [60]:
NB = MultinomialNB()
pc = Perceptron()
svm = LinearSVC()
lr = LogisticRegression()
random_forest  = rf()
KNN = knn(n_neighbors=3)
CNN = cnn()

# Word Embeds

In [61]:
skf = StratifiedKFold(n_splits=5)

In [62]:
for clf in [pc, svm, lr]:
    acc = []
    for train_index, test_index in skf.split(flat_embeds, new_label):
        x_train,x_test = flat_embeds[train_index], flat_embeds[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.607359637233
0.678897582854
0.687435602624


In [63]:
for clf in [KNN, CNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(flat_embeds, new_label):
        x_train,x_test = flat_embeds[train_index], flat_embeds[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.630056092732
0.680603856893
0.669607614045


# Unigram

In [17]:
UniVec = CountVectorizer(max_features = 500, ngram_range = (1,1))
uni = UniVec.fit_transform(new_text)

In [18]:
skf = StratifiedKFold(n_splits=5)


In [24]:
for clf in [NB, pc, svm, lr]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7301962457671689




0.7111350376695293
0.7618886777939342
0.7667370754555864


In [27]:
for clf in [KNN, CNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.49044845012281224
0.7411243931760308
0.7334056494059733


# Bigram

In [28]:
BiVec = CountVectorizer(max_features = 500, ngram_range = (2,2))
Bi = BiVec.fit_transform(new_text)
skf = StratifiedKFold(n_splits=5)

In [29]:
for clf in [NB, pc, svm, lr, KNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7301962457671689
0.7111350376695293




0.7618886777939342
0.7667370754555864
0.49044845012281224
0.7298525695649101


In [30]:
for clf in [CNN]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.743923727176688


# Uni&Bigram

In [31]:
MixVec = CountVectorizer(max_features = 500, ngram_range = (1,2))
Mix = BiVec.fit_transform(new_text)
skf = StratifiedKFold(n_splits=5)

In [32]:
for clf in [NB, pc, svm, lr, KNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7301962457671689
0.7111350376695293




0.7618886777939342
0.7667370754555864
0.49044845012281224
0.7326511330773402


In [33]:
for clf in [CNN]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7448794744777792
