In [26]:
import numpy as np
import pandas as pd
import re
#np.random.seed(13)

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
import re
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neural_network import MLPClassifier as cnn

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.image import ImageDataGenerator
import gensim

In [91]:
path = '../../dataset/Tweets-airline-sentiment.csv'   # ../ = upper directory

print (u'\U00002764')
print (u'\U0001f44d')
print (u'\U0001f604')
print (u'\U0001f601')
print (u'\U0001f621')

❤
👍
😄
😁
😡


In [93]:
data = pd.read_csv(path)
text = data['text']
label = data['airline_sentiment']
label_tags = label.unique()

#replace text label with one-hot-labels
#new_label= []
#for l in label:
#    if l == label_tags[0]:
#        new_label.append(np.array([0,0,1]))
#    elif l == label_tags[1]:
#        new_label.append(np.array([0,1,0]))
#    else:
#        new_label.append(np.array([1,0,0]))
# above is one-hot-labels, represent labels in matrics, but can not be divided by stratifield kfold

new_text = []
for line in text:
    line = re.sub('[0-9]','', line)
    #line = re.sub('^@\w+ *','', line)  # without [], ^ means match from the start
    line = line.lower()
    new_text.append(re.sub('@\w+ *','', line))    #clean text and get rid of company name
    
new_text = new_text[:500]  # testing purpose
new_label = label[:100]
new_text

['what said.',
 "plus you've added commercials to the experience... tacky.",
 "i didn't today... must mean i need to take another trip!",
 'it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "and it's a really big bad thing about it",
 "seriously would pay $ a flight for seats that didn't have this playing.\r\nit's really the only bad thing about flying va",
 'yes, nearly every time i fly vx this “ear worm” won’t go away :)',
 'really missed a prime opportunity for men without hats parody, there. https://t.co/mwpggrezp',
 "well, i didn't…but now i do! :-d",
 "it was amazing, and arrived an hour early. you're too good to me.",
 'did you know that suicide is the second leading cause of death among teens -',
 'i &lt; pretty graphics. so much better than minimal iconography. :d',
 "this is such a great deal! already thinking about my nd trip to &amp; i haven't even gone on my st trip yet! ;p",
 "i'm flying your #fabulous #se

In [72]:
new_text[20:30]

['why are your first fares in May over three times more than other carriers when all seats are available to select???',
 'I love this graphic. http://t.co/UT5GrRwAaA',
 'I love the hipster innovation. You are a feel good brand.',
 'will you be making BOS&gt;LAS non stop permanently anytime soon?',
 'you guys messed up my seating.. I reserved seating with my friends and you guys gave my seat away ... 😡 I want free internet',
 "status match program.  I applied and it's been three weeks.  Called and emailed with no response.",
 "What happened 2 ur vegan food options?! At least say on ur site so i know I won't be able 2 eat anything for next 6 hrs #fail",
 "do you miss me? Don't worry we'll be together very soon.",
 "amazing to me that we can't get any cold air from the vents. #VX358 #noair #worstflightever #roasted #SFOtoBOS",
 'LAX to EWR - Middle seat on a red eye. Such a noob maneuver. #sendambien #andchexmix']

In [29]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_text)
corpus = tokenizer.texts_to_sequences(new_text)  #Transforms each text in texts in a sequence of integers.
#Only top "num_words" most frequent words will be taken into account.
#Only words known by the tokenizer will be taken into account.
# help(Tokenizer.texts_to_sequences)

#corpus = new_text  # Qian added this line

nb_samples = sum(len(s) for s in corpus)    # each s is a word(integer), this is just counting total words in corpus

# The training phase is by means of the fit_on_texts method and you can see the word index using the word_index property
# http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/
V = len(tokenizer.word_index) + 1    
# help(Tokenizer)

dim = 100   # for output dimension
window_size = 2   # this is create window for CBOW, so it takes order into consideration

In [30]:
#help(Tokenizer)
#tokenizer.word_index

In [31]:
# corpus   # for each line in twitter, it generate vectors or some kind number for each word
# help(Tokenizer)
# tokenizer.word_index
# help(sequence.pad_sequences)

In [33]:
def generate_data(corpus, window_size, V):
    maxlen = window_size*2  # left 2 + right 2 so max 4 words
    for words in corpus:
        L = len(words)    # how many words are in each line in the corpus. 'words' here means 'line'
        #print(words,L) 
        for index, word in enumerate(words):
            contexts = []
            labels   = []            
            s = index - window_size   # start
            e = index + window_size + 1   # end
            # the window size works in a way: it only looks at the left (window size) words and the right (window size) words.
            # here is looks at the left 2 and the right 2 words of a selected index word.
            
            contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
            # above: it looks at all the words from start to end in the selected range, and without look at the index word 
            
            labels.append(word)
            
            x = sequence.pad_sequences(contexts, maxlen=maxlen) # pad sequence to the same length. add 0 to short sentences
            # all sentences has to be the same length otherwise it won't work. 
              
            y = np_utils.to_categorical(labels, V)
            yield (x, y)

In [34]:
cbow = Sequential()  # create a sequence of actions below
cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, activation='softmax'))

In [35]:
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [36]:
for ite in range(10):
    loss = 0.
    for x, y in generate_data(corpus, window_size, V):
        loss += cbow.train_on_batch(x, y)
    print(ite, loss)

0 10165.541665792465
1 9408.21622055769
2 9199.064646676183
3 9210.15296278894
4 9220.577135890722
5 9156.270807668567
6 9075.55371569097
7 9001.695325359702
8 8930.751029491425
9 8862.534367240965


In [37]:
f = open('vectors.txt' ,'w', encoding = 'utf8')
f.write('{} {}\n'.format(V-1, dim))

8

In [38]:
vectors = cbow.get_weights()[0]
for word, i in tokenizer.word_index.items():
    # calculate all word vectors and save them in a text file
    str_vec = ' '.join(map(str, list(vectors[i, :])))
    f.write('{} {}\n'.format(word, str_vec))
f.close()

In [39]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [40]:
w2v.most_similar(positive=['good'])

[('another', 0.531622052192688),
 ('virginamerica', 0.5115504264831543),
 ('asap', 0.49019569158554077),
 ('fly', 0.4668366312980652),
 ('maneuver', 0.4366486370563507),
 ('6', 0.4357474446296692),
 ('flying', 0.4260895550251007),
 ('booking', 0.4212118089199066),
 ('number', 0.41515031456947327),
 ('shown', 0.41311725974082947)]

In [41]:
w2v.most_similar(positive=['bad'])

[('of', 0.7153064608573914),
 ('thing', 0.6794859170913696),
 ('than', 0.6789637804031372),
 ('page', 0.6720508337020874),
 ('about', 0.6446419358253479),
 ('service', 0.6380912065505981),
 ('any', 0.6375299692153931),
 ('and', 0.63398677110672),
 ('ago', 0.6317700147628784),
 ('from', 0.6181586980819702)]

In [42]:
class WordEmbedding:
    
    def __init__(self, n): #initialize empty, with a dimension size variable
        self.dimensions = n
        self.wordDict = {}
        
    def __init__(self, fileLocation): #initialize from file
        self.wordDict = {}
        with open(fileLocation, encoding="utf-8") as f:
            word_n, self.dimensions = [int(x) for x in f.readline().rstrip().split(" ")]
            for line in f:
                inputWord = line.rstrip().split(" ")
                floatArr = [float(x) for x in inputWord[1:]]
                self.wordDict[inputWord[0]] = np.array(floatArr)
        
    def addWord(self, word, vector): #vector must be Numpy float array
        if len(vector) == self.dimensions and word not in self.wordDict:
            self.wordDict[word] = vector
        else:
            return False #turn into a real error message
        
    def getWordVector(self, word):
        if word in self.wordDict:
            return self.wordDict[word]
        else:
            return False #make a real error message
    
    def cosine_similarity(self, v_1, v_2):
        upper = np.dot(v_1, v_2)
        lower = math.sqrt(np.dot(v_1,v_1)) * math.sqrt(np.dot(v_2,v_2))
        sim = upper / lower
        return sim
    
    def wordSim(self, word1, word2):
        return self.cosine_similarity(self.wordDict[word1],self.wordDict[word2])
    
    #subclass
    class OrderedListTuple:
        def __init__(self, max_size):
            self.content = []
            self.max_size = max_size

        def get (self, LIST, index):
            return LIST[index]
    
        def get_value(self, el):
            return el[1]

        def find_pos (self, element):
            index = 0
            while (index <= len(self.content)-1) and self.get_value(self.get(self.content, index)) > self.get_value(element):
                index += 1
            return index

        def insert_element (self, element):
            pos = self.find_pos (element)
            self.content.insert (pos, element)
            if len(self.content) > self.max_size:
                self.content.pop()
                
    def mostSimilar(self, word, listSize=30):
        outputList = self.OrderedListTuple(listSize)
        v1 = self.wordDict[word]
        for w in self.wordDict:
            if w != word:
                v2 = self.wordDict[w]
                sim = self.cosine_similarity(v1,v2)
                newTuple = (w,sim)
                outputList.insert_element(newTuple)
        return outputList.content
    
    def embedAlgebra(self, w1,w2,w3, n=1):
        searchVector = self.wordDict[w1] + self.wordDict[w2] - self.wordDict[w3]
        
        outputList = self.OrderedListTuple(n)
        for w in self.wordDict:
            v = self.wordDict[w]
            sim = self.cosine_similarity(searchVector,v)
            newTuple = (w,sim)
            outputList.insert_element(newTuple)

        return outputList.content
    
embeddings = WordEmbedding("vectors_1000.txt")

In [43]:
max_tweet_len = 20

embed_text = []

for t in new_text:
    words = t.split()
    embeds = []
    
    for w in words:
        w = w.casefold()
        #print(w)
        w = w.strip(",.:;_-@#!")
        #print(w)
        if w in embeddings.wordDict:
            embeds.append(embeddings.getWordVector(w))
            
    vec_embed = np.asarray(embeds)
    
    #print(t)
    
    if vec_embed.shape[0] > max_tweet_len:
        vec_embed = vec_embed[:max_tweet_len, :]
    else:
        #print(vec_embed.shape)
        temp_vec = np.zeros((max_tweet_len, 100))  # output dimention = 100
        
        if vec_embed.shape[0] > 0:
            temp_vec[max_tweet_len - vec_embed.shape[0]:, :] = vec_embed[:,:]
        vec_embed = temp_vec
    
    embed_text.append(vec_embed)

embed_text = np.asarray(embed_text)

print(embed_text.shape)   # this is 3D dimension, need to change to 2d

(100, 20, 100)


In [44]:
flat_embeds = np.reshape(embed_text, (embed_text.shape[0], -1))
print(flat_embeds.shape)  # shape in 2d

(100, 2000)


In [45]:
NB = MultinomialNB()
pc = Perceptron()
svm = LinearSVC()
lr = LogisticRegression()
random_forest  = rf()
KNN = knn(n_neighbors=5)
CNN = cnn()

In [52]:
from sklearn.model_selection import StratifiedKFold as SKF
skf = SKF(n_splits = 5)
X =  flat_embeds
y = label[:100]
for clf in [pc, svm, lr, KNN, CNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(X, y):       
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        acc.append(clf.score(X_test, y_test))
    acc = np.asarray(acc)
    print(clf, acc.mean())

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)
0.4393734335839599
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)




0.43689223057644117
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.4669924812030075
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.38979949874686715
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
0.3957894736842105
RandomForestClassifier(bootstrap=True, class_weight=None, c

In [None]:
UniVec = CountVectorizer(max_features = 100, ngram_range = (1,1))
uni = UniVec.fit_transform(new_text)