In [9]:
import numpy as np
import pandas as pd
import re
np.random.seed(13)

from sklearn.feature_extraction.text import CountVectorizer

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.image import ImageDataGenerator
import gensim


In [4]:
# CSV file that is handed to read_data() as data_path; a relative path, so it is
# resolved against the kernel's current working directory.
path = 'Tweets-airline-sentiment.csv'

In [5]:
def read_data(data_path, feature = 'Unigram', max_feature_num = 500):
    """Load the tweet CSV and build n-gram count features with one-hot labels.

    Parameters
    ----------
    data_path : str
        CSV file containing a 'text' column and an 'airline_sentiment' column.
    feature : str
        'Unigram', 'Bigram' or 'Trigram'. Any other value mixes unigrams
        with bigrams.
    max_feature_num : int
        Forwarded to CountVectorizer as ``max_features``.

    Returns
    -------
    out : scipy sparse matrix
        Result of ``CountVectorizer.fit_transform`` on the cleaned tweets.
    new_label : numpy.ndarray
        One row of three 0/1 values per tweet. The first distinct label met in
        the file maps to [0, 0, 1], the second to [0, 1, 0], every other label
        to [1, 0, 0].
    """
    # Read from the argument. The earlier version ignored `data_path` and read
    # the notebook-level global `path` instead.
    data = pd.read_csv(data_path)
    text = data['text']
    label = data['airline_sentiment']

    # Replace the text labels with one-hot rows; the column order follows the
    # order in which labels first appear in the file.
    label_tags = label.unique()

    def one_hot(tag):
        if tag == label_tags[0]:
            return [0, 0, 1]
        if tag == label_tags[1]:
            return [0, 1, 0]
        return [1, 0, 0]

    new_label = np.asarray([one_hot(tag) for tag in label])

    # Drop a leading '@airline_company_name' mention (and the spaces after it).
    leading_mention = re.compile(r'^@\w+ *')
    new_text = [leading_mention.sub('', t) for t in text]

    # One vectorizer call instead of four copy-pasted branches; unknown feature
    # names fall back to the unigram+bigram mix, as before.
    ngram_ranges = {'Unigram': (1, 1), 'Bigram': (2, 2), 'Trigram': (3, 3)}
    Vec = CountVectorizer(max_features = max_feature_num,
                          ngram_range = ngram_ranges.get(feature, (1, 2)))
    out = Vec.fit_transform(new_text)
    return out, new_label

In [10]:
# Each bag-of-words vector is laid out as a (height, width) grid so that it can be
# treated like a one-channel image. 10x50 is an arbitrary factorisation, not a
# meaningful spatial layout.
height, width = 10, 50

# Request exactly height*width features so the reshape below cannot drift out of
# sync with the vectorizer setting.
counts_sparse, label = read_data(data_path = path, feature='Unigram',
                                 max_feature_num=height * width)

counts = counts_sparse.toarray()
if counts.shape[1] != height * width:
    # The vectorizer returns fewer columns when the vocabulary is smaller than
    # max_features; fail with a clear message instead of a reshape error.
    raise ValueError('expected {} features per tweet, got {}'.format(
        height * width, counts.shape[1]))

# One vectorised reshape instead of a Python loop over rows.
text = counts.reshape((-1, height, width))

text.shape[1:]    # per-sample grid shape

(10, 50)

In [12]:
# Training hyper-parameters.
number_classes = 3      # width of the one-hot label rows
batch_size = 32
num_epochs = 1000
patience = 50           # not read by any cell shown here; presumably for early stopping - confirm
verbose = 1

# Per-sample grid shape with a single trailing channel axis appended.
input_shape = (text.shape[1], text.shape[2], 1)

# Image-style augmentation applied to the reshaped count grids.
# NOTE(review): rotations, shifts, zooms and flips move counts between vocabulary
# slots; confirm this is intended for bag-of-words input.
data_generator = ImageDataGenerator(
    featurewise_center=False,
    featurewise_std_normalization=False,
    horizontal_flip=True,
    zoom_range=.1,
    height_shift_range=0.1,
    width_shift_range=0.1,
    rotation_range=10,
)

In [2]:
# NOTE(review): the setup below is commented out, yet later cells read `corpus`,
# `tokenizer`, `V` and `dim`. On a fresh kernel those names are undefined, so the
# CBOW cells only run with state left over from an earlier session. `path` now
# names the tweet CSV, so re-enabling these lines as-is would tokenize raw CSV
# rows; point `open` at the intended text corpus before restoring them.
#corpus = open(path).readlines()[:300]
#corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

#tokenizer = Tokenizer()
#tokenizer.fit_on_texts(corpus)
#corpus = tokenizer.texts_to_sequences(corpus)
#nb_samples = sum(len(s) for s in corpus)
#V = len(tokenizer.word_index) + 1
#dim = 100
# Context radius, not an n-gram order: generate_data takes up to `window_size`
# tokens on each side of the target word, i.e. at most 2*window_size context words.
window_size = 2

In [3]:
def generate_data(corpus, window_size, V):
    """Yield one (context, target) training pair per token in the corpus.

    Parameters
    ----------
    corpus : iterable of sequences
        Each item is an indexable sequence of token ids.
    window_size : int
        Number of neighbours taken on each side of the target token.
    V : int
        Number of classes passed to ``to_categorical`` for the target.

    Yields
    ------
    (x, y) : tuple
        ``x`` is the context padded by ``pad_sequences`` to ``2 * window_size``
        entries (a batch of one); ``y`` is the categorical encoding of the target.
    """
    context_width = window_size * 2
    for sentence in corpus:
        n_tokens = len(sentence)
        for centre, target in enumerate(sentence):
            # Clip the window to the sentence bounds, then leave out the target itself.
            first = max(centre - window_size, 0)
            stop = min(centre + window_size + 1, n_tokens)
            neighbours = [sentence[j] for j in range(first, stop) if j != centre]

            x = sequence.pad_sequences([neighbours], maxlen=context_width)
            y = np_utils.to_categorical([target], V)
            yield (x, y)

In [4]:
# CBOW network: embed the context token ids, average the embeddings over the
# context axis, then predict the target word with a softmax over V classes.
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda embedded: K.mean(embedded, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])

In [5]:
# Targets are one-hot rows, hence categorical cross-entropy.
cbow.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
)

In [6]:
# Ten passes over the corpus. Every generated pair is trained as its own batch
# and the per-batch losses are summed for each pass.
for epoch in range(10):
    running_loss = 0.
    pairs = generate_data(corpus, window_size, V)
    for context_batch, target_batch in pairs:
        running_loss += cbow.train_on_batch(context_batch, target_batch)

    print(epoch, running_loss)

0 17897.1324615
1 16611.4819975
2 16054.5737875
3 15975.4586413
4 16078.7989137
5 16168.2691295
6 16193.6849765
7 16191.0186955
8 16183.2132714
9 16174.5272493


In [7]:
# Start the word2vec text file. The encoding is pinned to UTF-8 so that the file
# does not depend on the platform default and matches what gensim reads by default.
# The handle stays open on purpose: the next cell writes the vectors and closes it.
f = open('vectors.txt', 'w', encoding='utf-8')
# Header line: "<number of words> <vector size>". V - 1 is used as the word count
# (presumably V reserves one extra index beyond the tokenizer's words - confirm).
f.write('{} {}\n'.format(V-1, dim))

8

In [8]:
# First weight array of the model, i.e. the Embedding layer's matrix; row i is
# the vector of the token whose index is i.
vectors = cbow.get_weights()[0]
try:
    for word, i in tokenizer.word_index.items():
        # One line per word: "<word> <v1> <v2> ...".
        str_vec = ' '.join(map(str, vectors[i]))
        f.write('{} {}\n'.format(word, str_vec))
finally:
    # Release the handle opened in the previous cell even if a write fails,
    # so the file is not left open and partially buffered.
    f.close()

In [9]:
# Load the text-format vectors written above back in as gensim KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [10]:
# Sanity check: words whose learned vectors are closest to 'the'.
w2v.most_similar(positive=['the'])

[('a', 0.8029883503913879),
 ('her', 0.6420163512229919),
 ('i', 0.6390146613121033),
 ('it', 0.6209530830383301),
 ('this', 0.6207232475280762),
 ('my', 0.6043308973312378),
 ('you', 0.5718163847923279),
 ('to', 0.5632857084274292),
 ('about', 0.5545670986175537),
 ('down', 0.5478866696357727)]

In [11]:
# Words closest to 'alice'.
# NOTE(review): this only works if 'alice' is in the trained vocabulary; the saved
# output suggests a different corpus than the tweet CSV named in `path` - confirm.
w2v.most_similar(positive=['alice'])

[('said', 0.522140383720398),
 ('she', 0.4846324920654297),
 ('thought', 0.4466657042503357),
 ('you', 0.4399128556251526),
 ('say', 0.4043766260147095),
 ('now', 0.40042510628700256),
 ('her', 0.3967324495315552),
 ('that', 0.38552236557006836),
 ('because', 0.37965667247772217),
 ('dark', 0.3769078254699707)]