In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from collections import defaultdict
from collections import Counter
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.


In [None]:
# Read the preprocessed CSV from the ../input directory into `df`.
df = pd.read_csv(
    "../input/preprocessed-with-emoji/preprocessed_with_emoji.csv"
)

In [None]:
# Plain Python list of the raw comment strings, in row order.
# `.tolist()` walks the column positionally, so it does not depend on the
# DataFrame index being exactly 0..n-1 the way `df.Comment[i]` does.
comment = df["Comment"].tolist()

In [None]:
def load_data(max_size, min_occurrences=10, lines=None):
    """
    Convert whitespace-tokenised sentences into the numpy arrays
    used by the autoencoder.

    :param max_size: currently unused -- the length filter that consumed it
        is disabled, so every sentence is kept. It stays in the signature
        so existing calls keep working.
    :param min_occurrences: tokens counted fewer times than this are
        replaced by the ``<unk>`` token.
    :param lines: optional iterable of strings, one sentence each. When
        omitted, the module-level ``comment`` list is used.
    :return: a tuple (sentences, sizes, vocabulary).
        sentences is a 2-d int32 matrix (one row per sentence), right-padded
            with the index of ``</s>``. If there are fewer than
            ``min_occurrences`` sentences, ``</s>`` itself falls below the
            threshold and the padding is stored as ``<unk>``.
        sizes is a 1-d int32 array with each sentence size
        vocabulary is a list of words positioned according to their indices
            (most frequent first); it always contains ``<unk>``.
    """
    if lines is None:
        lines = comment

    # Provisional ids are handed out in order of first appearance.
    # 0 is reserved for padding/EOS, so real tokens start at 1.
    word_dict = defaultdict(lambda: len(word_dict) + 1)

    sentences = [[word_dict[token] for token in line.split()]
                 for line in lines]
    sizes = [len(sent) for sent in sentences]
    longest_sent_size = max(sizes, default=0)

    reverse_word_dict = {v: k for k, v in word_dict.items()}
    reverse_word_dict[0] = '</s>'

    # the matrix can be allocated now that the number of sentences and
    # the longest sentence are known
    sentence_matrix = np.zeros((len(sentences), longest_sent_size),
                               np.int32)
    for i, sent in enumerate(sentences):
        sentence_matrix[i, :sizes[i]] = sent

    # counter: provisional index -> num_occurrences
    counter = Counter(sentence_matrix.ravel().tolist())

    # 0 signs the EOS token, it should be counted once per sentence
    counter[0] = len(sentence_matrix)

    # every token below the threshold is folded into a single unk token
    unk_count = sum(count for count in counter.values()
                    if count < min_occurrences)
    unk_index = len(counter)   # make the unknown index the last one
    counter[unk_index] = unk_count
    reverse_word_dict[unk_index] = '<unk>'

    # now we sort word indices by frequency (this works better with some
    # sampling techniques such as Noise Contrastive Estimation)
    replacements = {}
    word_list = []
    for new_index, (old_index, count) in enumerate(counter.most_common()):
        if count < min_occurrences:
            # we can break the loop because the next ones
            # have equal or lower counts
            break

        replacements[old_index] = new_index
        word_list.append(reverse_word_dict[old_index])

    # When the rare tokens together occur fewer than min_occurrences times
    # (e.g. there are none), the loop above stops before reaching <unk>.
    # Give it the next free index so the lookup below cannot fail.
    if unk_index not in replacements:
        replacements[unk_index] = len(word_list)
        word_list.append(reverse_word_dict[unk_index])
    new_unk_index = replacements[unk_index]

    # Vectorised remap: provisional ids are 0..unk_index, anything without
    # an explicit replacement falls back to the unk index.
    lookup = np.full(unk_index + 1, new_unk_index, dtype=np.int32)
    for old_index, new_index in replacements.items():
        lookup[old_index] = new_index
    sentence_matrix = lookup[sentence_matrix]

    sizes_array = np.array(sizes, dtype=np.int32)
    return sentence_matrix, sizes_array, word_list

# x: index matrix, y: per-sentence lengths, z: vocabulary list
x, y, z = load_data(max_size=50, min_occurrences=10)

In [None]:
from keras.preprocessing.text import Tokenizer

# Word-level tokenizer fitted on every comment in the dataset.
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(df.Comment)
print(tokenizer.word_index)

# One list of integer word ids per comment, in the same order as `comment`.
# texts_to_sequences accepts the whole list at once, so there is no need to
# wrap each text in a one-element list and unwrap the result afterwards.
tweets_tokens = tokenizer.texts_to_sequences(comment)

len(tweets_tokens)

In [None]:
from numpy import array, argmax
from keras.utils import to_categorical

# Class labels taken from the Agg_Level column, as a numpy array.
data = array(df["Agg_Level"])
print(data)

# One-hot encode the labels (one row per sample).
encoded = to_categorical(data)
print(encoded)

# Decode the first row back to its class index, then display that row.
inverted = argmax(encoded[0])
encoded[0]


In [None]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GlobalAveragePooling1D
from keras.datasets import imdb


max_features = 23909
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 23909
batch_size = 500
print('Loading data...')
x_train, y_train, x_test, y_test = tweets_tokens[:10000], encoded[:10000], x[10000:], encoded[10000:]
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(23909, 128, input_length=maxlen))
model.add(GlobalAveragePooling1D())
# model.add(Dropout(0.5))
model.add(Dense(batch_size*2, activation='tanh'))
model.add(Dense(batch_size*2, activation='tanh'))
model.add(Dense(batch_size, activation='tanh'))
model.add(Dense(batch_size, activation='tanh'))
model.add(Dense(batch_size, activation='relu'))
model.add(Dense(3, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=[x_test, y_test])

In [None]:
# words1=[]
# import pip
# #pip.main(['install', '--user', 'tweet-preprocessor'])
# import sys
# #!{sys.executable} -m pip install gensim
# import numpy as np
# def loadGloveModel(gloveFile):
#     print ("Loading Glove Model")
#     f = open(gloveFile,'r',encoding="utf8")
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         words1.append(word)
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print ("Done.",len(model)," words loaded!")
#     return model
# model1=loadGloveModel("glove.twitter.27B.200d.txt")

In [None]:
# import numpy as np
# import pandas as pd
# from collections import defaultdict
# import re

# from bs4 import BeautifulSoup

# import sys
# import os

# os.environ['KERAS_BACKEND']='theano'

# from keras.preprocessing import sequence
# from keras.utils.np_utils import to_categorical

# from keras.layers import Embedding
# from keras.layers import Dense, Input, Flatten
# from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
# from keras.models import Model

# from keras import backend as K
# from keras.engine.topology import Layer, InputSpec
# from keras import initializers

# # cut texts after this number of words
# # (among top max_features most common words)
# maxlen = 21206
# batch_size = 32
# EMBEDDING_DIM=21206

# embedding_layer = Embedding(21206,
#                             200,
#                             weights=[embedding_matrix],
#                             input_length=maxlen,
#                             trainable=True)

# print('Loading data...')
# x_train, y_train, x_test, y_test = x[:10000], df["Agg_Level"][:10000], x[10000:], df["Agg_Level"][10000:]
# print(len(x_train), 'train sequences')
# print(len(x_test), 'test sequences')

# print('Pad sequences (samples x time)')
# x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
# x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
# print('x_train shape:', x_train.shape)
# print('x_test shape:', x_test.shape)
# y_train = np.array(y_train)
# y_test = np.array(y_test)

# embedding_matrix = np.zeros((21207, 200))
# for  index, word in enumerate(z):
#     if index > 21205:
#         break
#     else:
#         if word in df.Comment:
#             embedding_vector = model1[word]
#             embedding_matrix[index] = embedding_vector
            
# print(embedding_matrix)      

# class AttLayer(Layer):
#     def __init__(self, **kwargs):
#         self.init = initializers.get('normal')
#         #self.input_spec = [InputSpec(ndim=3)]
#         super(AttLayer, self).__init__(**kwargs)

#     def build(self, input_shape):
#         assert len(input_shape)==3
#         #self.W = self.init((input_shape[-1],1))
#         self.W = self.init((input_shape[-1],))
#         #self.input_spec = [InputSpec(shape=input_shape)]
#         self.trainable_weights = [self.W]
#         super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

#     def call(self, x, mask=None):
#         eij = K.tanh(K.dot(x, self.W))
        
#         ai = K.exp(eij)
#         weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
#         weighted_input = x*weights.dimshuffle(0,1,'x')
#         return weighted_input.sum(axis=1)

#     def get_output_shape_for(self, input_shape):
#         return (input_shape[0], input_shape[-1])

# sentence_input = Input(shape=(maxlen,))
# embedded_sequences = embedding_layer(sentence_input)
# l_lstm = Bidirectional(GRU(100, return_sequences=False))(embedded_sequences)
# #l_dense = TimeDistributed(Dense(100))(l_lstm)
# l_att = AttLayer()(l_dense)
# sentEncoder = Model(sentence_input, l_att)

# review_input = Input(shape=(maxlen), dtype='int32')
# #review_encoder = TimeDistributed(sentEncoder)(review_input)
# l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_input)
# l_dense_sent = TimeDistributed(Dense(100))(l_lstm_sent)
# l_att_sent = AttLayer()(l_dense_sent)
# preds = Dense(1, activation='softmax')(l_att_sent)
# model = Model(review_input, preds)

# model.compile(loss='categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['acc'])

# print("model fitting - Hierachical attention network")
# model.fit(x_train, y_train, validation_data=(x_val, y_val),
#           nb_epoch=10, batch_size=50)

In [None]:
import urllib.request  # Python 3 home of what used to be urllib2

# Fetch only the beginning of the GloVe Twitter download and print it.
# NOTE(review): the URL ends in .zip, so these bytes are presumably
# compressed archive data rather than readable text -- confirm this is
# really what should be printed.
with urllib.request.urlopen("https://nlp.stanford.edu/projects/glove/glove.twitter.27B.zip") as response:
    head = response.read(20000)  # read only the first 20 000 bytes

# latin-1 maps every byte to exactly one character, so decoding cannot fail
data = head.decode("latin-1").split("\n")  # then split it into lines

for line in data:
    print(line)