In [10]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [33]:
# Directory holding the competition data and the pre-trained GloVe vectors.
# NOTE: `comp` already ends with a path separator, so file names are appended
# directly; the extra '/' previously added for the embedding file has been dropped.
comp = 'C://Users//tpurohit//Desktop//AV//sentiment//'
EMBEDDING_FILE = f'{comp}glove.6B.50d.txt'
TRAIN_DATA_FILE = f'{comp}train.csv'
TEST_DATA_FILE = f'{comp}test.csv'

In [14]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
embed_size = 50          # dimensionality of each word vector
max_features = 20_000    # vocabulary cap, i.e. number of rows in the embedding matrix
maxlen = 200             # number of tokens kept per tweet

In [15]:
# Load the training and test CSV files into DataFrames, in that order.
train, test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE)
)

In [21]:
# Binary target taken from the "label" column, plus the two class ids.
y = train["label"].values
list_classes = [0, 1]

# Raw tweet text for both splits; missing tweets are replaced by the "_na_" placeholder.
list_sentences_train = train["tweet"].fillna("_na_").values
list_sentences_test = test["tweet"].fillna("_na_").values

In [23]:
# Display the raw training tweets as a sanity check (the repr below is abbreviated with "...").
list_sentences_train

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty', ...,
       'listening to sad songs on a monday morning otw to work is sad  ',
       '@user #sikh #temple vandalised in in #calgary, #wso condemns  act  ',
       'thank you @user for you follow  '], dtype=object)

In [30]:
# Fit the vocabulary on the training tweets only, capped at `max_features` words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

# Convert each tweet into a sequence of integer word ids.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Pad every sequence to `maxlen` ids so the model receives fixed-width input.
X_t, X_te = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

In [149]:
# Display the padded training matrix. The cell previously read `X_t.c`, which
# is not an ndarray attribute and raises AttributeError when re-run.
X_t


array([[    0,     0,     0, ...,    95,  8136,   480],
       [    0,     0,     0, ...,  8138, 16534, 10553],
       [    0,     0,     0, ...,    62,    26,  3422],
       ...,
       [    0,     0,     0, ...,    78,    11,   121],
       [    0,     0,     0, ...,  1650,  1651,   679],
       [    0,     0,     0, ...,     9,     6,   181]])

In [40]:
def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    Parameters
    ----------
    word : str
        First field of the line (the token).
    *arr : str
        Remaining fields, parsed as the float32 components of the vector.

    Returns
    -------
    tuple
        ``(word, numpy.ndarray of dtype float32)``.
    """
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector lookup. The context manager closes the file once it
# has been read, and blank lines are skipped because they would otherwise call
# get_coefs() with no word and raise TypeError.
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split())
        for line in embedding_file
        if line.strip()
    )

In [42]:
# Stack every pre-trained vector into one 2-D array. np.stack expects a real
# sequence; passing the dict view directly is deprecated/rejected by newer
# NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))

# Global mean / std over all vector components; used to draw random vectors
# for words that have no pre-trained embedding.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.020940498, 0.6441043)

In [58]:
word_index = tokenizer.word_index

# Keras' Tokenizer numbers words from 1 (0 is the padding id), so a vocabulary
# of N words needs N + 1 rows. Using len(word_index) alone made the largest
# index fall outside the matrix whenever the vocabulary was smaller than
# `max_features`.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the same mean/std as the pre-trained
# embeddings; rows for words found in the embedding file are overwritten below.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index lies beyond the rows actually allocated.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [64]:
# Display the assembled embedding matrix as a sanity check (the repr below is abbreviated with "...").
embedding_matrix

array([[-0.23778784, -1.21685408,  0.01773516, ..., -0.97288039,
         1.03382583,  0.62368766],
       [ 0.65028   , -0.1691    ,  1.12370002, ...,  0.20633   ,
         0.89665997,  0.30050001],
       [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       ...,
       [ 0.71058437,  0.82690056,  1.29065299, ..., -0.02014223,
        -0.79903963,  0.08973579],
       [ 0.12353   , -0.12846   , -0.2572    , ...,  0.9799    ,
         0.85833001, -0.044509  ],
       [-0.31576861, -0.38353088,  0.23133222, ...,  0.677951  ,
         1.07705488, -0.93719132]])

In [143]:
# Binary tweet classifier: pre-trained embeddings -> bidirectional LSTM ->
# max-pool over time -> small dense head with a sigmoid output.
inp = Input(shape=(maxlen,))

# Size the embedding layer from the weight matrix itself so the layer and its
# initial weights always agree, even when the matrix has fewer than
# `max_features` rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(60, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# return_sequences=True keeps one output per time step; pool them into a single vector.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: one score in (0, 1) per tweet.
x = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [144]:
# Train for three epochs in mini-batches of 20, holding out 10% of the
# training data for validation. The trailing ';' suppresses the cell's return value.
model.fit(
    x=X_t,
    y=y,
    batch_size=20,
    epochs=3,
    validation_split=0.1,
);

Train on 28765 samples, validate on 3197 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [145]:
# Score the padded test tweets with the trained model.
y_test = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)

# Load the sample submission file that sits next to the data files.
sample_submission = pd.read_csv(comp + 'sample_submission.csv')

#sample_submission[list_classes] = y_test
#sample_submission.to_csv('submission.csv', index=False)



In [146]:
# Turn the predicted scores into hard 0/1 labels: scores below 0.5 become 0,
# everything else becomes 1 (same rule as the original element-wise loop).
# The predictions are flattened first: the original code indexed `y_test` row
# by row and then called .tolist() on the 2-D array, producing one-element
# lists, so the CSV label column contained "[0]" / "[1]" instead of 0 / 1.
# `y_test` itself is left untouched so the raw scores stay available.
ytest = np.where(y_test.ravel() < 0.5, 0, 1).astype(int).tolist()

output = pd.DataFrame(data={"id": test["id"], "label": ytest})

# Write the submission next to the input data. csv.QUOTE_NONE is the named
# form of the former magic number 3.
output.to_csv(f"{comp}sample.csv", index=False, quoting=csv.QUOTE_NONE)