In [1]:
# Authentication for loading data from Google Drive
# Import packages
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
from os import path

In [2]:
# Authenticate User
# Sign the Colab user in first; the application-default credentials fetched
# below are then attached to the PyDrive auth object.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# PyDrive client built on the authenticated session.
auth_drive = GoogleDrive(gauth)

In [3]:
# Directory at which Google Drive is mounted; the path constants in the
# following cell are built from it.
DRIVE_PATH = '/content/drive'
drive.mount(DRIVE_PATH)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# Both locations live under the same Drive project folder, so build that
# prefix once and derive the two paths from it.
_PROJECT_ROOT = path.join(DRIVE_PATH, 'My Drive', 'LinkedIn_Articles')

# Folder holding the input files read by this notebook.
DATA_PATH = path.join(_PROJECT_ROOT, 'Datasets', 'Twitter_Real_or_Not')
# Folder reserved for notebook outputs.
OUTPUT_PATH = path.join(_PROJECT_ROOT, 'NLP & EDA')

In [17]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras import layers
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

In [7]:
# Load the train and test splits the same way: read from DATA_PATH and use
# the 'id' column as the index.
df_train, df_test = (
    pd.read_csv(path.join(DATA_PATH, file_name), index_col='id')
    for file_name in ('train_cleaned.csv', 'test_cleaned.csv')
)

In [19]:
# Model / preprocessing hyper-parameters shared by the cells below.
MAXLEN = 100            # length every token sequence is padded/truncated to
VOCAB_SIZE = 50_000     # number of rows in the embedding matrix
VEC_DIMENSIONS = 300    # width of each embedding vector

In [9]:
# Cap the tokenizer at VOCAB_SIZE so texts_to_sequences never emits an index
# that falls outside the VOCAB_SIZE-row embedding layer. Without num_words
# the emitted indices are bounded only by the corpus vocabulary.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df_train['text_cleaned'])

# Convert each cleaned tweet to a list of word indices, then pad/truncate
# every list to MAXLEN so the model receives a rectangular array.
train_sequences = tokenizer.texts_to_sequences(df_train['text_cleaned'])
train_data = pad_sequences(train_sequences, maxlen=MAXLEN)

In [18]:
# Load the GloVe vectors: one row per token, token in column 0 (used as the
# index), vector components in the remaining columns.
# QUOTE_NONE keeps quote characters as ordinary tokens. Default NA parsing is
# disabled so tokens such as "null", "NA" or "nan" stay as string labels
# instead of being converted to NaN (which would drop their vectors).
glove = pd.read_table(
    path.join(DATA_PATH, 'glove.840B.300d.txt'),
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    na_values=None,
    keep_default_na=False,
)

In [20]:
# Start from an all-zero (VOCAB_SIZE x VEC_DIMENSIONS) table; rows that are
# never filled in later stay as zero vectors.
embedding_shape = (VOCAB_SIZE, VEC_DIMENSIONS)
embedding_matrix = np.zeros(shape=embedding_shape)

In [22]:
# Copy the pretrained vector of every known word into its row of the
# embedding matrix. Words missing from GloVe keep their all-zero row.
for word, index in tokenizer.word_index.items():
  # Valid rows are 0 .. shape[0]-1, so an index equal to the row count is
  # already out of range (the previous `>` check let it through and would
  # raise IndexError). Skipping instead of breaking avoids depending on the
  # iteration order of word_index.
  if index >= embedding_matrix.shape[0]:
    continue
  if word in glove.index:
    embedding_matrix[index] = np.asarray(glove.loc[word])

In [23]:
# Display the embedding matrix after the fill loop as a visual sanity check.
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.27204 , -0.06203 , -0.1884  , ...,  0.13015 , -0.18317 ,
         0.1323  ],
       [ 0.043798,  0.024779, -0.20937 , ..., -0.30099 , -0.14584 ,
         0.28188 ],
       ...,
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]])

In [24]:
# Confirm the matrix dimensions (rows = VOCAB_SIZE, columns = VEC_DIMENSIONS).
embedding_matrix.shape

(50000, 300)

In [50]:
# Binary classifier: frozen pretrained embeddings -> four stacked LSTMs ->
# a single sigmoid unit.
nn = Sequential()
# Embedding rows come from embedding_matrix and are not updated in training.
nn.add(Embedding(VOCAB_SIZE, VEC_DIMENSIONS, input_length=MAXLEN, weights=[embedding_matrix], trainable=False))
# Intermediate LSTMs return the full sequence so the next LSTM receives one
# vector per timestep; the last LSTM returns only its final state.
nn.add(layers.LSTM(300, return_sequences=True))
nn.add(layers.LSTM(200, return_sequences=True))
nn.add(layers.LSTM(100, return_sequences=True))
nn.add(layers.LSTM(100))
nn.add(layers.Dense(1, activation='sigmoid'))
# `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
nn.compile(optimizer=Adam(learning_rate=0.001), loss=binary_crossentropy, metrics=['accuracy'])

In [51]:
# Print the layer-by-layer architecture and parameter counts.
nn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
lstm_6 (LSTM)                (None, 100, 300)          721200    
_________________________________________________________________
lstm_7 (LSTM)                (None, 100, 200)          400800    
_________________________________________________________________
lstm_8 (LSTM)                (None, 100, 100)          120400    
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 16,322,901
Trainable params: 1,322,901
Non-trainable params: 15,000,000
__________________________________

In [None]:
# Train on the padded training sequences against the 'target' column,
# holding out 20% of the rows for validation.
nn.fit(
    x=train_data,
    y=df_train['target'],
    epochs=30,
    validation_split=0.2,
)

Epoch 1/30

In [30]:
# Apply the tokenizer fitted on the training text to the test tweets, then
# pad/truncate to the same MAXLEN used for the training data.
test_texts = df_test['text_cleaned']
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(sequences=test_sequences, maxlen=MAXLEN)

In [31]:
# `predict_classes` is deprecated; for a sigmoid output the documented
# replacement is to threshold the predicted probability at 0.5 and cast to
# int32, which yields the same 0/1 labels.
test_target = (nn.predict(test_data) > 0.5).astype("int32")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [44]:
# Show the predicted labels as a flat 1-D array.
test_target.flatten()

array([1, 1, 1, ..., 1, 1, 1], dtype=int32)

In [33]:
# Inspect the test-set index (the 'id' column) that will label each prediction.
df_test.index

Int64Index([    0,     2,     3,     9,    11,    12,    21,    22,    27,
               29,
            ...
            10838, 10845, 10856, 10857, 10858, 10861, 10865, 10868, 10874,
            10875],
           dtype='int64', name='id', length=3263)

In [46]:
# Pair every test id with its predicted label; predictions are flattened to
# 1-D so they line up with the index.
submission = pd.DataFrame(
    dict(
        id=df_test.index,
        target=test_target.flatten(),
    )
)

In [49]:
# Write the submission file without the DataFrame's row index.
# NOTE(review): this writes into DATA_PATH while OUTPUT_PATH is defined but
# unused - confirm which directory is intended.
submission_file = path.join(DATA_PATH, 'submission.csv')
submission.to_csv(path_or_buf=submission_file, index=False)