In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Flatten, Dense, Conv2D, MaxPooling2D, Dropout, TextVectorization
    
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv',index_col = 0, on_bad_lines = "skip")
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv',index_col = 0, on_bad_lines = "skip")
train_data.head(20)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
8,,,#RockyFire Update => California Hwy. 20 closed...,1
10,,,#flood #disaster Heavy rain causes flash flood...,1
13,,,I'm on top of the hill and I can see a fire in...,1
14,,,There's an emergency evacuation happening now ...,1
15,,,I'm afraid that the tornado is coming to our a...,1


In [3]:
train_x = train_data.iloc[:,:-1]
train_y = train_data.iloc[:, -1]
train_x = train_x.drop(columns=['keyword', 'location'])

test_x = test_data.iloc[:,:-1]
test_x = test_x.drop(columns=['keyword', 'location'])

print(train_x.iloc[1])
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)


text    Forest fire near La Ronge Sask. Canada
Name: 4, dtype: object
(7613, 1)
(7613,)
(3263, 0)


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.33, random_state = 42)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

(5100, 1)
(5100,)
(3263, 0)


In [6]:
# the size of the vocabulary we'll use
vocab_size = 12000
maxlen = 150

def preprocess_twitter(train_x, val_x, test_x, num_words=vocab_size, maxlen=maxlen, vectorize=False):
    
    #np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
    #train, testval = imdb.load_data(num_words=num_words, maxlen=maxlen, oov_char=0)
    #np.warnings.filterwarnings('default', category=np.VisibleDeprecationWarning)   

    ### Process the data
    ### Merge train and testval, but then split again into train, test, val sets (according to prop_vec). You can use utils.train_test_val_split().)
    ### - If vectorize=True, then you must encode the features of each example into vectors of vocab_size entries
    ### such that entry i contains the number of time word i appeared in the sequence
    ### - If vectorize=False, then you must encode the features of each examples as a sequence of size maxlen (represented as a np.array()).
    ### Make sure to pad sequences with 0 as appropriate.
    ###* put your code here (~10-15 lines) *###
    from keras.preprocessing.sequence import pad_sequences
    
    #train_x, train_y = trainval
    #test_x, test_y = test
    
    #all_x = np.concatenate([train_x, test_x])
    #all_y = np.concatenate([train_y, test_y])
    
    
    if vectorize:
        tokenizer = keras.preprocessing.text.Tokenizer(num_words = vocab_size, lower=False, char_level=True)
        train_x = tokenizer.sequences_to_matrix(train_x, mode = 'count')
        val_x = tokenizer.sequences_to_matrix(val_x, mode = 'count')
        test_x = tokenizer.sequences_to_matrix(test_x, mode = 'count')
                                                       
    else:
        all_x = pad_sequences(all_x, maxlen=maxlen)

    #train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x, all_y, prop_vec)

    return train_x, test_x, val_x

In [7]:
def create_compile_rnn(input_shape=[None], embedding_size=128, num_outputs=1, verbose=False): 
    
    from tensorflow.keras.layers import Embedding, GRU, LSTM
    
    text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
    max_features = 4000  # Maximum vocab size.
    max_len = 4  # Sequence length to pad the outputs to.

    vectorize_layer = tf.keras.layers.TextVectorization(
                                            max_tokens=max_features,
                                            output_mode='int',
                                            output_sequence_length=max_len)
    
    vectorize_layer.adapt(text_dataset.batch(64))
    
    
    model = keras.models.Sequential(name='twitter-RNN')
        
        
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    
    model.add(Embedding(input_dim = vocab_size, output_dim = embedding_size))
    
    model.add(GRU(64, return_sequences=True, name='gru1'))
    model.add(GRU(32, return_sequences=True, name='gru2'))
    model.add(GRU(24, return_sequences=True, name='gru3'))
    model.add(GRU(8, name='gru4'))
    
    model.add(Dense(num_outputs, activation='sigmoid', name='output'))

    if verbose:
        model.summary()
        
    opt = keras.optimizers.Adam(lr=0.001)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

In [8]:
model = create_compile_rnn(verbose=True)

2022-07-21 17:36:07.304342: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-07-21 17:36:07.516493: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Model: "twitter-RNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 4)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 4, 128)            1536000   
_________________________________________________________________
gru1 (GRU)                   (None, 4, 64)             37248     
_________________________________________________________________
gru2 (GRU)                   (None, 4, 32)             9408      
_________________________________________________________________
gru3 (GRU)                   (None, 4, 24)             4176      
_________________________________________________________________
gru4 (GRU)                   (None, 8)                 816       
_________________________________________________________________
output (Dense)               (None, 1)                 

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [9]:
max_epochs = 3
batch_size = 128

model.fit(train_x, train_y, epochs=max_epochs, batch_size=batch_size, validation_data=(val_x, val_y))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe550ed1910>