# NLP Quick Start for Newbies 😁 - in 9 Steps

## INDEX
```
Step 1. Library Import & Data Load
Step 2. Data Preprocessing
     2-a. Drop Columns
     2-b. Tokenizer
     2-c. Pad Sequences
     2-d. Match Data type to numpy.ndarray
Step 3. Modeling
Step 4. Model Compile
Step 5. Callbacks
Step 6. Model Fit
Step 7. Model Evaluate & Save
Step 8. Reload Model
Step 9. Predict Test Data
```
---

## Step 1. Library Import & Data Load

In [None]:
import pandas as pd 
import numpy as np 

In [None]:
# Load the train and test CSV files from the Kaggle input directory.
train_path = '/kaggle/input/nlp-getting-started/train.csv'
test_path = '/kaggle/input/nlp-getting-started/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Summarize the training columns (dtypes and non-null counts).
train_df.info()

In [None]:
# Count the distinct values in each training column.
train_df.nunique()

In [None]:
# The same three checks, repeated for the test data.
test_df.head()

In [None]:
test_df.info()

In [None]:
test_df.nunique()

## Step 2. Data Preprocessing

### 2-a. Drop Columns

In [None]:
train_df.head()

In [None]:
train_df.drop(columns=['id','keyword','location'], axis=1, inplace=True)

In [None]:
test_df.head()

In [None]:
test_df.drop(columns=['keyword','location'],axis=1, inplace=True)

In [None]:
print(train_df.shape, test_df.shape)

### 2-b. Tokenizer

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['text'],train_df['target'], test_size=0.2, random_state=111)

In [None]:
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
vocab_size = 1000
oov_token = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

In [None]:
tokenizer.fit_on_texts(X_train)

In [None]:
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)

In [None]:
for i in range(10):
    print(len(X_train[i]))

In [None]:
X_train[0]

In [None]:
for i in range(10):
    print(len(X_valid[i]))

In [None]:
X_valid[0]

### 2-c. Pad Sequences

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
max_length = 120
trunc_type = 'post'
pad_type = 'post'

In [None]:
X_train_padded = pad_sequences(X_train, maxlen=max_length, truncating=trunc_type, padding=pad_type)
X_valid_padded = pad_sequences(X_valid, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [None]:
X_train_padded[:2]

In [None]:
X_valid_padded[:2]

In [None]:
print(X_train_padded.shape, X_valid_padded.shape)

### 2-d. Match Data type to numpy.ndarray

In [None]:
print(type(X_train_padded), type(X_valid_padded))
print(type(y_train), type(y_valid))

In [None]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [None]:
print(type(X_train_padded), type(X_valid_padded))
print(type(y_train), type(y_valid))

## Step 3. Modeling

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten

In [None]:
embedding_dim = 16
# vocab_size = 1000
# max_length = 120

In [None]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.5)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
model.summary()

## Step 4. Model Compile

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# and accuracy reported under the name 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

## Step 5. Callbacks

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
filepath = 'my_checkpoint.ckpt'
cp = ModelCheckpoint(
    filepath=filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)

In [None]:
ep = EarlyStopping(
    monitor='val_loss', 
    patience=5,
)

## Step 6. Model Fit

In [None]:
# Train for at most `epochs` epochs; early stopping may end the run sooner,
# and the checkpoint callback saves the best weights along the way.
epochs = 30
model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=epochs,
    callbacks=[cp, ep],
    validation_data=(X_valid_padded, y_valid),
)

## Step 7. Model Evaluate & Save

In [None]:
# Restore the weights written by the checkpoint callback (best val_loss epoch).
model.load_weights(filepath)

In [None]:
# Score the restored model on the validation split (loss and 'acc').
model.evaluate(X_valid_padded, y_valid)

In [None]:
# Show the first validation sequence (token ids, before padding).
X_valid[0]

In [None]:
# Save the whole model to an HDF5 file.
# NOTE(review): assumes ./model exists or is created by Keras on save — confirm.
model.save('./model/basic_nlp.h5')

## Step 8. Reload Model

In [None]:
import tensorflow as tf

In [None]:
# Load the model back from the HDF5 file written in Step 7.
mymodel = tf.keras.models.load_model('./model/basic_nlp.h5')

In [None]:
# Print the reloaded model's layers for comparison with the Step 3 summary.
mymodel.summary()

## Step 9. Predict Test Data 

In [None]:
# Encode the test texts with the tokenizer that was fitted on the training texts.
X_test = tokenizer.texts_to_sequences(test_df['text'])

In [None]:
# Pad/truncate with the same settings used for the training data.
X_test_padded = pad_sequences(X_test, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [None]:
# Predict with the in-memory `model` (not the reloaded `mymodel`); the final
# sigmoid layer yields one score per test row.
y_test_raw = model.predict(X_test_padded)

In [None]:
y_test_raw

In [None]:
# Threshold the sigmoid scores at 0.5 to get 0/1 labels. `y_test_raw` holds one
# score per row in a 2-D array, so flatten it and compare the whole array at
# once instead of relying on the truthiness of one-element arrays in a lambda.
# `.tolist()` keeps `y_test` a plain list of Python ints, as before.
y_test = (np.asarray(y_test_raw).ravel() > 0.5).astype(int).tolist()

In [None]:
# Distinct predicted labels (each prediction is 0 or 1 after thresholding).
set(y_test)

In [None]:
y_test[:5]

In [None]:
# Attach the predicted labels to the test rows.
test_df['predict'] = y_test

In [None]:
test_df

In [None]:
# Rows predicted as class 1.
test_df[test_df['predict']==1]

In [None]:
# Keep only the row id and the predicted label for the submission.
submission = test_df[['id','predict']]

In [None]:
submission

In [None]:
# Rename `predict` to `target` for the submission file.
submission.columns = ['id', 'target']

In [None]:
submission

In [None]:
# Write the submission CSV without the DataFrame index column.
submission.to_csv('./sample_submission.csv', index=False)