In [1]:
import pandas as pd
import numpy as np

from nltk import word_tokenize

import warnings
warnings.filterwarnings('ignore')

import os
import pickle

from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Dropout, Activation,concatenate, Input

from keras.layers.embeddings import Embedding

from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau, EarlyStopping

from keras.models import Model


Using TensorFlow backend.


### LOADING THE CLEANED DATASETS

In [2]:
x_train_title = pd.read_pickle(r"D:\Poonam\Project\clean_x_train_title.pkl")
x_test_title = pd.read_pickle(r"D:\Poonam\Project\clean_x_test_title.pkl")

x_train_body = pd.read_pickle(r"D:\Poonam\Project\clean_x_train_body.pkl")
x_test_body = pd.read_pickle(r"D:\Poonam\Project\clean_x_test_body.pkl")

In [3]:
x_train_title.shape , x_test_title.shape, x_train_body.shape , x_test_body.shape

((565068,), (141268,), (565068,), (141268,))

In [4]:
y_train = pd.read_pickle(r"D:\Poonam\Project\y_train.pkl")
y_test = pd.read_pickle(r"D:\Poonam\Project\y_test.pkl")

In [5]:
y_train.shape , y_test.shape

((565068,), (141268,))

### MATRIX OF TITLE

In [8]:
title_len=[]

for title in x_train_title:
    title_len.append(len(word_tokenize(title)))

In [9]:
max(title_len)

22

In [10]:
print(np.quantile(title_len,0.95))
np.quantile(title_len,0.9999)

9.0


18.0

In [11]:
title_len1=[]
for title in x_test_title:
    title_len1.append(len(word_tokenize(title)))

In [12]:
print(max(title_len1))
print(np.quantile(title_len1,0.95))
np.quantile(title_len1,0.9999)

34
9.0


18.0

In [13]:
max_len1=20

In [14]:
tok = Tokenizer(char_level=False, split=' ')
tok.fit_on_texts(x_train_title)

In [15]:
vocab_len1=len(tok.index_word.keys())
vocab_len1

101026

In [16]:
seq_train_title = tok.texts_to_sequences(x_train_title)
seq_train_title

matrix_train_title = sequence.pad_sequences(seq_train_title, maxlen=max_len1)
matrix_train_title.shape

(565068, 20)

In [17]:
seq_test_title = tok.texts_to_sequences(x_test_title)
seq_test_title

matrix_test_title = sequence.pad_sequences(seq_test_title, maxlen=max_len1)
matrix_test_title.shape

(141268, 20)

### MATRIX FOR BODY

In [19]:
body_len=[]

for body in x_train_body:
    body_len.append(len(word_tokenize(body)))

In [20]:
max(body_len)

print(np.quantile(body_len,0.95))

315.0


In [21]:
body_len1=[]
for body in x_test_body:
    body_len1.append(len(word_tokenize(body)))

In [22]:
print(max(body_len1))
print(np.quantile(body_len1,0.95))

8445
313.0


In [23]:
max_len2=330

In [24]:
tok.fit_on_texts(x_train_body)

vocab_len2=len(tok.index_word.keys())
vocab_len2

3945147

In [25]:
seq_train_body = tok.texts_to_sequences(x_train_body)
seq_train_body

matrix_train_body = sequence.pad_sequences(seq_train_body, maxlen=max_len2)
matrix_train_body.shape

(565068, 330)

In [26]:
seq_test_body = tok.texts_to_sequences(x_test_body)
seq_test_body

matrix_test_body = sequence.pad_sequences(seq_test_body, maxlen=max_len2)
matrix_test_body.shape

(141268, 330)

### Target encoding

In [28]:
mlb = MultiLabelBinarizer()

train_y = mlb.fit_transform(y_train)

train_y.shape

(565068, 10)

In [29]:
test_y = mlb.transform(y_test)

### Define model architecture

In [31]:
def multiclass_model():
    # channel 1 for title
    input1 = Input( shape=(max_len1,),name = "title_input")
    embed1 = Embedding(vocab_len1+1,100)(input1)
    conv1 = Conv1D(filters=32, kernel_size = 2, activation="relu")(embed1)
    drop1 = Dropout(0.2)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    drop2 = Dropout(0.2)(flat1)
    dense1 = Dense(100, activation="relu")(drop2)
    
    #channel 2 for body
    input2 = Input( shape=(max_len2,),name = "body_input")
    embed2 = Embedding(vocab_len2+1,100)(input2)
    conv2 = Conv1D(filters=32, kernel_size = 2, activation="relu")(embed2)
    drop3 = Dropout(0.2)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop3)
    flat2 = Flatten()(pool2)
    drop4 = Dropout(0.2)(flat2)
    dense2 = Dense(100 ,activation="relu")(drop4)
    
    merged = concatenate([dense1,dense2])
    
    dense3 = Dense(50, activation="relu")(merged)
    drop5 = Dropout(0.2)(dense3)
    
    # output layer
    
    output = Dense(10,name ="main_output", activation="sigmoid")(drop5)
    
    model = Model(inputs=[input1,input2],outputs=output)
    return model

model = multiclass_model()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_input (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
body_input (InputLayer)         (None, 330)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 100)      10102700    title_input[0][0]                
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 330, 100)     394514800   body_input[0][0]                 
____________________________________________________________________________________________

In [32]:
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

### DEFINING CALLBACKS

In [33]:
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau, EarlyStopping
import os
from sklearn.utils import class_weight

filepath = r'D:\Poonam\Project\Stack_m2_output.h5'

checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             save_weights_only=True, 
                             mode='auto')

earlystop = EarlyStopping(monitor='val_loss', 
                          min_delta=0.01, patience= 5,
                          verbose=1, mode='auto')

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=5, min_lr=0.00001)

In [34]:
results=model.fit({'title_input': matrix_train_title, 'body_input': matrix_train_body}, train_y,
          validation_data = [{'title_input':matrix_test_title, 'body_input': matrix_test_body},test_y], epochs=30, batch_size=20000, callbacks=[checkpoint,earlystop,reduce_lr])

Train on 565068 samples, validate on 141268 samples
Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.26300, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.26300 to 0.51315, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.51315 to 0.67973, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.67973 to 0.73504, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.73504 to 0.77252, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 6/30

Epoch 00006: val_accuracy improved from 0.77252 to 0.79193, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 7/30

Epoch 00007: val_accuracy improved from 0.79193 to 0.79884, saving model to D:\Poonam\Project\Stack_m2_output.h5
Epoch 8/30

Epoch 00008: val_accuracy improved from 0.79884 to 0.80046, savi

### SAVING THE MODEL ARCHITECTURE

In [35]:
from keras.models import model_from_json

model_json = model.to_json()

In [36]:
with open("Stack_m2.json","w") as json_file:
    json_file.write(model_json)