#### News Category Prediction

Dataset link: https://machinehack.com/hackathons/predict_the_news_category_hackathon/overview

#### Importing Necessary Libraries 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Folder holding the hackathon spreadsheets; change this single constant to
# point the notebook at a different location.
DATA_DIR = '/content/drive/MyDrive/Datasets/News Category'

# Training stories with their SECTION labels, unlabeled test stories, and the
# submission template whose SECTION column is filled in later.
train_df = pd.read_excel(f'{DATA_DIR}/Data_Train.xlsx')
test_df = pd.read_excel(f'{DATA_DIR}/Data_Test.xlsx')
sample_df = pd.read_excel(f'{DATA_DIR}/Sample_submission.xlsx')

In [None]:
sample_df

Unnamed: 0,SECTION
0,3
1,3
2,3
3,3
4,3
...,...
2743,2
2744,2
2745,2
2746,2


In [None]:
test_df.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [None]:
import tqdm
from preprocess_engine_kaggle import preprocessor_engine
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def corpus_pre_processor(corpus):
    """Normalize every document in ``corpus`` and return the results as a list.

    Each document is passed through ``preprocessor_engine`` with a fixed set of
    flags (contraction fixing, stop-word removal and digit removal enabled;
    HTML stripping, accent handling and lower-casing disabled). A tqdm progress
    bar is displayed while the corpus is iterated.

    Parameters
    ----------
    corpus : iterable
        Documents to normalize; iterated once, in order.

    Returns
    -------
    list
        One ``preprocessor_engine`` result per input document, in input order.
    """
    return [
        preprocessor_engine(
            document,
            html_strip=False,
            accent_characters=False,
            fix_contract=True,
            remove_stop_words=True,
            remove_digits=True,
            lower=False,
        )
        for document in tqdm.tqdm(corpus)
    ]

In [None]:
train_data = corpus_pre_processor(train_df['STORY'])

100%|██████████| 7628/7628 [03:05<00:00, 41.06it/s]


In [None]:
test_data = corpus_pre_processor(test_df['STORY'])

100%|██████████| 2748/2748 [01:04<00:00, 42.38it/s]


In [None]:
#Original Data
train_df.iloc[0,0]

'But the most painful was the huge reversal in fee income, unheard of among private sector lenders. Essentially, it means that Yes Bank took it for granted that fees on structured loan deals will be paid and accounted for upfront on its books. As borrowers turned defaulters, the fees tied to these loan deals fell off the cracks. Gill has now vowed to shift to a safer accounting practice of amortizing fee income rather than booking these upfront.\n\n\nGill’s move to mend past ways means that there will be no nasty surprises in the future. This is good news considering that investors love a clean image and loathe uncertainties.\n\n\nBut there is no gain without pain and the promise of a strong and stable balance sheet comes with some sacrifices as well. Investors will have to give up the hopes of phenomenal growth, a promise made by Kapoor.'

In [None]:
#Preprocessed Data
train_data[0]

'painful huge reversal fee income unheard private sector lender essentially mean yes bank grant fee structured loan deal pay account upfront book borrower turn defaulter fee tie loan deal fall crack gill vow shift safe accounting practice amortize fee income book upfront gill s mend past way mean nasty surprise future good news consider investor love clean image loathe uncertainty gain pain promise strong stable balance sheet come sacrifice investor hope phenomenal growth promise kapoor'

In [None]:
#Section's value counts
train_df['SECTION'].value_counts()

1    2772
2    1924
0    1686
3    1246
Name: SECTION, dtype: int64

In [None]:
# Word-level tokenizer; any word that is not in the fitted vocabulary is mapped
# to the '<UNK>' out-of-vocabulary token instead of being dropped.
tokenzer = tf.keras.preprocessing.text.Tokenizer(oov_token = '<UNK>')
# The vocabulary is learned from the preprocessed training stories only, so the
# test stories never influence the word index.
tokenzer.fit_on_texts(train_data)

In [None]:
# Convert both splits to lists of integer word ids using the vocabulary fitted
# on the training stories (train is converted first, then test).
train_sequences, test_sequences = map(
    tokenzer.texts_to_sequences, (train_data, test_data)
)

In [None]:
print("Vocabulary size ={}".format(len(tokenzer.word_index)))
print("Number of Documents={}".format(tokenzer.document_count))

Vocabulary size =25395
Number of Documents=7628


In [None]:
pd.Series(train_data).apply(lambda x : len(x.split())).max()

546

In [None]:
# One slot longer than the longest preprocessed training story (546 words).
MAX_SEQUENCE_LENGTH = 547

# Bring both splits to the same fixed length, appending padding at the end
# ('post') of each sequence.
train_pad_sequences, test_pad_sequneces = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post'
    )
    for sequences in (train_sequences, test_sequences)
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, RNN, LSTM, GRU, Bidirectional, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import files

In [None]:
y = train_df['SECTION']
y.nunique()

4

In [None]:
def deep_model(layer_name, filename=None, epochs=50, final_pred = True):
    """Build, compile and train a single-recurrent-layer classifier.

    Architecture: Embedding(300) -> ``layer_name(32)`` -> Dense(32, relu)
    -> Dense(4, softmax). NumPy and TensorFlow are both seeded with 42 before
    the model is created.

    Parameters
    ----------
    layer_name : callable
        Recurrent layer class, instantiated as ``layer_name(32)``
        (the notebook passes ``LSTM`` or ``GRU``).
    filename : str, optional
        Forwarded to ``fit_the_model``, which uses it as the prediction-file
        prefix when ``final_pred`` is true.
    epochs : int
        Maximum number of training epochs, forwarded to ``fit_the_model``.
    final_pred : bool
        Forwarded to ``fit_the_model``; when true, test predictions are
        written after training.
    """
    seed = 42
    np.random.seed(seed)
    tf.random.set_seed(seed)

    embedding_dim = 300  # size of the dense vector learned for each token
    vocab_size = len(tokenzer.word_index)

    model = Sequential()
    # input_dim is vocab_size + 1 so that the largest word id is a valid index.
    model.add(
        Embedding(
            input_dim=vocab_size + 1,
            output_dim=embedding_dim,
            input_length=MAX_SEQUENCE_LENGTH,
        )
    )
    model.add(layer_name(32))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(4, activation='softmax'))

    # Sparse loss: the labels are integer class ids, not one-hot vectors.
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer="adam",
        metrics=['accuracy'],
    )
    model.summary()
    fit_the_model(
        model,
        filename=filename,
        epochs=epochs,
        verbose=1,
        final_pred=final_pred,
    )

In [None]:
def predictions(model, file_name):
    """Predict the test-set classes and export them as a submission CSV.

    The predicted class ids overwrite the ``SECTION`` column of the global
    ``sample_df`` (in place). The frame is then written to
    ``f'{file_name}pred.csv'`` with its index included, and that file is
    handed to ``files.download``.

    Parameters
    ----------
    model : trained Keras model
        Must return one row of class probabilities per padded test sequence.
    file_name : str
        Prefix of the output file name; also printed before the file is written.
    """
    # Sequential.predict_classes() was removed in TensorFlow 2.6. Taking the
    # argmax over the softmax outputs is the documented replacement for a
    # multi-class model.
    class_probabilities = model.predict(test_pad_sequneces)
    test_pred = np.argmax(class_probabilities, axis=-1)

    sample_df['SECTION'] = test_pred
    print(file_name)

    output_path = f'{file_name}pred.csv'
    sample_df.to_csv(output_path, index = True)
    files.download(output_path)

def fit_the_model(model, filename, epochs=50, verbose=1, final_pred = True):
    """Train ``model`` on the padded training sequences with early stopping.

    10% of the training data is held out for validation. Training stops once
    ``val_loss`` has not improved for 5 consecutive epochs.

    Parameters
    ----------
    model : compiled Keras model
    filename : str
        Prefix passed to ``predictions`` when ``final_pred`` is true.
    epochs : int
        Upper bound on the number of training epochs.
    verbose : int
        Verbosity level forwarded to ``model.fit``.
    final_pred : bool
        When true, run ``predictions(model, filename)`` after training.

    Returns
    -------
    The ``History`` object returned by ``model.fit``.
    """
    # restore_best_weights=True rolls the model back to the epoch with the
    # lowest val_loss when early stopping triggers; without it the exported
    # predictions would come from weights trained `patience` epochs past the
    # best one.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = model.fit(
        train_pad_sequences,
        y,
        epochs=epochs,
        validation_split=0.1,
        callbacks=[early_stop],
        verbose=verbose,
    )
    if final_pred:
        predictions(model, filename)
    return history

In [None]:
deep_model(LSTM, filename="LSTM")

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 547, 300)          7618800   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 1028      
Total params: 7,872,500
Trainable params: 7,872,500
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
deep_model(GRU, filename="GRU ")

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 547, 300)          7618800   
_________________________________________________________________
gru_1 (GRU)                  (None, 128)               165120    
_________________________________________________________________
dense_10 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_11 (Dense)             (None, 4)                 1028      
Total params: 7,817,972
Trainable params: 7,817,972
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def stack_model(layer_name, filename, epochs=50, final_pred=True):
    """Build, compile and train a classifier with two stacked recurrent layers.

    Architecture: Embedding(300) -> ``layer_name(256)`` returning the full
    sequence -> ``layer_name(128)`` returning the last output ->
    Dense(128, relu) -> Dense(4, softmax). NumPy and TensorFlow are both
    seeded with 42 before the model is created.

    Parameters
    ----------
    layer_name : callable
        Recurrent layer class accepting ``units`` and ``return_sequences``
        (the notebook passes ``LSTM`` or ``GRU``).
    filename : str
        Forwarded to ``fit_the_model`` as the prediction-file prefix.
    epochs : int
        Maximum number of training epochs, forwarded to ``fit_the_model``.
    final_pred : bool
        Forwarded to ``fit_the_model``; when true (the default, matching the
        previous behaviour) test predictions are written after training.
    """
    SEED = 42
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    EMBEDDING_DIM = 300  # Dimension for dense embedding for each token
    VOCAB_SIZE = len(tokenzer.word_index)

    model = Sequential()
    # input_dim is VOCAB_SIZE + 1 so that the largest word id is a valid index.
    model.add(Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    # The first recurrent layer must emit the whole sequence so that the second
    # recurrent layer receives one input per time step.
    model.add(layer_name(256, return_sequences=True))
    model.add(layer_name(128, return_sequences=False))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(4, activation='softmax'))

    # Sparse loss: the labels are integer class ids, not one-hot vectors.
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer="adam", metrics=['accuracy'])
    model.summary()
    # Forward the caller's `epochs`; it was previously shadowed by a
    # hard-coded 50, so the argument had no effect.
    fit_the_model(model, filename=filename, epochs=epochs, verbose=1, final_pred=final_pred)

In [None]:
stack_model(LSTM, filename="LSTM_Stack ")

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 547, 300)          7618800   
_________________________________________________________________
lstm_4 (LSTM)                (None, 547, 256)          570368    
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_12 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_13 (Dense)             (None, 4)                 516       
Total params: 8,403,316
Trainable params: 8,403,316
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
stack_model(GRU, filename="GRU_Stack ")

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 547, 300)          7618800   
_________________________________________________________________
gru_2 (GRU)                  (None, 547, 256)          428544    
_________________________________________________________________
gru_3 (GRU)                  (None, 128)               148224    
_________________________________________________________________
dense_14 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_15 (Dense)             (None, 4)                 516       
Total params: 8,212,596
Trainable params: 8,212,596
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def bidirect_model(layer_name, filename, epochs=50, final_pred = True, seed=None):
    """Build, compile and train a classifier with three bidirectional layers.

    Architecture: Embedding(300) -> Bidirectional(``layer_name(256)``) ->
    Bidirectional(``layer_name(128)``) -> Bidirectional(``layer_name(128)``,
    last output only) -> Dense(256, relu) -> Dense(4, softmax).

    Parameters
    ----------
    layer_name : callable
        Recurrent layer class accepting ``units`` and ``return_sequences``
        (the notebook passes ``LSTM`` or ``GRU``).
    filename : str
        Forwarded to ``fit_the_model`` as the prediction-file prefix.
    epochs : int
        Maximum number of training epochs, forwarded to ``fit_the_model``.
    final_pred : bool
        Forwarded to ``fit_the_model``; when true, test predictions are
        written after training.
    seed : int, optional
        When given, NumPy and TensorFlow are seeded with it before the model
        is created so the run can be repeated. The default ``None`` leaves the
        random state untouched, so each call gives a different run.
    """
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    EMBEDDING_DIM = 300  # Dimension for dense embedding for each token
    VOCAB_SIZE = len(tokenzer.word_index)

    model = Sequential()
    # input_dim is VOCAB_SIZE + 1 so that the largest word id is a valid index.
    model.add(Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    # Every recurrent layer except the last emits the whole sequence so that
    # the following recurrent layer receives one input per time step.
    model.add(Bidirectional(layer_name(256, return_sequences=True)))
    model.add(Bidirectional(layer_name(128, return_sequences=True)))
    model.add(Bidirectional(layer_name(128, return_sequences=False)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(4, activation='softmax'))

    # Sparse loss: the labels are integer class ids, not one-hot vectors.
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer="adam", metrics=['accuracy'])
    model.summary()
    fit_the_model(model, filename=filename, epochs=epochs, verbose=1, final_pred=final_pred)

In [None]:
bidirect_model(LSTM, filename="Bi_direct_LSTM ")

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 547, 300)          7618800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 547, 512)          1140736   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 1028      
Total params: 9,482,740
Trainable params: 9,482,740
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
bidirect_model(GRU, filename="Bi_direct_GRU ")

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 547, 300)          7618800   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 547, 512)          857088    
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 256)               493056    
_________________________________________________________________
dense_28 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_29 (Dense)             (None, 4)                 1028      
Total params: 9,035,764
Trainable params: 9,035,764
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50




Bi_direct_GRU 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
bidirect_model(GRU, filename="Bi_direct_GRU_3 ")

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 547, 300)          7618800   
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 547, 512)          857088    
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 547, 256)          493056    
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 256)               296448    
_________________________________________________________________
dense_32 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_33 (Dense)             (None, 4)                 1028      
Total params: 9,332,212
Trainable params: 9,332,212
Non-trainable params: 0
___________________________________________



Bi_direct_GRU_3 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
bidirect_model(GRU, filename="Bi_direct_GRU_wi ")

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 547, 300)          7618800   
_________________________________________________________________
bidirectional_20 (Bidirectio (None, 547, 512)          857088    
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 547, 256)          493056    
_________________________________________________________________
bidirectional_22 (Bidirectio (None, 256)               296448    
_________________________________________________________________
dense_34 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_35 (Dense)             (None, 4)                 1028      
Total params: 9,332,212
Trainable params: 9,332,212
Non-trainable params: 0
___________________________________________



Bi_direct_GRU_wi 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>