In [3]:
import pandas as pd

# Load the scraped Hindi news articles into a DataFrame.
df = pd.read_csv(
    "..//input/hindinewsbook/HindiNewsBook.csv"
)

# Preview the first rows (all columns) to confirm the Hindi text decodes and displays correctly.
print(df.head())

                                               title  \
0  FIR न करने पर दिल्ली पुलिस को नोटिस, पहलवान बो...   
1  बोले- बबीता ने रिपोर्ट पढ़कर साइन किए; उत्पीड़न...   
2              DNPA Code of Ethics For News Websites   
3  Aaj Ka Rashifal: Daily Rashifal, Today's Rashi...   
4  ACB की FIR में IAS दहिया का नाम; ऑस्ट्रेलिया द...   

                                                text  
0  Hindi News\r\nLocal\r\nHaryana\r\nPanipat\r\nB...  
1  Hindi News\r\nLocal\r\nHaryana\r\nRohtak\r\nBr...  
2  CODE OF ETHICS FOR DIGITAL NEWS WEBSITES\r\nTh...  
3  और देखें\r\nवीडियो\r\nOur Divisions\r\nCopyrig...  
4  Hindi News\r\nLocal\r\nHaryana\r\nHaryana Kaus...  


In [4]:
import re

# Text normalisation for the Hindi news corpus.
#
# All patterns are raw strings so that regex escapes (\s, \d, \., \+ ...) reach the `re`
# module untouched instead of relying on invalid Python string escapes, and they are
# compiled once at definition time rather than once per row.

def _compile_steps(steps):
    """Compile an ordered sequence of (pattern, replacement) pairs."""
    return [(re.compile(pattern), replacement) for pattern, replacement in steps]


# Substitutions applied, in this order, before URL shortening.
_STEPS_BEFORE_URL = _compile_steps((
    (r"(\t)", ' '),                                   # escape characters -> space
    (r"(\r)", ' '),
    (r"(\n)", ' '),
    (r"[a-zA-Z]", ''),                                # drop every ASCII letter
    (r"(__+)", ' '),                                  # runs of 2+ underscores
    (r"(--+)", ' '),                                  # runs of 2+ hyphens
    (r"(~~+)", ' '),                                  # runs of 2+ tildes
    (r"(\+\++)", ' '),                                # runs of 2+ plus signs
    (r"(\.\.+)", ' '),                                # runs of 2+ full stops
    (r"[<>()|&©ø\[\]\'\",;?~*!]", ' '),               # assorted punctuation
    # NOTE(review): ASCII letters were already stripped above, so the next four rules
    # (mailto:, literal \x9N, INC<n>, CM<n>/CHG<n>) can practically no longer match.
    # They are kept in place so the pipeline order is unchanged.
    (r"(mailto:)", ' '),
    (r"(\\x9\d)", ' '),
    (r"([iI][nN][cC]\d+)", 'INC_NUM'),
    (r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM'),
    (r"(\.\s+)", ' '),                                # full stop at the end of a word
    (r"(\-\s+)", ' '),                                # hyphen at the end of a word
    (r"(\:\s+)", ' '),                                # colon at the end of a word
    (r"(\s+.\s+)", ' '),                              # single character between whitespace
))

# http://www.youtube.com/watch/43865346kcre8375 ====> www.youtube.com (group 3 is the host).
_URL_RE = re.compile(r'((https*:\/*)([^\/\s]+))(.[^\s]+)')

# Final whitespace squeeze; the single-character rule must always run last.
_STEPS_AFTER_URL = _compile_steps((
    (r"(\s+)", ' '),
    (r"(\s+.\s+)", ' '),
))


def text_strip(column):
    """Lazily clean every entry of ``column``.

    Each entry is converted with ``str()``, passed through the ordered regex
    substitutions above (lower-casing after every step), has each URL replaced
    by its host part, and finally has whitespace runs collapsed.

    Parameters
    ----------
    column : iterable
        Values to clean (e.g. a pandas Series); non-string values are stringified.

    Yields
    ------
    str
        The cleaned text, one item per input entry, in input order.
    """
    for row in column:
        row = str(row)

        for pattern, replacement in _STEPS_BEFORE_URL:
            row = pattern.sub(replacement, row).lower()

        # Each URL is replaced by its *own* host. The callable replacement inserts the
        # host literally (no backslash/group-reference interpretation), and a row without
        # any URL is simply left unchanged - no exception is needed for control flow.
        row = _URL_RE.sub(lambda match: match.group(3), row)

        for pattern, replacement in _STEPS_AFTER_URL:
            row = pattern.sub(replacement, row).lower()

        yield row

In [5]:
# text_strip is a generator function: these calls only create the lazy cleaners,
# the actual work happens when they are consumed further down.
cleaning1, cleaning2 = text_strip(df['text']), text_strip(df['title'])

In [6]:
from time import time
import spacy
# spaCy pipeline with the NER and parser components switched off.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

t = time()

# Stream the lazily cleaned article bodies through nlp.pipe() in batches of 500
# and turn every resulting Doc back into a plain string.
text = list(map(str, nlp.pipe(cleaning1, batch_size=500)))

# Report how long the pass took, in minutes (the recorded run shows about 1.2 mins).
print('Cleaning time for text: {} mins'.format(round((time() - t) / 60, 2)))

Cleaning time for text: 1.2 mins


In [7]:
t = time()

# Same spaCy pass for the headlines (batches of 5000); every headline is wrapped
# in explicit _START_ / _END_ boundary markers.
summary = list(map(
    lambda doc: '_START_ '+ str(doc) + ' _END_',
    nlp.pipe(cleaning2, batch_size=5000),
))

# Report how long the pass took, in minutes (the recorded run shows about 0.03 mins).
print('Cleaning time for summary: {} mins'.format(round((time() - t) / 60, 2)))

Cleaning time for summary: 0.03 mins


In [8]:
# Spot-check the first cleaned headline, including its _START_/_END_ markers.
summary[0]

'_START_  करने पर दिल्ली पुलिस को नोटिस पहलवान बोले शिकायत वापस लेने का दबाव जान को खतरा _END_'

In [9]:
import os
# os.mkdir("/kaggle/working/file")


In [10]:
with open("E:\Downloads\text.txt", "w",encoding='utf-8') as f:
  for item in text:
    f.write(item + '\n')

with open('E:\Downloads\summary.txt', "w",encoding='utf-8') as f:
  for item in summary:
    f.write(item + '\n')

In [11]:
text1 = []
summary1 = []
text = []
summary = []

with open('E:\Downloads\text.txt', "r",encoding='utf-8') as f:
  for line in f.readlines():
    text1.append(line)

with open('E:\Downloads\summary.txt', "r",encoding='utf-8') as f:
  for line in f.readlines():
    summary1.append(line)

for item in text1:
  text.append(item.replace('\n', ''))

for item in summary1:
  summary.append(item.replace('\n', ''))

In [12]:
# Attach the reloaded cleaned strings to the original frame as two new columns.
df['cleaned_text'], df['cleaned_summary'] = pd.Series(text), pd.Series(summary)
df.head()

Unnamed: 0,title,text,cleaned_text,cleaned_summary
0,"FIR न करने पर दिल्ली पुलिस को नोटिस, पहलवान बो...",Hindi News\r\nLocal\r\nHaryana\r\nPanipat\r\nB...,महिला रेसलर्स की याचिका पर में शुक्रवार को सु...,_START_ करने पर दिल्ली पुलिस को नोटिस पहलवान ...
1,बोले- बबीता ने रिपोर्ट पढ़कर साइन किए; उत्पीड़न...,Hindi News\r\nLocal\r\nHaryana\r\nRohtak\r\nBr...,योगेश्वर दत्त ने बबीता-विनेश के आरोपों को नका...,_START_ बोले बबीता ने रिपोर्ट पढ़कर साइन किए उ...
2,DNPA Code of Ethics For News Websites,CODE OF ETHICS FOR DIGITAL NEWS WEBSITES\r\nTh...,19 .,_START_ _END_
3,"Aaj Ka Rashifal: Daily Rashifal, Today's Rashi...",और देखें\r\nवीडियो\r\nOur Divisions\r\nCopyrig...,और देखें वीडियो 2023-24 .,_START_ _END_
4,ACB की FIR में IAS दहिया का नाम; ऑस्ट्रेलिया द...,Hindi News\r\nLocal\r\nHaryana\r\nHaryana Kaus...,हरियाणा कौशल विकास मिशन रिश्वतकांड की में दहि...,_START_ की में दहिया का नाम ऑस्ट्रेलिया दौरा ...


In [13]:
# Fraction of cleaned headlines that are at most 25 whitespace-separated tokens long.
count = len([headline for headline in df['cleaned_summary'] if len(headline.split()) <= 25])
print(count/len(df['cleaned_summary']))

0.9930107526881721


In [14]:
# Fraction of cleaned articles that are at most 1700 whitespace-separated tokens long.
count = len([article for article in df['cleaned_text'] if len(article.split()) <= 1700])
print(count/len(df['cleaned_text']))

0.9559139784946237


In [15]:
# Token-length caps used for filtering and for padding. According to the coverage
# checks recorded above, about 99.3% of headlines fit in 25 tokens and about 95.6%
# of articles fit in 1700 tokens.
max_summary_len = 25
max_text_len = 1700

In [16]:
import numpy as np
cleaned_text = np.array(df['cleaned_text'])
cleaned_summary = np.array(df['cleaned_summary'])

# Keep only the (article, headline) pairs in which both sides respect their length cap.
kept_pairs = [
    (article, headline)
    for article, headline in zip(cleaned_text, cleaned_summary)
    if len(headline.split()) <= max_summary_len and len(article.split()) <= max_text_len
]
short_text = [article for article, _ in kept_pairs]
short_summary = [headline for _, headline in kept_pairs]

post_pre_data = pd.DataFrame({'text': short_text, 'summary': short_summary})
post_pre_data.head()

Unnamed: 0,text,summary
0,महिला रेसलर्स की याचिका पर में शुक्रवार को सु...,_START_ करने पर दिल्ली पुलिस को नोटिस पहलवान ...
1,योगेश्वर दत्त ने बबीता-विनेश के आरोपों को नका...,_START_ बोले बबीता ने रिपोर्ट पढ़कर साइन किए उ...
2,19 .,_START_ _END_
3,और देखें वीडियो 2023-24 .,_START_ _END_
4,हरियाणा कौशल विकास मिशन रिश्वतकांड की में दहि...,_START_ की में दहिया का नाम ऑस्ट्रेलिया दौरा ...


In [17]:
# Wrap every headline in the decoder's start/end-of-sequence tokens.
# NOTE: this reads and overwrites the same column, so running the cell twice wraps twice.
post_pre_data['summary'] = post_pre_data['summary'].map(
    lambda headline: 'sostok ' + headline + ' eostok'
)

In [18]:
from sklearn.model_selection import train_test_split
# Shuffle with a fixed seed and hold out 10% of the pairs for validation.
x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(post_pre_data['text']),
    np.array(post_pre_data['summary']),
    shuffle=True,
    random_state=0,
    test_size=0.1,
)

In [19]:
from keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# First, uncapped fit of the article (source-side) tokenizer on the training split;
# its word counts feed the rare-word statistics computed next.
x_tknizer = Tokenizer()
x_tknizer.fit_on_texts([article for article in x_tr])

In [20]:
thresh = 4

# Occurrence count of every word the article tokenizer has seen.
word_freqs = list(x_tknizer.word_counts.values())
rare_freqs = [freq for freq in word_freqs if freq < thresh]

total_count = len(word_freqs)        # number of distinct words
count = len(rare_freqs)              # distinct words seen fewer than `thresh` times
total_frequency = sum(word_freqs)    # total number of word occurrences
frequency = sum(rare_freqs)          # occurrences that belong to rare words

print("% of rare words in vocabulary:",(count/total_count)*100)
print("Total Coverage of rare words:",(frequency/total_frequency)*100)

% of rare words in vocabulary: 60.50722180782532
Total Coverage of rare words: 3.988783798296679


In [21]:
# Re-fit the article tokenizer with num_words set to the number of non-rare words.
x_tknizer = Tokenizer(num_words=total_count - count)
x_tknizer.fit_on_texts(list(x_tr))

# Turn every article into a list of integer word ids ...
x_tr_seq, x_val_seq = (
    x_tknizer.texts_to_sequences(split) for split in (x_tr, x_val)
)

# ... and pad each list with trailing zeros up to max_text_len ids.
x_tr, x_val = (
    pad_sequences(seq, maxlen=max_text_len, padding='post')
    for seq in (x_tr_seq, x_val_seq)
)

# Vocabulary size (+1 for the padding id).
x_voc = x_tknizer.num_words + 1

print("Size of vocabulary in X = {}".format(x_voc))

Size of vocabulary in X = 11649


In [22]:
# First, uncapped fit of the headline (target-side) tokenizer on the training split;
# its word counts feed the rare-word statistics computed next.
y_tknizer = Tokenizer()
y_tknizer.fit_on_texts([headline for headline in y_tr])

In [23]:
thresh = 6

# Occurrence count of every word the headline tokenizer has seen.
word_freqs = list(y_tknizer.word_counts.values())
rare_freqs = [freq for freq in word_freqs if freq < thresh]

total_count = len(word_freqs)        # number of distinct words
count = len(rare_freqs)              # distinct words seen fewer than `thresh` times
total_frequency = sum(word_freqs)    # total number of word occurrences
frequency = sum(rare_freqs)          # occurrences that belong to rare words

print("% of rare words in vocabulary:",(count/total_count)*100)
print("Total Coverage of rare words:",(frequency/total_frequency)*100)

% of rare words in vocabulary: 89.01018922852984
Total Coverage of rare words: 29.010311470181783


In [24]:
# Re-fit the headline tokenizer with num_words set to the number of non-rare words.
y_tknizer = Tokenizer(num_words=total_count - count)
y_tknizer.fit_on_texts(list(y_tr))

# Turn every headline into a list of integer word ids ...
y_tr_seq, y_val_seq = (
    y_tknizer.texts_to_sequences(split) for split in (y_tr, y_val)
)

# ... and pad each list with trailing zeros up to max_summary_len ids.
# NOTE(review): pad_sequences truncates from the front unless told otherwise, so a
# headline with more than max_summary_len ids would lose its leading 'sostok' -
# TODO confirm no encoded headline exceeds the cap.
y_tr, y_val = (
    pad_sequences(seq, maxlen=max_summary_len, padding='post')
    for seq in (y_tr_seq, y_val_seq)
)

# Vocabulary size (+1 for the padding id).
y_voc = y_tknizer.num_words + 1
print("Size of vocabulary in Y = {}".format(y_voc))

Size of vocabulary in Y = 605


In [25]:
# Training rows whose headline has exactly two non-padding ids (presumably just
# 'sostok' and 'eostok', i.e. nothing else survived the vocabulary cap).
ind = [row for row in range(len(y_tr)) if np.count_nonzero(y_tr[row]) == 2]

# Remove those rows from targets and inputs alike so the two stay aligned.
y_tr = np.delete(y_tr, ind, axis=0)
x_tr = np.delete(x_tr, ind, axis=0)

In [26]:
# Validation rows whose headline has exactly two non-padding ids (presumably just
# 'sostok' and 'eostok', i.e. nothing else survived the vocabulary cap).
ind = [row for row in range(len(y_val)) if np.count_nonzero(y_val[row]) == 2]

# Remove those rows from targets and inputs alike so the two stay aligned.
y_val = np.delete(y_val, ind, axis=0)
x_val = np.delete(x_val, ind, axis=0)

In [27]:
from keras import backend as K 
import gensim
from numpy import *
import numpy as np
import pandas as pd 
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

# Build the training graph: a three-layer LSTM encoder over the padded article ids and
# a single-layer LSTM decoder over the headline ids, joined through the encoder's
# final states.
K.clear_session()

# Size of every LSTM's hidden/cell state and of both embedding spaces.
latent_dim = 200
embedding_dim=100

# Encoder: fixed-length input of max_text_len word ids
encoder_inputs = Input(shape=(max_text_len,))

# article embedding layer (x_voc ids -> embedding_dim vectors)
enc_emb =  Embedding(x_voc, embedding_dim,trainable=True)(encoder_inputs)

# encoder lstm 1 (its returned states state_h1/state_c1 are not used in this cell)
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# encoder lstm 2 (its returned states state_h2/state_c2 are not used in this cell)
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# encoder lstm 3: its final states (state_h, state_c) initialise the decoder below
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Set up the decoder, using the last encoder layer's (state_h, state_c) as initial state.
# The time dimension is left open (None).
decoder_inputs = Input(shape=(None,))

# headline embedding layer; kept in a variable so it can be called again on other inputs
dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# NOTE(review): despite their names, decoder_fwd_state / decoder_back_state are the two
# extra values this single (non-bidirectional) LSTM returns because return_state=True -
# presumably its hidden and cell state; they are not used further in this cell.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# dense layer - softmax over the y_voc headline ids, applied at every time step
decoder_dense =  TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model: (article ids, headline ids) -> per-step distribution over headline ids
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1700)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1700, 100)    1164900     ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 1700, 200),  240800      ['embedding[0][0]']              
                                 (None, 200),                                                     
                                 (None, 200)]                                                     
                                                                                              

In [28]:
# Targets are integer word ids, hence the sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
)

In [29]:
# Early-stopping callback watching the validation loss (lower is better) with a patience of 2.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    verbose=1,
)

In [30]:
# The decoder input is each headline without its last position; the target is the same
# headline shifted one step to the left, with a trailing axis of size 1 added.
history = model.fit(
    [x_tr, y_tr[:, :-1]],
    y_tr[:, 1:, np.newaxis],
    validation_data=(
        [x_val, y_val[:, :-1]],
        y_val[:, 1:, np.newaxis],
    ),
    batch_size=50,
    epochs=30,
    callbacks=[es],
)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 18: early stopping


In [86]:
# Lookup tables taken from the fitted tokenizers:
#   word -> id for headlines, and id -> word for headlines and for articles.
target_word_index = y_tknizer.word_index
reverse_target_word_index = y_tknizer.index_word
reverse_source_word_index = x_tknizer.index_word

'में'

In [33]:
# Inference-time models, built from the layers trained above.
# Encoder model: article ids -> (encoder output sequence, final state_h, final state_c)
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Placeholders through which the decoder states are fed back in at every decoding step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# NOTE(review): this input is listed among decoder_model's inputs below, but no layer in
# this cell consumes it - confirm whether it is a leftover (e.g. from an attention variant).
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))

# Re-use the trained headline embedding on the decoder input
dec_emb2= dec_emb_layer(decoder_inputs) 

# initial states from the previous time step
# NOTE: this rebinds state_h2 / state_c2, which the model-definition cell used for the
# second encoder LSTM's states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# softmax for probability
decoder_outputs2 = decoder_dense(decoder_outputs2) 

# Decoder model: (token ids, encoder outputs, h, c) -> (token distribution, new h, new c)
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [None]:
def sequence_to_summary(input_seq):
    """Turn a sequence of headline word ids back into text.

    Padding (0) and the ids of 'sostok' / 'eostok' are skipped; every remaining id is
    looked up in ``reverse_target_word_index``. Each word is followed by one space, so
    a non-empty result ends with a trailing space.
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0:
            continue
        if token_id == target_word_index['sostok'] or token_id == target_word_index['eostok']:
            continue
        pieces.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(pieces)

def sequence_to_text(input_seq):
    """Turn a sequence of article word ids back into text.

    Padding (0) is skipped; every other id is looked up in
    ``reverse_source_word_index``. Each word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [88]:
def decode_sequence(input_seq):
    """Greedily decode a headline for one encoded article.

    Parameters
    ----------
    input_seq : array-like
        Padded article ids in the form expected by ``encoder_model.predict``
        (the notebook passes an array reshaped to ``(1, max_text_len)``).

    Returns
    -------
    str
        The generated words, each preceded by a single space (so a non-empty result
        starts with a space); ``''`` when decoding stops before any word is produced.

    Decoding stops when 'eostok' is produced, when ``max_summary_len - 1`` words have
    been generated, or when the most likely id has no vocabulary entry. The last case
    covers the padding id 0: it is a valid softmax class but not a word, and looking it
    up used to raise ``KeyError``. Because the targets are padded at the end, it is
    treated as the end of the summary.
    """
    # Encode the article once; the encoder's final states seed the decoder.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Length-1 target sequence holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: most probable id at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :], axis=0))
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        # End of summary: explicit end token, or an id without a word (e.g. padding 0).
        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_words.append(sampled_token)

        # Length cap reached.
        if len(decoded_words) >= (max_summary_len - 1):
            break

        # Feed the chosen id and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

In [89]:
# Show article, reference headline and generated headline for training rows 20-24.
# The replace() calls remove the substrings 'start' and 'end' - presumably what is left
# of the _START_/_END_ markers after tokenisation.
for row in range(20, 25):
    article_ids = x_tr[row]
    print("Review:", sequence_to_text(article_ids))

    reference = sequence_to_summary(y_tr[row])
    print("Original summary:", reference.replace('start', '').replace('end', ''))

    predicted = decode_sequence(article_ids.reshape(1, max_text_len))
    print("Predicted summary:", predicted.replace('start', '').replace('end', ''))

    print("\n")

Review: 70 नप की बजट बैठक में नहीं आए पार्षद साेनकच्छ में अब 70 रुपए देना होगा जलकर सेवा कर भी बढ़ाया साेनकच्छ घंटे पहले कॉपी लिंक सोनकच्छ नगर परिषद का बजट बैठक साधारण सम्मेलन संपन्न हुआ। अध्यक्ष श्रुति बघेल ने 66 हजार रुपए की बचत का बजट पारित किया है। बजट में आम आदमी की जेब पर थोड़ा फर्क जरूर पड़ेगा। नप ने जलकर को बढ़ाते हुए आवासीय कर 70 रुपए व्यावसायिक कर 120 रुपए किया है। इसके अलावा सेवा कर 30 रुपए मासिक दुकानदारों से 60 रुपए मासिक लिए जाने का फैसला लिया है। बैठक में किसी प्रकार की विवाद की स्थिति नही हुई लेकिन पार्षद के साथ कई पार्षद प्रतिनिधि बैठक में हस्तक्षेप करते हुए देखे गए। इसके अलावा पार्षद की अनुपस्थिति में उनके पति भी बैठे हुए थे। इस नगर के बजट में सुनीता परमार यादव बी प्रिया अग्रवाल महेश यादव थे। पार्षद शब्बीर अंत में आए। बैठक में कई पार्षदों के प्रतिनिधि या फिर बाहरी लोग पर बैठे रहे। श्रुति बघेल ने बताया कि मेले के लिए बजट बढ़ाया है। रामलीला करने का प्रस्ताव भी इसमें शामिल है। पशु पालन विभाग की जमीन जो कि वर्तमान स्थिति में खाली पड़ी है उसका नप को करने को लेकर प्रस्ताव पारित क