## Problem Statement: Shayari Generator using LSTM
### project overview

- Title :- **"Automated Shayari Generation using LSTM-based Language Modeling"**

In [46]:
### Data Preprocessing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load The Dataset

with open('hindi_shayari.txt', 'r' , encoding='utf-8') as file:
    text = file.read().lower()

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1
total_words


3440

In [47]:
tokenizer.word_index

{'है': 1,
 'में': 2,
 'से': 3,
 'की': 4,
 'तो': 5,
 'के': 6,
 'भी': 7,
 'हैं': 8,
 'नहीं': 9,
 'हो': 10,
 'का': 11,
 'को': 12,
 'न': 13,
 'दिल': 14,
 'कर': 15,
 'वो': 16,
 'ये': 17,
 'हम': 18,
 'ही': 19,
 'एक': 20,
 'क्या': 21,
 'पर': 22,
 'हूँ': 23,
 'और': 24,
 'जो': 25,
 'मैं': 26,
 'तुम': 27,
 'तेरी': 28,
 'कुछ': 29,
 'कि': 30,
 'कोई': 31,
 'मेरे': 32,
 'मोहब्बत': 33,
 'अब': 34,
 'किसी': 35,
 'याद': 36,
 'मेरी': 37,
 'ना': 38,
 'मुझे': 39,
 'कभी': 40,
 'ने': 41,
 'था': 42,
 'ए': 43,
 'हर': 44,
 'फिर': 45,
 'तेरे': 46,
 'तू': 47,
 'पे': 48,
 'गया': 49,
 'बहुत': 50,
 'प्यार': 51,
 'करते': 52,
 'इस': 53,
 'थे': 54,
 'तक': 55,
 'जब': 56,
 'मेरा': 57,
 'लिए': 58,
 'दर्द': 59,
 'बात': 60,
 'थी': 61,
 'जाने': 62,
 'अपने': 63,
 'है।': 64,
 'दिया': 65,
 'दे': 66,
 'बस': 67,
 'आ': 68,
 'मगर': 69,
 'होता': 70,
 'गए': 71,
 'जाते': 72,
 'आँखों': 73,
 'आज': 74,
 'अपनी': 75,
 'रहा': 76,
 'होती': 77,
 'लोग': 78,
 'उस': 79,
 'दुनिया': 80,
 'हमें': 81,
 'खुद': 82,
 'गई': 83,
 'आता': 84,
 '।': 85,
 'र

In [49]:
### Creating The Input Sequence

input_seq = []

for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_seq = token_list[:i+1]
        input_seq.append(n_gram_seq)

In [50]:
input_seq

[[496, 166],
 [496, 166, 120],
 [496, 166, 120, 971],
 [496, 166, 120, 971, 72],
 [496, 166, 120, 971, 72, 1],
 [93, 972],
 [93, 972, 120],
 [93, 972, 120, 497],
 [93, 972, 120, 497, 121],
 [93, 972, 120, 497, 121, 72],
 [93, 972, 120, 497, 121, 72, 1],
 [235, 973],
 [235, 973, 1496],
 [235, 973, 1496, 15],
 [235, 973, 1496, 15, 290],
 [235, 973, 1496, 15, 290, 1497],
 [13, 1498],
 [13, 1498, 375],
 [13, 1498, 375, 974],
 [13, 1498, 375, 974, 498],
 [13, 1498, 375, 974, 498, 72],
 [13, 1498, 375, 974, 498, 72, 8],
 [20, 142],
 [20, 142, 199],
 [20, 142, 199, 1],
 [20, 142, 199, 1, 167],
 [20, 142, 199, 1, 167, 36],
 [20, 142, 199, 1, 167, 36, 597],
 [20, 142],
 [20, 142, 143],
 [20, 142, 143, 1],
 [20, 142, 143, 1, 167],
 [20, 142, 143, 1, 167, 36],
 [20, 142, 143, 1, 167, 36, 728],
 [22, 39],
 [22, 39, 5],
 [22, 39, 5, 79],
 [22, 39, 5, 79, 142],
 [22, 39, 5, 79, 142, 11],
 [22, 39, 5, 79, 142, 11, 975],
 [22, 39, 5, 79, 142, 11, 975, 1],
 [25, 138],
 [25, 138, 329],
 [25, 138, 329, 1

In [51]:
# maximum sent in a line
max_in_len = max([len(x) for x in input_seq])
max_in_len

23

In [52]:
input_seq = np.array(pad_sequences(input_seq, maxlen=max_in_len, padding='pre'))
input_seq

array([[   0,    0,    0, ...,    0,  496,  166],
       [   0,    0,    0, ...,  496,  166,  120],
       [   0,    0,    0, ...,  166,  120,  971],
       ...,
       [   0,    0,    0, ..., 1495,    2,  371],
       [   0,    0,    0, ...,    2,  371,   72],
       [   0,    0,    0, ...,  371,   72,  113]])

In [53]:
# Create Prediction and Lable
import tensorflow as tf
X,Y = input_seq[:,:-1], input_seq[:, -1]

In [54]:
X

array([[   0,    0,    0, ...,    0,    0,  496],
       [   0,    0,    0, ...,    0,  496,  166],
       [   0,    0,    0, ...,  496,  166,  120],
       ...,
       [   0,    0,    0, ...,    4, 1495,    2],
       [   0,    0,    0, ..., 1495,    2,  371],
       [   0,    0,    0, ...,    2,  371,   72]])

In [55]:
Y

array([166, 120, 971, ..., 371,  72, 113])

In [56]:
# to convert all the 0 only 1

Y = tf.keras.utils.to_categorical(Y,num_classes = total_words)
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [57]:
## Spplit the data training and taisting
x_train, x_test , y_train, y_test = train_test_split(X,Y, test_size = 0.2)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(total_words, 100, input_length = max_in_len-1))
model.add(LSTM(150))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

# ## compile the  model 
model.compile(loss="categorical_crossentropy", optimizer="adam",metrics=["accuracy"])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 22, 100)           344000    
                                                                 
 lstm_2 (LSTM)               (None, 150)               150600    
                                                                 
 dropout_2 (Dropout)         (None, 150)               0         
                                                                 
 dense_2 (Dense)             (None, 3440)              519440    
                                                                 
Total params: 1014040 (3.87 MB)
Trainable params: 1014040 (3.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [59]:
##  Train The Model
History = model.fit(x_train, y_train, epochs=5, validation_data=(x_train,y_train), verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
## Function to Predict to Next Word

def predict_next_word(model, tokenizer,text,max_in_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_in_len:
        token_list = token_list[-(max_in_len)-1:]
    token_list = pad_sequences([token_list], maxlen=max_in_len-1, padding="pre")
    predicted = model.predict(token_list, verbose=0)    
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [63]:
input_Text = "न खुद रहो उदास, न दूसरों को"
print(f"Input Text: {input_Text}")
max_in_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_Text, max_in_len)
print(f"Next Word Prediction: {next_word}")

Input Text: न खुद रहो उदास, न दूसरों को
Next Word Prediction: लिए
