In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer  # Changed import to use Keras from TensorFlow
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Corrected import for pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [5]:
#read dataset
with open('urdu Dataset.txt','r',encoding='utf-8') as file:
    text=file.read()

In [6]:
#Tokenizer process
tokenizer=Tokenizer()
#fit
tokenizer.fit_on_texts([text])
#assign length of word index
total_words=len(tokenizer.word_index)+1

In [7]:
tokenizer.word_index

{'اور': 1,
 'کے': 2,
 'کی': 3,
 'میں': 4,
 'انٹرنیٹ': 5,
 'کو': 6,
 'ہے۔': 7,
 'نوجوانوں': 8,
 'پر': 9,
 'ہیں۔': 10,
 'صحت': 11,
 'نے': 12,
 'سے': 13,
 'ذہنی': 14,
 'کا': 15,
 'ایک': 16,
 'اس': 17,
 'بھی': 18,
 'کر': 19,
 'زندگی': 20,
 'نوجوان': 21,
 'ان': 22,
 'جسمانی': 23,
 'اثرات': 24,
 'منفی': 25,
 'یہ': 26,
 'کہ': 27,
 'لیے': 28,
 'ہو': 29,
 'ہے': 30,
 'معلومات': 31,
 'ہیں': 32,
 'دیا': 33,
 'استعمال': 34,
 'انہیں': 35,
 'سرگرمیوں': 36,
 'نئی': 37,
 'نسل': 38,
 'دنیا': 39,
 'صرف': 40,
 'آن': 41,
 'لائن': 42,
 'غلط': 43,
 'دور': 44,
 'حصہ': 45,
 'فراہم': 46,
 'بہت': 47,
 'پیدا': 48,
 'مثبت': 49,
 'خیالی': 50,
 'کرنا': 51,
 'علم': 52,
 'بنا': 53,
 'سکتا': 54,
 'زیادہ': 55,
 'ویب': 56,
 'سائٹس': 57,
 'بارے': 58,
 'دیتی': 59,
 'نقصان': 60,
 'چاہیے': 61,
 'آج': 62,
 'جہاں': 63,
 'بے': 64,
 'ہیں،': 65,
 'وہیں': 66,
 'مشکلات': 67,
 'طور': 68,
 'مرتب': 69,
 'پہلوؤں': 70,
 'اپنا': 71,
 'حقیقت': 72,
 'پہلو': 73,
 'حاصل': 74,
 'تخلیقی': 75,
 'کیا': 76,
 'مختلف': 77,
 'متاثر': 78,
 'سماجی': 7

In [8]:
tokenizer.word_index

{'اور': 1,
 'کے': 2,
 'کی': 3,
 'میں': 4,
 'انٹرنیٹ': 5,
 'کو': 6,
 'ہے۔': 7,
 'نوجوانوں': 8,
 'پر': 9,
 'ہیں۔': 10,
 'صحت': 11,
 'نے': 12,
 'سے': 13,
 'ذہنی': 14,
 'کا': 15,
 'ایک': 16,
 'اس': 17,
 'بھی': 18,
 'کر': 19,
 'زندگی': 20,
 'نوجوان': 21,
 'ان': 22,
 'جسمانی': 23,
 'اثرات': 24,
 'منفی': 25,
 'یہ': 26,
 'کہ': 27,
 'لیے': 28,
 'ہو': 29,
 'ہے': 30,
 'معلومات': 31,
 'ہیں': 32,
 'دیا': 33,
 'استعمال': 34,
 'انہیں': 35,
 'سرگرمیوں': 36,
 'نئی': 37,
 'نسل': 38,
 'دنیا': 39,
 'صرف': 40,
 'آن': 41,
 'لائن': 42,
 'غلط': 43,
 'دور': 44,
 'حصہ': 45,
 'فراہم': 46,
 'بہت': 47,
 'پیدا': 48,
 'مثبت': 49,
 'خیالی': 50,
 'کرنا': 51,
 'علم': 52,
 'بنا': 53,
 'سکتا': 54,
 'زیادہ': 55,
 'ویب': 56,
 'سائٹس': 57,
 'بارے': 58,
 'دیتی': 59,
 'نقصان': 60,
 'چاہیے': 61,
 'آج': 62,
 'جہاں': 63,
 'بے': 64,
 'ہیں،': 65,
 'وہیں': 66,
 'مشکلات': 67,
 'طور': 68,
 'مرتب': 69,
 'پہلوؤں': 70,
 'اپنا': 71,
 'حقیقت': 72,
 'پہلو': 73,
 'حاصل': 74,
 'تخلیقی': 75,
 'کیا': 76,
 'مختلف': 77,
 'متاثر': 78,
 'سماجی': 7

In [9]:
#n-gram model
#declare n-grams
input_sequences=[]
#splitting from \n
for line in text.split('\n'):
    #getting tokens
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
       n_gram_sequence = token_list[:i+1]
       input_sequences.append(n_gram_sequence)

In [10]:
input_sequences

[[5, 1],
 [5, 1, 37],
 [5, 1, 37, 38],
 [5, 1, 37, 38, 3],
 [5, 1, 37, 38, 3, 14],
 [5, 1, 37, 38, 3, 14, 11],
 [5, 1, 37, 38, 3, 14, 11, 9],
 [5, 1, 37, 38, 3, 14, 11, 9, 24],
 [62, 2],
 [62, 2, 44],
 [62, 2, 44, 4],
 [62, 2, 44, 4, 5],
 [62, 2, 44, 4, 5, 20],
 [62, 2, 44, 4, 5, 20, 15],
 [62, 2, 44, 4, 5, 20, 15, 16],
 [62, 2, 44, 4, 5, 20, 15, 16, 99],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7, 17],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7, 17, 2],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7, 17, 2, 102],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7, 17, 2, 102, 20],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7, 17, 2, 102, 20, 15],
 [62, 2, 44, 4, 5, 20, 15, 16, 99, 45, 100, 101, 7, 17, 2, 102, 20, 15, 103],
 [62,
  2,
  44,
  4,
  5,
  20,
  15,
  16,
  99,
 

In [11]:
#checking token sentence
sentence_token = input_sequences[10]
sentence=[]
for token in sentence_token:
    sentence.append(list((tokenizer.word_index).keys()) [list((tokenizer.word_index).values()).index(token)])
print(sentence)

['آج', 'کے', 'دور', 'میں']


In [12]:
#padding
#maximum sentence length'
max_sequence_len = max([len(seq) for  seq in input_sequences])
#input sequences
input_sequences= np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))

In [13]:
input_sequences

array([[  0,   0,   0, ...,   0,   5,   1],
       [  0,   0,   0, ...,   5,   1,  37],
       [  0,   0,   0, ...,   1,  37,  38],
       ...,
       [  0,   0,   0, ...,   4, 282, 283],
       [  0,   0,   0, ..., 282, 283,  54],
       [  0,   0,   0, ..., 283,  54,   7]])

In [14]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [15]:
X.shape

(717, 100)

In [16]:
y.shape

(717,)

In [17]:
y=np.array(tf.keras.utils.to_categorical(y,num_classes=total_words))

In [18]:
#create mode;
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words,activation='softmax'))

from tensorflow.keras import backend as K
K.clear_session()






In [19]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [20]:
#fit the model
model.fit(X,y,epochs=100,verbose=1)

Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.0312 - loss: 5.6189
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.0371 - loss: 5.1703
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.0521 - loss: 5.0686
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.0462 - loss: 5.0181
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.0424 - loss: 4.9992
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.0422 - loss: 4.9528
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.0612 - loss: 4.8600
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.0646 - loss: 4.8115
Epoch 9/100
[1m23/23[0m [32m━━━━━━━━━

KeyboardInterrupt: 

In [22]:
#determining the text
import time
seed_text = "انٹرنیٹ کے زیادہ "
next_words = 10

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
    print(seed_text)
    time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
انٹرنیٹ کے زیادہ  استعمال
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
انٹرنیٹ کے زیادہ  استعمال نے
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
انٹرنیٹ کے زیادہ  استعمال نے نوجوانوں
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
انٹرنیٹ کے زیادہ  استعمال نے نوجوانوں کی
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
انٹرنیٹ کے زیادہ  استعمال نے نوجوانوں کی ذہنی
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
انٹرنیٹ کے زیادہ  استعمال نے نوجوانوں کی ذہنی صحت
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
انٹرنیٹ کے زیادہ  استعمال نے نوجوانوں کی ذہنی صحت کو
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
انٹرنیٹ کے زیادہ  استعمال نے نوجوانوں کی ذہنی صحت کو بھی
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
انٹرنیٹ کے ز