In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data preprocessing

In [2]:
import requests, zipfile, io, random

# Download and extract dataset
url = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

# Read lines file
lines = open("cornell movie-dialogs corpus/movie_lines.txt", "r", encoding="utf-8", errors="ignore").read().splitlines()

# Take 1/100th subset randomly
subset = random.sample(lines, len(lines)//100)

# Combine into a single text string
text = " ".join(subset)

print(f"Length of text: {len(text):,} characters")
print(text[:500])


Length of text: 349,739 characters
L424511 +++$+++ u2389 +++$+++ m152 +++$+++ WESLEY +++$+++ You lie, motherfucker... L108175 +++$+++ u613 +++$+++ m39 +++$+++ BOURNE +++$+++ What's going on in Berlin? L419260 +++$+++ u2312 +++$+++ m149 +++$+++ LEON +++$+++ I begin to believe in it myself since I've met you. I still don't know what to make of it. It confuses me, it frightens me a little, but it fascinates me, Ninotchka. L654718 +++$+++ u8917 +++$+++ m606 +++$+++ SANDRA +++$+++ Is that Sam Lombardo? L422786 +++$+++ u6896 +++$+++ m4


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

2025-10-20 12:51:05.957088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760964666.134031      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760964666.193598      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
tokenizer=Tokenizer()

In [5]:
tokenizer.fit_on_texts([text])

In [6]:
tokenizer.word_index

{'you': 1,
 'i': 2,
 'the': 3,
 'to': 4,
 'a': 5,
 'it': 6,
 'and': 7,
 'of': 8,
 'that': 9,
 'in': 10,
 'is': 11,
 'what': 12,
 'me': 13,
 'this': 14,
 "i'm": 15,
 "don't": 16,
 'for': 17,
 'know': 18,
 'we': 19,
 'my': 20,
 'your': 21,
 'have': 22,
 'he': 23,
 'on': 24,
 'with': 25,
 'not': 26,
 'do': 27,
 "it's": 28,
 'no': 29,
 'are': 30,
 'was': 31,
 'be': 32,
 'but': 33,
 'like': 34,
 'all': 35,
 "you're": 36,
 'just': 37,
 'get': 38,
 'if': 39,
 'about': 40,
 'they': 41,
 'at': 42,
 'out': 43,
 'one': 44,
 'her': 45,
 'up': 46,
 'so': 47,
 'want': 48,
 'think': 49,
 'here': 50,
 'can': 51,
 'got': 52,
 "that's": 53,
 'him': 54,
 'there': 55,
 'how': 56,
 'well': 57,
 'right': 58,
 'she': 59,
 'now': 60,
 'going': 61,
 'will': 62,
 'tell': 63,
 'did': 64,
 "i'll": 65,
 'go': 66,
 "he's": 67,
 'then': 68,
 'u': 69,
 'never': 70,
 "can't": 71,
 'or': 72,
 'why': 73,
 'yes': 74,
 'see': 75,
 'come': 76,
 'oh': 77,
 'man': 78,
 'were': 79,
 'as': 80,
 'when': 81,
 'us': 82,
 "i've": 

In [7]:
last_index=len(tokenizer.word_index)
last_index

11453

In [8]:
input_sequences=[]

In [9]:
for sentence in text.split("\n"):
    tokenized_sentence=tokenizer.texts_to_sequences([sentence])[0]
    for i in range (1,len(tokenized_sentence)):
        n_gram=tokenized_sentence[:i+1]
        input_sequences.append(n_gram)

In [10]:
input_sequences[:5]

[[3765, 3766],
 [3765, 3766, 320],
 [3765, 3766, 320, 3767],
 [3765, 3766, 320, 3767, 1],
 [3765, 3766, 320, 3767, 1, 1275]]

In [11]:
max_len=max([len(x) for x in input_sequences])
max_len=min(180,max_len)
max_len

180

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
padded_input_sequences= pad_sequences(input_sequences,maxlen=max_len,padding='pre')

In [14]:
padded_input_sequences[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [15]:
X=padded_input_sequences[:,:-1]
X

array([[    0,     0,     0, ...,     0,     0,  3765],
       [    0,     0,     0, ...,     0,  3765,  3766],
       [    0,     0,     0, ...,  3765,  3766,   320],
       ...,
       [   29, 11419, 11420, ...,     3, 11452, 11453],
       [11419, 11420,  1610, ..., 11452, 11453,  3734],
       [11420,  1610, 11421, ..., 11453,  3734,    21]], dtype=int32)

In [16]:
y=padded_input_sequences[:,-1]
y

array([3766,  320, 3767, ..., 3734,   21,  171], dtype=int32)

In [17]:
from tensorflow.keras.utils import to_categorical
to_categorical(y,num_classes=len(tokenizer.word_index)+1)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# model architecture and training

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np

In [19]:
model = Sequential([
    Embedding(last_index+1, 256, input_shape=(max_len,)),
    LSTM(256, dropout=0.3),
    Dense(last_index+1, activation='softmax')
])

  super().__init__(**kwargs)
I0000 00:00:1760964692.096996      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [20]:
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate=0.005, clipnorm=1.0)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [21]:
model.summary()

In [22]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True
)

In [23]:
history = model.fit(
    X, y,
    validation_split=0.1,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop, checkpoint]
)

Epoch 1/100


I0000 00:00:1760964701.881867      59 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.0279 - loss: 8.0159 - val_accuracy: 0.0562 - val_loss: 7.5694
Epoch 2/100
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.0759 - loss: 6.5547 - val_accuracy: 0.0932 - val_loss: 7.4079
Epoch 3/100
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.1501 - loss: 5.1463 - val_accuracy: 0.1045 - val_loss: 7.6440
Epoch 4/100
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.2825 - loss: 3.8485 - val_accuracy: 0.1007 - val_loss: 8.1928
Epoch 5/100
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.3901 - loss: 2.9838 - val_accuracy: 0.0941 - val_loss: 8.6091
Epoch 6/100
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.4521 - loss: 2.5290 - val_accuracy: 0.0801 - val_loss: 9.0383
Epoch 7/100
[1m636/63

# prediction

In [24]:
text="waiting"
# tokenize
tokenized_text=tokenizer.texts_to_sequences([text])[0]
# padding
padded_text=pad_sequences([tokenized_text],maxlen=max_len,padding='pre')

In [25]:
# predict
position=np.argmax(model.predict(padded_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step


In [26]:
for word,index in tokenizer.word_index.items():
    if index==position:
        print(word)

and


In [27]:
num_words=10

In [28]:
# prediction multiple words
text="i am"
for i in range(num_words):
    # tokenize
    tokenized_text=tokenizer.texts_to_sequences([text])[0]
    # padding
    padded_text=pad_sequences([tokenized_text],maxlen=max_len,padding='pre')
    # predict
    position=np.argmax(model.predict(padded_text))
    for word,index in tokenizer.word_index.items():
        if index==position:
            text=text+" "+word
            print(word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
little
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
don't
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
know
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
what
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
don't
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
know


In [29]:
text

"i am a little and i don't know what i don't know"

now we can see the issue with n gram also