In [30]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense ,Dropout
import numpy as np
import pandas as pd
import random
import sys

In [31]:
# Load the training articles and flatten every non-null article body into one
# lower-cased string, with single spaces between consecutive articles.
df = pd.read_csv('/content/train.csv')
text = df['text'].dropna().astype(str).str.cat(sep=" ").lower()

print(f'Total characters in text: {len(text)}')

Total characters in text: 35695884


In [32]:
# Preview the first rows to check which columns the CSV provides.
df.head(n=5)

Unnamed: 0,title,text,subject,date
0,Greens say no support for Macron's EZ budget i...,BERLIN (Reuters) - None of the German parties ...,worldnews,"October 25, 2017"
1,Trump faces uphill battle to overcome court's ...,(Reuters) - U.S. President Donald Trump faces ...,politicsNews,"February 6, 2017"
2,Ukraine president denies hampering anti-corrup...,VILNIUS/KIEV (Reuters) - Ukrainian President P...,worldnews,"December 8, 2017"
3,U.S. defense chief: White House shakeup will n...,BRUSSELS (Reuters) - U.S. Defense Secretary Ji...,politicsNews,"February 14, 2017"
4,Irish government set to fall weeks before Brex...,DUBLIN (Reuters) - Ireland s minority governme...,worldnews,"November 24, 2017"


In [33]:
# The vocabulary is the sorted set of distinct characters found in the corpus.
vocab = sorted(set(text))
print(f'Vocabulary size: {len(vocab)}')

# Two-way lookup: character -> integer id (dict) and id -> character (array,
# so an id can be used directly as an index).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as one array of integer ids.
encoded_chars = [char2idx[ch] for ch in text]
text_as_int = np.array(encoded_chars)

Vocabulary size: 104


In [34]:
# Number of input characters per training example.
seq_length = 100

# Stream of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Group the stream into chunks of seq_length + 1 ids: the extra id lets each
# chunk be split into an input and a target shifted by one position. A final
# chunk that is too short is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    inputs = chunk[:-1]
    targets = chunk[1:]
    return inputs, targets

# Turn every chunk into an (input, target) pair shifted by one character.
dataset = sequences.map(split_input_target)

BATCH_SIZE = 64
# Size of the shuffle buffer: elements are drawn at random from a rolling
# window of this many sequences rather than from the whole dataset.
BUFFER_SIZE = 10000

# Shuffle, form fixed-size batches (dropping a short final batch), and
# prefetch so the next batch is prepared while the current one is being
# trained on. Prefetching does not change which elements are produced.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

In [37]:
vocab_size = len(vocab)
embedding_dim = 64
rnn_units = 128

# Character-level language model: ids -> embeddings -> two stacked LSTMs ->
# dropout -> one logit per vocabulary entry at every time step.
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` layer argument, which Keras 3 deprecates with a warning.
# shape=(None,) leaves the sequence length variable. Sequential does not list
# the input entry in `model.layers`, so layer indices are unchanged.
model = Sequential([
    tf.keras.Input(shape=(None,)),
    Embedding(vocab_size, embedding_dim),
    LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform'),
    LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform'),
    Dropout(0.3),
    Dense(vocab_size)  # raw logits; the loss is expected to apply the softmax
])



In [39]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` tells Keras the predictions are unnormalised scores
    rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)

In [41]:

# Configure training: Adam with an initial learning rate of 3e-4, the
# logits-based loss defined in this notebook, and accuracy as the metric.
model.compile(
    loss=loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    metrics=["accuracy"],
)

# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [51]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# NOTE(review): all three callbacks monitor *training* metrics ('accuracy' /
# 'loss'); the fit call in this notebook passes no validation data. Confirm
# that this is intended before relying on "best" checkpoints or early stopping.

# Save the model whenever training accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Halve the learning rate after 5 epochs without an accuracy improvement,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='accuracy',
    patience=5,
    factor=0.5,
    min_lr=1e-6,
    verbose=1,
)

# Stop after 10 epochs without a loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)


In [52]:
# Upper bound on training epochs; early stopping may end training sooner.
EPOCHS = 30

# Train on the batched dataset and keep the per-epoch metric history.
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint, early_stopping, reduce_lr],
)

Epoch 1/30
[1m5520/5522[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.5444 - loss: 1.5443
Epoch 1: accuracy improved from -inf to 0.54923, saving model to best_model.keras
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 24ms/step - accuracy: 0.5444 - loss: 1.5443 - learning_rate: 3.0000e-04
Epoch 2/30
[1m5520/5522[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.5596 - loss: 1.4873
Epoch 2: accuracy improved from 0.54923 to 0.56267, saving model to best_model.keras
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 22ms/step - accuracy: 0.5596 - loss: 1.4873 - learning_rate: 3.0000e-04
Epoch 3/30
[1m5520/5522[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.5691 - loss: 1.4504
Epoch 3: accuracy improved from 0.56267 to 0.57139, saving model to best_model.keras
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 22ms/step - accuracy: 0.5

In [55]:
def generate_text(model, start_string, num_generate=2, temperature=1.0, max_context=100):
    """Sample characters from the model, continuing ``start_string``.

    The model in this notebook is built without ``stateful=True``, so each
    forward pass starts from a fresh LSTM state. The running context (seed
    plus everything sampled so far, capped at ``max_context`` ids) is therefore
    re-fed on every step. Feeding only the most recently sampled character, as
    the previous version did, makes every prediction after the first depend on
    a single character.

    Args:
        model: Non-stateful model mapping a (1, time) batch of character ids
            to logits of shape (1, time, vocab).
        start_string: Seed text. It is lower-cased before encoding and
            characters missing from ``char2idx`` are mapped to id 0.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits; lower values
            make sampling more conservative, higher values more random.
        max_context: Maximum number of trailing ids fed to the model per step.
            The default of 100 matches the sequence length used for training
            in this notebook.

    Returns:
        ``start_string`` (unchanged) followed by the sampled characters.

    Raises:
        ValueError: If characters are requested with an empty seed, a
            non-positive temperature, or ``max_context`` below 1.
    """
    if num_generate <= 0:
        return start_string
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if max_context < 1:
        raise ValueError("max_context must be at least 1")

    # Running list of ids: the encoded seed, extended with every sampled id.
    context_ids = [char2idx.get(s, 0) for s in start_string.lower()]
    text_generated = []

    for _ in range(num_generate):
        # Shape (1, time): a batch holding the most recent context window.
        input_eval = tf.expand_dims(context_ids[-max_context:], 0)

        # training=False keeps the Dropout layer inactive while sampling.
        predictions = model(input_eval, training=False)

        # Only the final time step predicts the next, not-yet-seen character.
        last_step_logits = predictions[:, -1, :] / temperature
        predicted_id = int(
            tf.random.categorical(last_step_logits, num_samples=1)[0, 0].numpy()
        )

        context_ids.append(predicted_id)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [58]:
# Sample 200 characters continuing the seed "An " at a temperature of 0.8.
print(
    generate_text(
        model,
        start_string="An ",
        num_generate=200,
        temperature=0.8,
    )
)

An fotgre t and aye thatherqucarsthame kan callden merupo honge mereded s a rser ins s thees. of targ. thilarendintid 20 her deananes hererul red ond s il k asthasaind h ct s asire toutritueafio iorin ca
