In [1]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.20.0


In [2]:
#STEP 1: Import Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [4]:
#STEP 2: Load the Text File
# Load text data
# STEP 2: Load the Text File
with open(r"C:\Users\omkar\OneDrive\Desktop\Handwriten_text_project\Handwritten.txt.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Repeat text to increase training data
text = text * 50

print("Total characters:", len(text))
print(text[:200])

Total characters: 38000
To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To


In [5]:
# STEP 3: Character Mapping

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Get all unique characters
unique_chars = sorted(set(text))
char2idx = {char: idx for idx, char in enumerate(unique_chars)}
idx2char = {idx: char for idx, char in enumerate(unique_chars)}

print(f"Total unique characters: {len(unique_chars)}")

Total unique characters: 45


In [6]:
# STEP 4: Prepare Input Sequences


sequence_length = 40
input_sequences = []
target_chars = []

for i in range(len(text) - sequence_length):
    input_seq = text[i:i+sequence_length]
    target_char = text[i+sequence_length]
    input_sequences.append([char2idx[c] for c in input_seq])
    target_chars.append(char2idx[target_char])

# Convert to NumPy arrays
X_seq = np.array(input_sequences)
y_seq = np.array(target_chars)

print(f"Input sequences shape: {X_seq.shape}")

Input sequences shape: (37960, 40)


In [7]:
# STEP 5: Normalize Inputs


X_seq = X_seq / float(len(unique_chars))

In [8]:
# STEP 6: Build the LSTM Model


from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(LSTM(128, input_shape=(X_seq.shape[1], 1)))
model.add(Dense(len(unique_chars), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
model.summary()

  super().__init__(**kwargs)


In [9]:
# STEP 7: Train the Model


model.fit(
    X_seq.reshape((X_seq.shape[0], X_seq.shape[1], 1)),
    y_seq,
    epochs=100,        # Lower epochs if dataset is small
    batch_size=32
)



Epoch 1/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 20ms/step - loss: 3.0551
Epoch 2/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 2.4678
Epoch 3/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 1.1316
Epoch 4/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 0.3266
Epoch 5/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 0.0646
Epoch 6/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 0.1170
Epoch 7/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 0.0114
Epoch 8/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 20ms/step - loss: 0.0054
Epoch 9/100
[1m1187/1187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 21ms/step - loss: 0.2835
Epoch 10/100
[1m1187/1187[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x238c75c5160>

In [15]:
# STEP 8: Generate Text using Trained LSTM Model

def generate_handwritten_text(seed_text, num_chars=300, temperature=0.6):
    
    generated_text = seed_text
    encoded_seed = [char2idx[c] for c in seed_text if c in char2idx]

    for _ in range(num_chars):
        x_input = np.zeros((1, sequence_length, 1))
        
        for t, char_index in enumerate(encoded_seed[-sequence_length:]):
            x_input[0, t, 0] = char_index / float(len(unique_chars))
        
        predictions = model.predict(x_input, verbose=0)[0]
        predictions = np.log(predictions + 1e-8) / temperature
        probabilities = np.exp(predictions) / np.sum(np.exp(predictions))
        
        next_char_index = np.random.choice(len(unique_chars), p=probabilities)
        next_char = idx2char[next_char_index]
        
        generated_text += next_char
        encoded_seed.append(next_char_index)

    return generated_text


In [16]:
start_string = "To be, or not to be, "
print(generate_handwritten_text(start_string, 400, temperature=0.5))


To be, or not to be, rrRoo,,,,,,,nnReoRR ne be,eosgew
bo   C t no  om sleep;
To slrep, perchance to dream—ay, there's the rub:
For in that sleep of death what dreams may come,
When we have shuffled off this mortal coil,
Must give us pause—there's the respect
That makes calamity of so long life.

O Romeo, Romeo! wherefore art thou Romeo?
Deny thy father and refuse thy name;
Or, if thou wilt not, be but sworn my love,
A


In [None]:
#Report  : Task 5
Handwritten Text Generation using Deep Learning

CODSOFT Internship – Machine Learning Task

1. Introduction

Handwritten-style text generation is an important application of Natural Language Processing (NLP) and Deep
Learning. It involves training a model to learn sequential patterns in text and generate new, human-like content 
character by character.

This project focuses on building a character-level Recurrent Neural Network (RNN) using LSTM 
(Long Short-Term Memory) to generate handwritten-like text based on a given seed input.

2. Objective
The objectives of this project are:
To preprocess raw text data for character-level modeling
To map characters to numerical representations
To train an LSTM-based deep learning model
To generate meaningful handwritten-style text
To understand the impact of sequence length and temperature on text generation

3. Dataset Description
The dataset is a text-based dataset containing literary handwritten-style content.
Dataset Characteristics:
Type: Plain text (.txt)
Content: Literary text (dialogues, sentences, punctuation)
Granularity: Character-level (letters, spaces, symbols)
Each character is treated as an individual feature, and the task is to predict the next character in a sequence.

 4. Data Preprocessing
The preprocessing steps included:
Reading the text file and converting it into lowercase text
Extracting all unique characters from the dataset
Creating character-to-index and index-to-character mappings
Generating fixed-length character sequences (sequence length = 40)
Normalizing the input data for better training performance
These steps converted raw text into structured numerical data suitable for deep learning.

 5. Exploratory Analysis

The dataset contains alphabets, punctuation, and whitespace characters
Character distribution helps the model learn writing patterns
Sequential relationships between characters enable meaningful sentence generation
Understanding character frequency and sequence length plays a crucial role in text generation quality.

 6. Deep Learning Model
The following model architecture was used:
 LSTM-Based Character Model
Input Layer: Character sequences
LSTM Layer: 128 hidden units
Dense Output Layer with Softmax activation
Loss Function: Sparse Categorical Crossentropy
Optimizer: Adam
The LSTM model effectively captures long-term dependencies in character sequences.

 7. Model Training
The model was trained for multiple epochs
Batch size was set to ensure efficient learning
Training focused on predicting the next character in a sequence
The model gradually learned grammatical structure, spacing, and punctuation from the dataset.

8. Text Generation
After training, the model generates text using:
A seed input string
Temperature-based sampling to control randomness
Character-by-character prediction
Lower temperature produces more predictable text, while higher temperature increases creativity.

 9. Results & Conclusion
The trained LSTM model successfully generated coherent handwritten-style text:
Generated text followed grammatical patterns
Proper punctuation and spacing were maintained
Output resembled the style of the training dataset
Key Takeaways:
Character-level LSTM models are effective for text generation
Sequence length and temperature significantly affect output quality
Deep learning can mimic human writing patterns when trained properly

10. Future Improvements
Train on larger and more diverse handwritten text datasets
Experiment with Bidirectional LSTM or GRU
Apply word-level or transformer-based models
Fine-tune temperature sampling for better creativity
Deploy the model as a web-based text generator
Project Completed as Part of
CODSOFT – Machine Learning Internship

