<a href="https://colab.research.google.com/github/SushrutReddy/DL-ASSiGNMENT-2/blob/main/DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Question 1

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense


In [3]:
# Function to read and process transliteration data
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None).dropna()
    return [(row[1], row[0]) for _, row in df.iterrows()]  # Latin, Devanagari pairs

# Load the train and validation data
train_pairs = load_data('hi.translit.sampled.train.tsv')
val_pairs = load_data('hi.translit.sampled.dev.tsv')

# Prepare sequences with start/end tokens
input_texts = [src for src, _ in train_pairs]
target_texts = ['\t' + tgt + '\n' for _, tgt in train_pairs]

# Create character sets for input and output
input_chars = sorted(set(''.join(input_texts)))
target_chars = sorted(set(''.join(target_texts)))

# Create token-to-index mappings
input_token_index = {ch: i for i, ch in enumerate(input_chars)}
target_token_index = {ch: i for i, ch in enumerate(target_chars)}

# Determine the max sequence lengths
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# Initialize data arrays for one-hot encoding
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, len(input_chars)), dtype='float32')
decoder_input_data = np.zeros((len(target_texts), max_decoder_seq_length, len(target_chars)), dtype='float32')
decoder_target_data = np.zeros((len(target_texts), max_decoder_seq_length, len(target_chars)), dtype='float32')

# Fill one-hot encoded arrays for input and target sequences
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0


In [4]:
# Encoder model setup
encoder_inputs = Input(shape=(None, len(input_chars)))
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder model setup
decoder_inputs = Input(shape=(None, len(target_chars)))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(len(target_chars), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define and compile the full model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [7]:
# Train the model
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=90,
    validation_split=0.22
)


Epoch 1/90
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 211ms/step - accuracy: 0.0668 - loss: 1.1488 - val_accuracy: 0.0625 - val_loss: 1.2700
Epoch 2/90
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 215ms/step - accuracy: 0.0806 - loss: 1.1031 - val_accuracy: 0.0598 - val_loss: 1.2844
Epoch 3/90
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 214ms/step - accuracy: 0.0960 - loss: 1.0503 - val_accuracy: 0.0595 - val_loss: 1.2894
Epoch 4/90
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 233ms/step - accuracy: 0.1089 - loss: 1.0147 - val_accuracy: 0.0576 - val_loss: 1.2804
Epoch 5/90
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 234ms/step - accuracy: 0.1146 - loss: 0.9890 - val_accuracy: 0.0597 - val_loss: 1.2990
Epoch 6/90
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 214ms/step - accuracy: 0.1192 - loss: 0.9695 - val_accuracy: 0.0627 - val_loss: 1.2793
Epoc

<keras.src.callbacks.history.History at 0x7fc299f25b90>

In [9]:
# Save the encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Define decoder model for inference
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Reverse lookup for character indices
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


In [10]:
# Function to decode sequence (input to target)
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, len(target_chars)))
    target_seq[0, 0, target_token_index['\t']] = 1.0

    decoded_sentence = ''
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if sampled_char == '\n':
            break

        target_seq = np.zeros((1, 1, len(target_chars)))
        target_seq[0, 0, sampled_token_index] = 1.0
        states_value = [h, c]

    return decoded_sentence.strip()


In [11]:
# Test the model on the first 10 sequences
for seq_index in range(10):
    input_seq = encoder_input_data[seq_index:seq_index+1]
    decoded = decode_sequence(input_seq)
    print(f"Input: {input_texts[seq_index]} → Prediction: {decoded} | Target: {target_texts[seq_index]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 591ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
Input: an → Prediction: अ | Target: 	अं

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Input: ankganit → Prediction: अंंकाा | Target: 	अंकगणित

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

In [12]:
# Function to prepare input text for prediction
def encode_input_text(input_text):
    encoder_input = np.zeros((1, max_encoder_seq_length, len(input_chars)))
    for t, char in enumerate(input_text):
        if char in input_token_index:
            encoder_input[0, t, input_token_index[char]] = 1.0
    return encoder_input

# Interactive loop for user input
while True:
    user_input = input("Enter a Latin word (or type 'exit' to quit): ").strip().lower()
    if user_input == 'exit':
        break

    encoded_input = encode_input_text(user_input)
    prediction = decode_sequence(encoded_input)
    print(f"Predicted Devanagari: {prediction}")


Enter a Latin word (or type 'exit' to quit): ankush
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Predicted Devanagari: अंका
Enter a Latin word (or type 'exit' to quit): anga
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Predicted Devanagari: अं
Enter a Latin word (or type 'exit' to quit): ankurit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step


Question 2

In [None]:
from google.colab import files
files.upload()  # Upload kaggle.json when prompted

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [None]:
!kaggle datasets download -d paultimothymooney/poetry
!unzip poetry.zip

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/poetry
License(s): CC0-1.0
Archive:  poetry.zip
  inflating: Kanye_West.txt          
  inflating: Lil_Wayne.txt           
  inflating: adele.txt               
  inflating: al-green.txt            
  inflating: alicia-keys.txt         
  inflating: amy-winehouse.txt       
  inflating: beatles.txt             
  inflating: bieber.txt              
  inflating: bjork.txt               
  inflating: blink-182.txt           
  inflating: bob-dylan.txt           
  inflating: bob-marley.txt          
  inflating: britney-spears.txt      
  inflating: bruce-springsteen.txt   
  inflating: bruno-mars.txt          
  inflating: cake.txt                
  inflating: dickinson.txt           
  inflating: disney.txt              
  inflating: dj-khaled.txt           
  inflating: dolly-parton.txt        
  inflating: dr-seuss.txt            
  inflating: drake.txt               
  inflating: eminem.txt              
  inflating: ja

In [None]:
!pip install transformers datasets accelerate


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

In [None]:
# Path to the drake.txt file
drake_file_path = './drake.txt'

# Read the file to confirm it's loaded
with open(drake_file_path, 'r', encoding='utf-8') as file:
    drake_lyrics = file.read()

print(drake_lyrics[:500])  # Display the first 500 characters of the lyrics to verify


[Hook]
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face
I've been down so long, it look like up to me
They look up to me
I got fake people showin' fake love to me
Straight up to my face, straight up to my face [Verse 1]
Somethin' ain't right when we talkin'
Somethin' ain't right when we talkin'
Look like you hidin' your problems
Really you never was solid
No, you can't "son" me
You won't neve


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling

# Function to load the dataset (drake.txt) and create a text dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

# Function to get the data collator for language modeling
def get_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

# Load the Drake dataset
dataset = load_dataset(drake_file_path, tokenizer)
data_collator = get_data_collator(tokenizer)




In [None]:
from transformers import Trainer, TrainingArguments
import torch

training_args = TrainingArguments(
    output_dir="./gpt2-drake-lyrics",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU is available
    report_to='none',  # Disable W&B logging
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)


In [None]:
trainer.train()
trainer.save_model("./gpt2-drake-lyrics")
tokenizer.save_pretrained("./gpt2-drake-lyrics")


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.5873
200,3.3446
300,3.1402
400,3.0487
500,2.8546
600,2.8056


('./gpt2-drake-lyrics/tokenizer_config.json',
 './gpt2-drake-lyrics/special_tokens_map.json',
 './gpt2-drake-lyrics/vocab.json',
 './gpt2-drake-lyrics/merges.txt',
 './gpt2-drake-lyrics/added_tokens.json')

In [None]:
from transformers import pipeline

# Load the trained model and tokenizer for generation
generator = pipeline("text-generation", model="./gpt2-drake-lyrics", tokenizer="./gpt2-drake-lyrics")

# Set your prompt
prompt = "Started from the bottom, now we're here"
generated_lyrics = generator(prompt, max_length=100, num_return_sequences=1)

# Display the generated lyrics
print(generated_lyrics[0]['generated_text'])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Started from the bottom, now we're here
And we've been here before
And he never should've left
It's just not your time
You're still here, still in control of it, still with it, now I'm here And I know you could go to bed this morning But you could sleep better that you're gone
So if you didn't wake up this morning
I don't know what I'm going through
I just don't know what to say



In [None]:
trainer.train()
# Check loss every 100 steps


Step,Training Loss
100,2.7741
200,2.7358
300,2.5597
400,2.5372
500,2.3269
600,2.3009


TrainOutput(global_step=651, training_loss=2.52590090375159, metrics={'train_runtime': 3802.4943, 'train_samples_per_second': 0.342, 'train_steps_per_second': 0.171, 'total_flos': 84854587392000.0, 'train_loss': 2.52590090375159, 'epoch': 3.0})