**TASK 1 = Write a Python function to implement a basic tokenization algorithm for a given language.**


In [1]:
pip install nltk




In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
def nltk_tokenizer(text):
    tokens = word_tokenize(text)
    return tokens

text = "Hello, world! This is an NLTK tokenization example."
tokens = nltk_tokenizer(text)
print(tokens)


['Hello', ',', 'world', '!', 'This', 'is', 'an', 'NLTK', 'tokenization', 'example', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**TASK2 = Use a pre-trained Word2Vec model to generate word embeddings for a given text corpus**

In [3]:
pip install gensim




In [4]:
import gensim.downloader as api
from gensim.utils import simple_preprocess

def load_pretrained_word2vec():

    model = api.load("word2vec-google-news-300")
    return model

def get_word_embeddings(text, model):

    tokens = simple_preprocess(text)

    embeddings = {}
    for word in tokens:
        if word in model:
            embeddings[word] = model[word]

    return embeddings


if __name__ == "__main__":

    model = load_pretrained_word2vec()

    text = "Natural language processing enables machines to understand human language."

    word_embeddings = get_word_embeddings(text, model)

    for word, embedding in word_embeddings.items():
        print(f"Word: {word}, Embedding (first 5 dimensions): {embedding[:5]}")


Word: natural, Embedding (first 5 dimensions): [ 0.03088379  0.20019531 -0.08789062  0.14550781 -0.00567627]
Word: language, Embedding (first 5 dimensions): [ 0.02307129  0.0168457   0.15429688  0.12792969 -0.26757812]
Word: processing, Embedding (first 5 dimensions): [-0.09033203  0.04394531  0.11621094  0.05737305 -0.00469971]
Word: enables, Embedding (first 5 dimensions): [-0.1015625  -0.01940918  0.08496094 -0.04321289 -0.0534668 ]
Word: machines, Embedding (first 5 dimensions): [ 0.35546875 -0.09814453  0.21191406  0.28515625 -0.23046875]
Word: understand, Embedding (first 5 dimensions): [-0.08935547 -0.04980469 -0.19726562 -0.05834961 -0.3046875 ]
Word: human, Embedding (first 5 dimensions): [ 0.0559082   0.09228516  0.10791016  0.28320312 -0.24316406]


**TASK3 =  Implement beam search decoding for an NMT model to improve translation quality**

In [8]:
input_seq = np.array([[1, 2, 3, 4, 5]])

In [10]:
import numpy as np
import tensorflow as tf

def beam_search_decoder(model, input_seq, beam_width=3, max_length=50):

    enc_output, enc_hidden = model.encoder(input_seq)

    start_token = np.array([[model.start_token]])
    end_token = model.end_token
    sequences = [[start_token, 0.0, enc_hidden]]

    for _ in range(max_length):
        all_candidates = []
        for seq, score, hidden in sequences:

            if seq[-1] == end_token:
                all_candidates.append((seq, score, hidden))
                continue

            dec_input = tf.expand_dims(seq[-1], axis=0)
            dec_output, hidden, _ = model.decoder(dec_input, enc_output, hidden)

            top_k_probs, top_k_indices = tf.nn.top_k(dec_output, k=beam_width)

            for i in range(beam_width):
                token = top_k_indices[0][i].numpy()
                prob = top_k_probs[0][i].numpy()
                candidate = seq + [token]
                candidate_score = score + np.log(prob)
                all_candidates.append((candidate, candidate_score, hidden))

        sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]

        if all(seq[-1] == end_token for seq, _, _ in sequences):
            break

    best_seq = sequences[0][0]
    return best_seq


decoded_sequence = beam_search_decoder(model, input_seq, beam_width=5, max_length=50)
print(decoded_sequence)


[[22279]]


**TASK4 =  Create a feature to translate the language from French to Tamil and
    it should predict if the french word has only five letter if the french word has more than five letters or less than five letters the model should not translate the word**

In [2]:

!pip install gradio
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import gradio as gr

Collecting gradio
  Downloading gradio-5.1.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata

In [3]:

fr_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
fr_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

en_ta_tokenizer = AutoTokenizer.from_pretrained("suriya7/English-to-Tamil")
en_ta_model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/English-to-Tamil")

def translate_fr_to_en(text):
    inputs = fr_en_tokenizer(text, return_tensors="pt", padding=True)
    outputs = fr_en_model.generate(**inputs, max_length=40, num_beams=4, early_stopping=True)
    translated_text = fr_en_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

def translate_en_to_ta(text):
    tokenized = en_ta_tokenizer([text], return_tensors='pt')
    out = en_ta_model.generate(**tokenized, max_length=128)
    translated_text = en_ta_tokenizer.decode(out[0], skip_special_tokens=True)
    return translated_text

def translate_fr_to_ta(text):

    if len(text.split()) == 1 and len(text) == 5:

        translated_to_english = translate_fr_to_en(text)


        translated_to_tamil = translate_en_to_ta(translated_to_english)

        return translated_to_tamil
    else:
        return " "


interface = gr.Interface(
    fn=translate_fr_to_ta,
    inputs="text",
    outputs="text",
    title="French to Tamil Translator",
    description="Enter a single French word with exactly five letters to translate it into Tamil."  # Description
)


interface.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.8k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://14246a06cf82971e5b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




**TASK 5 = Create a feature to translate the language with a combination of two languages at the same time . We should be able to convert the 2 different languages at the same time . translate English to French and Hindi at the same time . This model should work only for 10 letter English words**

In [1]:
import numpy as np
import pandas as pd
import re
import string
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


df = pd.read_csv(r"/content/eng-french (2).csv")
df.columns = ['en', 'fr']
custom_punct = string.punctuation.replace("-", "").replace("'", "")


def clean(text):
    text = text.lower()
    text = re.sub("[" + custom_punct + "]", "", text)
    return text

df["clean_en"] = df["en"].apply(clean)
df["clean_fr"] = df["fr"].apply(clean)

X_train, X_test, y_train, y_test = train_test_split(df["clean_en"], df["clean_fr"], test_size=0.2)

en_tokenizer = Tokenizer()
fr_tokenizer = Tokenizer()

en_tokenizer.fit_on_texts(X_train)
fr_tokenizer.fit_on_texts(y_train)

input_vocab_size = len(en_tokenizer.word_index) + 1
output_vocab_size = len(fr_tokenizer.word_index) + 1

X_train_sequences = en_tokenizer.texts_to_sequences(X_train)
X_test_sequences = en_tokenizer.texts_to_sequences(X_test)

y_train_sequences = fr_tokenizer.texts_to_sequences(y_train)
y_test_sequences = fr_tokenizer.texts_to_sequences(y_test)


maxlen = 55
X_train_pad = pad_sequences(X_train_sequences, maxlen=maxlen, truncating='post', padding="post")
X_test_pad = pad_sequences(X_test_sequences, maxlen=maxlen, truncating='post', padding="post")

y_train_pad = pad_sequences(y_train_sequences, maxlen=maxlen, truncating='post', padding="post")
y_test_pad = pad_sequences(y_test_sequences, maxlen=maxlen, truncating='post', padding="post")

y_train_pad = y_train_pad.reshape(*y_train_pad.shape, 1)
y_test_pad = y_test_pad.reshape(*y_test_pad.shape, 1)


class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.3):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


num_heads = 8
ff_dim = 128
embedding_dim = 200
adam = Adam(learning_rate=0.003)


inputs = tf.keras.layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, input_vocab_size, embedding_dim)
x = embedding_layer(inputs)

transformer_block = TransformerBlock(embedding_dim, num_heads, ff_dim)
x = transformer_block(x, training=True)

x = TimeDistributed(Dense(256, activation="relu"))(x)
outputs = TimeDistributed(Dense(output_vocab_size, activation="softmax"))(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(loss=sparse_categorical_crossentropy, optimizer=adam, metrics=['accuracy'])

model.summary()



In [2]:

history = model.fit(X_train_pad,
                    y_train_pad,
                    validation_data=(X_test_pad, y_test_pad),
                    verbose=1,
                    batch_size=16,
                    epochs=10,
                   )


Epoch 1/10
[1m8781/8781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m669s[0m 72ms/step - accuracy: 0.8810 - loss: 0.9481 - val_accuracy: 0.8955 - val_loss: 0.7525
Epoch 2/10
[1m8781/8781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m521s[0m 59ms/step - accuracy: 0.8942 - loss: 0.7554 - val_accuracy: 0.8987 - val_loss: 0.7143
Epoch 3/10
[1m8781/8781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 59ms/step - accuracy: 0.8976 - loss: 0.7139 - val_accuracy: 0.9010 - val_loss: 0.6878
Epoch 4/10
[1m8781/8781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m531s[0m 60ms/step - accuracy: 0.8988 - loss: 0.6914 - val_accuracy: 0.9024 - val_loss: 0.6659
Epoch 5/10
[1m8781/8781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 61ms/step - accuracy: 0.9002 - loss: 0.6654 - val_accuracy: 0.9037 - val_loss: 0.6621
Epoch 6/10
[1m8781/8781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 59ms/step - accuracy: 0.9014 - loss: 0.6432 - val_accuracy: 0.9049 - val_loss: 0.629

In [5]:
samples = [
    "moon"
]
for sample in samples:
    pred = model.predict([pad_sequences(en_tokenizer.texts_to_sequences([sample]), maxlen=maxlen, padding='post', truncating='post')])[0].argmax(axis=1)
    output_text = fr_tokenizer.sequences_to_texts([pred])[0]
    print("EN: " + sample)
    print("FR: " + output_text)
    print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
EN: moon
FR: le lune



In [4]:
pip install transformers




In [5]:
from transformers import MarianMTModel, MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-hi'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [6]:

english_text = ["moon"]

tokenized_text = tokenizer(english_text, return_tensors="pt", padding=True, truncation=True)
translated = model.generate(**tokenized_text)

hindi_translation = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

print(hindi_translation)

['चन्द्रमा']


**TASK = 6Create a feature to translate the English word to Hindi and it should not translate if the English starts with vowels and other words it should convert . If we enter a English word starts with Vowels it should show an error message as This word starts with Vowels provide some other words and this model should be able to convert english word starts with vowels around 9PMto10PM**

In [15]:
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
from datetime import datetime
import pytz

model_name = 'Helsinki-NLP/opus-mt-en-hi'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def starts_with_vowel(word):
    vowels = 'AEIOUaeiou'
    return word[0] in vowels


def is_within_time_window():
    ist = pytz.timezone('Asia/Kolkata')
    current_time_ist = datetime.now(ist)
    return current_time_ist.hour == 21

# Function to translate text
def translate_to_hindi(english_text):
    if starts_with_vowel(english_text):
        if is_within_time_window():

            tokenized_text = tokenizer([english_text], return_tensors="pt", padding=True, truncation=True)
            translated = model.generate(**tokenized_text)
            hindi_translation = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
            return f"Translated to Hindi: {hindi_translation[0]}"
        else:

            return "Error: This word starts with a vowel. Please provide a word that does not start with a vowel."
    else:

        tokenized_text = tokenizer([english_text], return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**tokenized_text)
        hindi_translation = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        return f"{hindi_translation[0]}"


def gradio_interface(english_word):
    return translate_to_hindi(english_word)


interface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="English to Hindi Translator",
    description="Translate English words to Hindi. Note: Words starting with vowels are only allowed to translate between 9 PM and 10 PM IST."
)


interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8780f4e89096127fd9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


