<a href="https://colab.research.google.com/github/Rizvi999/neural-lab/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# =========================================
# ⚡ LSTM Named Entity Recognition (NER)
# with GloVe embeddings + Balanced Dataset
# =========================================
!pip install gensim --quiet
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import gensim.downloader as api

# ------------------------------
# Step 1: Prepare Expanded Dataset
# ------------------------------

sentences = [
    ["john", "lives", "in", "new", "york", "city"],
    ["mary", "works", "at", "google"],
    ["bob", "studies", "at", "stanford", "university"],
    ["alice", "is", "from", "paris"],
    ["david", "works", "in", "london"],
    ["emma", "is", "a", "doctor"],
    ["sam", "lives", "in", "delhi"],
    ["rizvi", "works", "at", "microsoft"],
    ["rahul", "studies", "in", "mumbai", "university"],
    ["aarav", "is", "from", "chennai"],

    # Added more diverse examples
    ["john", "works", "at", "microsoft"],
    ["emma", "studies", "in", "oxford"],
    ["mary", "lives", "in", "paris"],
    ["bob", "works", "at", "amazon"],
    ["alice", "studies", "in", "delhi"],
    ["sara", "works", "at", "ibm"],
    ["george", "is", "from", "rome"],
    ["james", "studies", "in", "tokyo"],
    ["ravi", "works", "at", "infosys"],
    ["nina", "lives", "in", "mumbai"]
]

tags = [
    ["B-PER", "O", "O", "B-LOC", "I-LOC", "I-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-ORG", "I-ORG"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "O"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC", "I-ORG"],
    ["B-PER", "O", "O", "B-LOC"],

    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-ORG"],
    ["B-PER", "O", "O", "B-LOC"]
]

# ------------------------------
# Step 2: Create Vocabulary
# ------------------------------
words = list(set(w for s in sentences for w in s))
tags_vocab = list(set(t for ts in tags for t in ts))

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
idx2word = {i: w for w, i in word2idx.items()}

tag2idx = {t: i + 1 for i, t in enumerate(tags_vocab)}
tag2idx["PAD"] = 0
idx2tag = {i: t for t, i in tag2idx.items()}

max_len = 10

# ------------------------------
# Step 3: Prepare Input Data
# ------------------------------
X = [[word2idx.get(w, 1) for w in s] for s in sentences]
X = pad_sequences(X, maxlen=max_len, padding="post", value=0)

y = [[tag2idx[t] for t in ts] for ts in tags]
y = pad_sequences(y, maxlen=max_len, padding="post", value=0)
y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]

# ------------------------------
# Step 4: Load GloVe Embeddings
# ------------------------------
print("🔹 Loading GloVe embeddings (50D)...")
glove_vectors = api.load("glove-wiki-gigaword-50")

embedding_matrix = np.zeros((len(word2idx), 50))
for word, i in word2idx.items():
    if word in glove_vectors:
        embedding_matrix[i] = glove_vectors[word]
print("✅ GloVe embeddings loaded successfully!")

# ------------------------------
# Step 5: Build BiLSTM Model
# ------------------------------
input_word = Input(shape=(max_len,))
model = Embedding(
    input_dim=len(word2idx),
    output_dim=50,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False
)(input_word)
model = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))(model)
out = TimeDistributed(Dense(len(tag2idx), activation="softmax"))(model)

model = Model(input_word, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# ------------------------------
# Step 6: Train Model
# ------------------------------
print("\n🚀 Training model...\n")
model.fit(np.array(X), np.array(y), batch_size=2, epochs=40, verbose=1)

# ------------------------------
# Step 7: Prediction Function
# ------------------------------
common_words = {"in", "at", "is", "a", "the", "works", "studies", "from", "lives"}

def predict_sentence(sentence):
    words_in = sentence.lower().split()
    x = pad_sequences([[word2idx.get(w, 1) for w in words_in]], maxlen=max_len, padding="post", value=0)
    pred = model.predict(x, verbose=0)[0]
    pred_tags = [idx2tag[np.argmax(p)] for p in pred][:len(words_in)]

    # Fix common function words to "O"
    for i, w in enumerate(words_in):
        if w in common_words:
            pred_tags[i] = "O"

    print(f"\n🔮 Prediction for: {sentence}")
    print("---------------------------------------------")
    for w, t in zip(words_in, pred_tags):
        print(f"{w:<12} ---> {t}")

# ------------------------------
# Step 8: Test the Model
# ------------------------------
predict_sentence("sam lives in delhi")
predict_sentence("rizvi works at google")
predict_sentence("mary is from paris")
predict_sentence("john studies in london")
predict_sentence("george works at amazon")
predict_sentence("emma lives in rome")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25h🔹 Loading GloVe embeddings (50D)...
✅ GloVe embeddings loaded successfully!

🚀 Training model...

Epoch 1/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.3137 - loss: 1.8635
Epoch 2/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7755 - loss: 1.5097
Epoch 3/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7959 - loss: 1.0378
Epoch 4/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8330 - loss: 0.6275
Epoch 5/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8839 - loss: 0.4921
Epoch 6/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8987 - loss: 0.3289
Epoch 7/40
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m




🔮 Prediction for: sam lives in delhi
---------------------------------------------
sam          ---> B-PER
lives        ---> O
in           ---> O
delhi        ---> B-LOC

🔮 Prediction for: rizvi works at google
---------------------------------------------
rizvi        ---> B-PER
works        ---> O
at           ---> O
google       ---> B-ORG

🔮 Prediction for: mary is from paris
---------------------------------------------
mary         ---> B-PER
is           ---> O
from         ---> O
paris        ---> B-LOC

🔮 Prediction for: john studies in london
---------------------------------------------
john         ---> B-PER
studies      ---> O
in           ---> O
london       ---> B-LOC

🔮 Prediction for: george works at amazon
---------------------------------------------
george       ---> B-PER
works        ---> O
at           ---> O
amazon       ---> B-ORG

🔮 Prediction for: emma lives in rome
---------------------------------------------
emma         ---> B-PER
lives        ---> O
i

In [20]:
# ✅ Neural Machine Translation with Attention (Fast & Runnable)
# ------------------------------------------------------------

!pip install datasets tensorflow --quiet

from datasets import load_dataset
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

# ------------------------------------------------------------
# 1️⃣ Load small English–French dataset
# ------------------------------------------------------------
dataset = load_dataset("opus_books", "en-fr")

pairs = [(ex["translation"]["en"], ex["translation"]["fr"]) for ex in dataset["train"][:5000]]
eng_texts = [p[0] for p in pairs]
fra_texts = [f"<sos> {p[1]} <eos>" for p in pairs]

# ------------------------------------------------------------
# 2️⃣ Tokenize + Pad
# ------------------------------------------------------------
max_eng_len = 20
max_fra_len = 20

eng_tokenizer = Tokenizer(filters='', lower=True)
fra_tokenizer = Tokenizer(filters='', lower=True)
eng_tokenizer.fit_on_texts(eng_texts)
fra_tokenizer.fit_on_texts(fra_texts)

eng_vocab = len(eng_tokenizer.word_index) + 1
fra_vocab = len(fra_tokenizer.word_index) + 1

encoder_input = pad_sequences(eng_tokenizer.texts_to_sequences(eng_texts), maxlen=max_eng_len, padding='post')
decoder_input = pad_sequences(fra_tokenizer.texts_to_sequences(fra_texts), maxlen=max_fra_len, padding='post')

decoder_target = np.zeros_like(decoder_input)
decoder_target[:, :-1] = decoder_input[:, 1:]

X_train_enc, X_val_enc, X_train_dec, X_val_dec, y_train, y_val = train_test_split(
    encoder_input, decoder_input, decoder_target, test_size=0.1, random_state=42
)

# ------------------------------------------------------------
# 3️⃣ Build Encoder–Decoder with Attention
# ------------------------------------------------------------
latent_dim = 256

# Encoder
enc_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(eng_vocab, latent_dim, mask_zero=True)(enc_inputs)
enc_out, enc_h, enc_c = LSTM(latent_dim, return_sequences=True, return_state=True)(enc_emb)

# Decoder
dec_inputs = Input(shape=(max_fra_len,))
dec_emb = Embedding(fra_vocab, latent_dim, mask_zero=True)(dec_inputs)
dec_lstm_out, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=[enc_h, enc_c])

# Attention
attn = Attention()([dec_lstm_out, enc_out])
concat = tf.keras.layers.Concatenate()([dec_lstm_out, attn])
dec_dense = Dense(fra_vocab, activation='softmax')(concat)

model = Model([enc_inputs, dec_inputs], dec_dense)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# ------------------------------------------------------------
# 4️⃣ Train quickly (only 3 epochs)
# ------------------------------------------------------------
model.fit(
    [X_train_enc, X_train_dec],
    y_train.reshape(y_train.shape[0], y_train.shape[1], 1),
    validation_data=([X_val_enc, X_val_dec], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)),
    batch_size=64,
    epochs=3
)

print("✅ Training complete!")

# ------------------------------------------------------------
# 5️⃣ Define inference model (simplified greedy decoding)
# ------------------------------------------------------------
encoder_model = Model(enc_inputs, [enc_out, enc_h, enc_c])

dec_state_input_h = Input(shape=(latent_dim,))
dec_state_input_c = Input(shape=(latent_dim,))
enc_out_input = Input(shape=(max_eng_len, latent_dim))

dec_emb2 = dec_emb(dec_inputs)
dec_out2, dec_h2, dec_c2 = LSTM(latent_dim, return_sequences=True, return_state=True)(
    dec_emb2, initial_state=[dec_state_input_h, dec_state_input_c]
)
attn2 = Attention()([dec_out2, enc_out_input])
concat2 = tf.keras.layers.Concatenate()([dec_out2, attn2])
dec_pred2 = dec_dense(concat2)
decoder_model = Model(
    [dec_inputs, enc_out_input, dec_state_input_h, dec_state_input_c],
    [dec_pred2, dec_h2, dec_c2]
)

# ------------------------------------------------------------
# 6️⃣ Translation function
# ------------------------------------------------------------
reverse_fra_index = {i: w for w, i in fra_tokenizer.word_index.items()}

def translate(sentence):
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')
    enc_outs, enc_h, enc_c = encoder_model.predict(seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fra_tokenizer.word_index['<sos>']

    decoded_sentence = ''
    for _ in range(max_fra_len):
        preds, h, c = decoder_model.predict([target_seq, enc_outs, enc_h, enc_c])
        idx = np.argmax(preds[0, -1, :])
        word = reverse_fra_index.get(idx, '')
        if word == '<eos>' or word == '':
            break
        decoded_sentence += ' ' + word
        target_seq[0, 0] = idx
        enc_h, enc_c = h, c
    return decoded_sentence.strip()

# ------------------------------------------------------------
# 7️⃣ Try sample translations
# ------------------------------------------------------------
for sent in ["good night", "how are you", "i love books", "she is a teacher"]:
    print(f"\n🗣️ English: {sent}")
    print(f"🇫🇷 French (predicted): {translate(sent)}")


TypeError: string indices must be integers, not 'str'