# Question 1: AI for Lexical and Syntax Assistance

Creating typo-correction pairs

In [None]:
import os
import csv
import random
import string

# Seed the module-level RNG so repeated runs generate the same typo datasets.
random.seed(42)

# Lowercase letter -> string of letters treated as physically adjacent keys.
# Used to simulate "fat-finger" substitutions and insertions.
# NOTE(review): the adjacency looks like a QWERTY layout — confirm if another
# layout should be supported.
keyboard_neighbors = {
    'a': 'qwsz', 'b': 'vghn', 'c': 'xdfv', 'd': 'serfcx',
    'e': 'wsdr', 'f': 'rtgvcd', 'g': 'tyhbvf', 'h': 'yujnbg',
    'i': 'ujko', 'j': 'uikmnh', 'k': 'iolmj', 'l': 'kop',
    'm': 'njk', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol', 'q': 'wa',
    'r': 'edft', 's': 'awedxz', 't': 'rfgy', 'u': 'yhji',
    'v': 'cfgb', 'w': 'qesa', 'x': 'zsdc', 'y': 'tghu', 'z': 'asx'
}

# Pool of replacement letters for random substitutions and for characters
# that have no entry in keyboard_neighbors (digits, punctuation, ...).
ALPHABET = string.ascii_lowercase


def adaptive_typo_count(word, min_per_word=30, max_per_word=500):
    """
    Return the number of distinct typos to aim for, based on len(word).

    Length buckets (checked from longest to shortest):
      > 15  -> max_per_word
      9..15 -> min(10 * min_per_word, max_per_word // 2)
      5..8  -> min(4 * min_per_word, max_per_word // 4)
      3..4  -> min_per_word
      <= 2  -> min_per_word // 4
    """
    n_chars = len(word)
    if n_chars > 15:
        return max_per_word
    if n_chars > 8:
        return min(10 * min_per_word, max_per_word // 2)
    if n_chars > 4:
        return min(4 * min_per_word, max_per_word // 4)
    if n_chars > 2:
        return min_per_word
    return min_per_word // 4

def replace_with_neighbor(ch):
    """
    Return a substitute for `ch` drawn from its keyboard neighbours.

    The lookup is case-insensitive and the result is upper-cased when `ch`
    is upper-case. Characters without neighbours in `keyboard_neighbors`
    get a random lowercase letter from ALPHABET instead.
    """
    neighbours = keyboard_neighbors.get(ch.lower())
    if not neighbours:
        return random.choice(ALPHABET)
    picked = random.choice(neighbours)
    return picked.upper() if ch.isupper() else picked

def random_case_transform(s):
    """
    Flip the case of exactly one randomly chosen character of `s`.

    A lowercase character becomes uppercase; anything else is lower-cased
    (which leaves digits and punctuation unchanged). Empty input is returned
    as-is without consuming a random number.
    """
    if not s:
        return s
    pos = random.randrange(len(s))
    target = s[pos]
    flipped = target.upper() if target.islower() else target.lower()
    return s[:pos] + flipped + s[pos + 1:]

def generate_typo(word):
    """
    Create a single realistic typo variant of `word`.

    One primary edit is picked uniformly from `ops`; with probability 0.12 a
    second edit is applied on top. If the result still equals `word`, one
    random position is overwritten with a random lowercase letter.

    Returns a string. It can still equal the input when:
      * `word` is empty (returned unchanged),
      * the "case_change" op hits a character with no case (early return),
      * the final forced substitution happens to draw the same letter.
    Callers that need a real change must compare against `word` themselves.
    """
    if not word:
        return word

    ops = [
        "swap_adjacent",
        "delete",
        "insert_neighbor",
        "replace_neighbor",
        "replace_random",
        "double_char",
        "transpose",
        "case_change"
    ]
    op = random.choice(ops)
    s = list(word)

    try:
        # Primary edit. When the op's length guard fails (e.g. "swap_adjacent"
        # on a one-character word) no branch runs and `s` is left untouched;
        # the equality fallback at the bottom then forces a change.
        if op == "swap_adjacent" and len(s) > 1:
            # Swap two neighbouring characters.
            i = random.randint(0, len(s) - 2)
            s[i], s[i+1] = s[i+1], s[i]

        elif op == "delete" and len(s) > 0:
            # Drop one character.
            i = random.randint(0, len(s) - 1)
            del s[i]

        elif op == "insert_neighbor":
            # Insert at any position, including the very end (i == len(s)).
            i = random.randint(0, len(s))

            # `word` is never empty here because of the early return above,
            # so the else-branch is effectively unreachable.
            if len(word) > 0:
                # Base the inserted key on the character just before the
                # insertion point, clamped into the valid index range.
                source_index = max(0, min(len(s)-1, i-1))
                ch = s[source_index]
                ins = replace_with_neighbor(ch)
            else:
                ins = random.choice(ALPHABET)
            s.insert(i, ins)

        elif op == "replace_neighbor" and len(s) > 0:
            # Substitute one character with a keyboard neighbour.
            i = random.randint(0, len(s) - 1)
            s[i] = replace_with_neighbor(s[i])

        elif op == "replace_random" and len(s) > 0:
            # Substitute with any letter, matching the case of the original.
            i = random.randint(0, len(s) - 1)
            s[i] = random.choice(ALPHABET.upper() if s[i].isupper() else ALPHABET)

        elif op == "double_char" and len(s) > 0:
            # Duplicate one character in place.
            i = random.randint(0, len(s) - 1)
            s.insert(i, s[i])

        elif op == "transpose" and len(s) > 1:
            # Swap two arbitrary (not necessarily adjacent) positions; a no-op
            # when both draws land on the same index.
            i = random.randint(0, len(s) - 1)
            j = random.randint(0, len(s) - 1)
            if i != j:
                s[i], s[j] = s[j], s[i]

        elif op == "case_change":
            # Returns immediately: skips the optional second edit and the
            # "typo == word" fallback below.
            return random_case_transform(word)

        # Optional second edit (12% of the time) to produce compound typos.
        # `s` may have become empty if the primary edit deleted the only char.
        if random.random() < 0.12 and len(s) > 0:
            j = random.choice(["delete", "double_char", "replace_neighbor", "insert_neighbor"])
            if j == "delete" and len(s) > 0:
                idx = random.randint(0, len(s)-1)
                del s[idx]
            elif j == "double_char" and len(s) > 0:
                idx = random.randint(0, len(s)-1)
                s.insert(idx, s[idx])
            elif j == "replace_neighbor" and len(s) > 0:
                idx = random.randint(0, len(s)-1)
                s[idx] = replace_with_neighbor(s[idx])
            elif j == "insert_neighbor":
                idx = random.randint(0, len(s))
                src = s[max(0, min(len(s)-1, idx-1))] if s else random.choice(ALPHABET)
                s.insert(idx, replace_with_neighbor(src))

    except Exception:
        # Best-effort fallback: discard any partial edit and overwrite one
        # random position of the original word with a random lowercase letter.
        # NOTE(review): this swallows every exception type silently.
        if len(word) > 0:
            i = random.randint(0, len(word)-1)
            s = list(word)
            s[i] = random.choice(ALPHABET)

    typo = "".join(s)

    # Force a change when the edits above were no-ops. The random letter may
    # coincide with the original one, so equality is still possible.
    if typo == word:
        if len(word) > 0:
            i = random.randint(0, len(word)-1)
            s = list(word)
            s[i] = random.choice(ALPHABET)
            typo = "".join(s)

    return typo


def make_typos_for_word(word, target_count, safety_cap=5000):
    """
    Collect distinct typos of `word` until `target_count` is reached.

    At most `safety_cap` calls to generate_typo() are made, so the result may
    contain fewer than `target_count` items. Empty results and results equal
    to `word` are discarded. The returned list has no defined order.
    """
    unique = set()
    tries = 0
    while tries < safety_cap and len(unique) < target_count:
        candidate = generate_typo(word)
        tries += 1
        if candidate and candidate != word:
            unique.add(candidate)
    return list(unique)

def generate_typos_for_language(lang, keywords, min_per_word=30, max_per_word=500):
    """
    Build the (typo, correct) training pairs for every word in `keywords`.

    The per-word quota comes from adaptive_typo_count(); generation for a word
    gives up after 20 * quota attempts. `lang` is accepted for symmetry with
    the caller but is not used here.
    """
    pairs = []
    for keyword in keywords:
        quota = adaptive_typo_count(keyword, min_per_word, max_per_word)
        variants = make_typos_for_word(keyword, quota, safety_cap=quota * 20)
        pairs.extend((variant, keyword) for variant in variants)
    return pairs


# Language name -> words for which typos are generated. Each list mixes
# reserved keywords with common built-ins / identifiers (e.g. "print",
# "__init__", "System.out.println", "#include"), so entries may contain
# upper-case letters, underscores, dots and '#'.
# NOTE: the name LANG_KEYWORDS is rebound to a different dict by a later cell.
LANG_KEYWORDS = {
    "python": [
        "def", "return", "if", "else", "elif", "for", "while", "break", "continue",
        "import", "from", "as", "class", "try", "except", "lambda", "global", "with",
        "print", "input", "open", "read", "write", "strip", "split", "join", "format",
        "self", "__init__", "__str__", "len", "range", "enumerate", "map", "filter"
    ],
    "java": [
        "public", "class", "static", "void", "main", "String", "if", "else", "for",
        "while", "return", "int", "float", "boolean", "try", "catch", "import", "package",
        "new", "this", "extends", "implements", "throws", "interface", "System.out.println"
    ],
    "c": [
        "int", "float", "if", "else", "for", "while", "return", "printf", "scanf",
        "include", "define", "char", "void", "main", "switch", "case", "break",
        "#include", "malloc", "free", "sizeof", "struct", "typedef"
    ]
}


# Directory that receives one "typo_data_<lang>.csv" file per language.
OUTPUT_DIR = "typo_datasets"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Bounds forwarded to adaptive_typo_count() via generate_typos_for_language().
MIN_PER_WORD = 30
MAX_PER_WORD = 600

for lang, keywords in LANG_KEYWORDS.items():
    print(f"\nGenerating typos for {lang} ...")
    pairs = generate_typos_for_language(lang, keywords, min_per_word=MIN_PER_WORD, max_per_word=MAX_PER_WORD)
    # Shuffle so rows for the same keyword are not stored contiguously.
    random.shuffle(pairs)

    csv_path = os.path.join(OUTPUT_DIR, f"typo_data_{lang}.csv")
    # newline="" lets the csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["typo", "correct"])
        writer.writerows(pairs)

    # The check mark was stored as mis-decoded UTF-8 ("âœ…"); restored here.
    print(f"✅ Saved {len(pairs)} pairs to {csv_path}")

print("\nAll CSV files created.")


Encoding Function

In [None]:
# Character classes that make up the fixed vocabulary.
LOWER_CASE_LETTERS = list("abcdefghijklmnopqrstuvwxyz")
UPPER_CASE_LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
DIGITS = list("0123456789")
# Every entry must be a single character (the encoder looks up one character
# at a time) and must appear only once, otherwise stoi and itos disagree.
# The backslash and the newline are listed separately as one-character entries.
SPECIAL_CHARS = [
    '_', '-', '.', '@', '|', '(', ')', '~', '!', '#', '$', '%', '*', '+',
    '{', '}', '[', ']', '\\', '\n', ':', ';', '"', '\'', '<', '>', ',', '=', '`',
]

# Control tokens; <pad> must stay at index 0 because it doubles as the
# "unknown character" id in encode_string().
SPECIAL_TOKENS = ['<pad>', '<sos>', '<eos>']

# Upper-case letters are part of the vocabulary so identifiers such as
# "String" or "System.out.println" are not collapsed to <pad>.
VOCAB = SPECIAL_TOKENS + LOWER_CASE_LETTERS + UPPER_CASE_LETTERS + DIGITS + SPECIAL_CHARS


# token -> index and index -> token lookup tables (exact inverses of each other).
stoi = {ch: i for i, ch in enumerate(VOCAB)}
itos = {i: ch for i, ch in enumerate(VOCAB)}

def encode_string(word):
    """
    Encode `word` as a list of vocabulary indices.

    The result is framed as [<sos>, ids of each character..., <eos>].
    Characters missing from the module-level `stoi` table are encoded with
    the <pad> index.
    """
    unknown_id = stoi['<pad>']
    body = [stoi.get(ch, unknown_id) for ch in word]
    return [stoi['<sos>'], *body, stoi['<eos>']]

def decode_string(indices):
    """
    Turn a sequence of vocabulary indices back into text.

    Indices are looked up in the module-level `itos` table; the control
    tokens <sos>, <eos> and <pad> are dropped from the output. An index that
    is not in `itos` raises KeyError.
    """
    control_tokens = ('<sos>', '<eos>', '<pad>')
    pieces = []
    for idx in indices:
        symbol = itos[idx]
        if symbol in control_tokens:
            continue
        pieces.append(symbol)
    return ''.join(pieces)

Integrating the two cells above to prepare the data for the neural network

In [None]:
import os
import csv
import numpy as np

# Input: directory holding the generated "typo_data_<lang>.csv" files.
DATA_DIR = "typo_datasets"
# Output: directory for the encoded arrays and vocab tables.
# NOTE: this rebinds OUTPUT_DIR, which an earlier cell set to "typo_datasets".
OUTPUT_DIR = "processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

LANGS = ["python", "java", "c"]
# Fixed length every encoded sequence is padded or truncated to
# (the count includes the <sos>/<eos> markers).
MAX_LEN = 32

def build_vocab_from_csv(csv_path):
    """
    Build character-level lookup tables from a two-column typo CSV.

    The first row is treated as a header and skipped. Every character that
    occurs in either column (after stripping surrounding whitespace) becomes
    a vocabulary entry. Indices 0, 1, 2 are reserved for <pad>, <sos>, <eos>;
    the remaining characters follow in sorted order.

    Returns (stoi, itos): token -> index and index -> token dicts.
    Raises ValueError if a data row does not have exactly two fields.
    """
    seen = set()
    with open(csv_path, "r", encoding="utf-8") as handle:
        rows = csv.reader(handle)
        next(rows)  # skip the header row
        for typo, correct in rows:
            seen.update(typo.strip())
            seen.update(correct.strip())

    vocab = ["<pad>", "<sos>", "<eos>"] + sorted(seen)
    stoi = dict(zip(vocab, range(len(vocab))))
    itos = dict(enumerate(vocab))
    return stoi, itos

def encode_string(text, stoi):
    """Return [<sos>, id of each char in `text`, <eos>]; unknown chars map to <pad>."""
    start_id = stoi["<sos>"]
    char_ids = [stoi.get(ch, stoi["<pad>"]) for ch in text]
    return [start_id] + char_ids + [stoi["<eos>"]]

def pad_sequence(seq, max_len, pad_idx):
    """
    Return a new list of exactly `max_len` items built from `seq`.

    Shorter inputs are right-padded with `pad_idx`; longer inputs are cut
    off at `max_len` (which can drop a trailing <eos> marker).
    """
    shortfall = max_len - len(seq)
    if shortfall > 0:
        return seq + [pad_idx] * shortfall
    return seq[:max_len]

# For every language: build a character vocabulary from its CSV, encode each
# (typo, correct) pair to fixed-length id sequences and save everything as
# .npy files in OUTPUT_DIR.
for lang in LANGS:
    print(f"\nProcessing {lang.upper()} dataset...")

    csv_path = os.path.join(DATA_DIR, f"typo_data_{lang}.csv")
    if not os.path.exists(csv_path):
        print(f"File not found: {csv_path}")
        continue

    stoi, itos = build_vocab_from_csv(csv_path)
    pad_idx = stoi["<pad>"]

    print(f"Vocabulary size ({lang}):", len(stoi))

    # np.save stores a dict as a pickled 0-d object array, so readers must
    # call np.load(..., allow_pickle=True).item() to get the dict back.
    np.save(os.path.join(OUTPUT_DIR, f"{lang}_stoi.npy"), stoi)
    np.save(os.path.join(OUTPUT_DIR, f"{lang}_itos.npy"), itos)

    typo_pairs = []
    # newline="" is what the csv module expects for files it reads, so that
    # embedded line endings inside quoted fields are handled correctly.
    with open(csv_path, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        typo_pairs = [(t.strip(), c.strip()) for t, c in reader]

    X_list = []
    Y_in_list = []
    Y_out_list = []

    for typo, correct in typo_pairs:
        enc_typo = encode_string(typo, stoi)
        enc_correct = encode_string(correct, stoi)

        # Teacher forcing: the decoder input starts with <sos> and the target
        # is the same sequence shifted left by one (ending with <eos>).
        dec_in = enc_correct[:-1]
        dec_out = enc_correct[1:]

        X_list.append(pad_sequence(enc_typo, MAX_LEN, pad_idx))
        Y_in_list.append(pad_sequence(dec_in, MAX_LEN, pad_idx))
        Y_out_list.append(pad_sequence(dec_out, MAX_LEN, pad_idx))

    X = np.array(X_list, dtype=np.int32)
    Y_in = np.array(Y_in_list, dtype=np.int32)
    Y_out = np.array(Y_out_list, dtype=np.int32)

    np.save(os.path.join(OUTPUT_DIR, f"{lang}_X.npy"), X)
    np.save(os.path.join(OUTPUT_DIR, f"{lang}_Yin.npy"), Y_in)
    np.save(os.path.join(OUTPUT_DIR, f"{lang}_Yout.npy"), Y_out)

    # The check mark was stored as mis-decoded UTF-8 ("âœ”"); restored here.
    print(f"✔ Saved {lang} arrays:")
    print("  X    :", X.shape)
    print("  Y_in :", Y_in.shape)
    print("  Y_out:", Y_out.shape)

print("\nAll languages processed successfully!")


In [None]:
%pip install tensorflow
%pip install scikit-learn

LSTM Model

In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, LSTM, Embedding, Dense, Dropout,
    TimeDistributed, Attention, Concatenate
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.saving import register_keras_serializable


@register_keras_serializable()
class CastToFloat32(tf.keras.layers.Layer):
    """Keras layer that casts its input tensor to float32."""

    def call(self, inputs):
        as_float32 = tf.cast(inputs, dtype=tf.float32)
        return as_float32


def pad_sequence(seq, max_len, pad_idx):
    """Right-pad `seq` with `pad_idx` up to `max_len`, or truncate it to `max_len`."""
    if len(seq) >= max_len:
        return seq[:max_len]
    return seq + [pad_idx] * (max_len - len(seq))


def decode_sequence(id_list, itos):
    """
    Map token ids back to text using `itos`.

    The control tokens <pad>, <sos> and <eos> are omitted from the result.
    """
    control_tokens = {"<pad>", "<sos>", "<eos>"}
    symbols = (itos[token_id] for token_id in id_list)
    return ''.join(sym for sym in symbols if sym not in control_tokens)


def encode_string(text, stoi):
    """
    Encode `text` character by character with the `stoi` table.

    The output is framed with the <sos> and <eos> ids; characters that are
    not in `stoi` are encoded as <pad>.
    """
    ids = [stoi["<sos>"]]
    ids.extend(stoi.get(ch, stoi["<pad>"]) for ch in text)
    ids.append(stoi["<eos>"])
    return ids

def edit_distance(a, b):
    """
    Return the Levenshtein distance between sequences `a` and `b`.

    Insertions, deletions and substitutions each cost 1. Only two rows of
    the dynamic-programming table are kept at a time.
    """
    # previous[j] = distance between the processed prefix of `a` and b[:j].
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        current = [i]
        for j, item_b in enumerate(b, start=1):
            substitution = previous[j - 1] + (0 if item_a == item_b else 1)
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]


# Candidate words that nearest_keyword() may snap a model prediction to.
#
# Each list must contain every word the typo datasets were generated from;
# otherwise a correct prediction for a missing word is "corrected" to some
# other keyword within edit distance 2 (e.g. "read" -> "break", "map" -> "as").
# The original entries keep their positions so existing tie-breaking (first
# closest match wins) is unchanged; dataset words that were missing are
# appended after them.
LANG_KEYWORDS = {
    "python": [
        "def","return","if","else","elif","for","while","break","continue",
        "import","from","as","class","try","except","lambda","global","with",
        "print","input","len","range","int","float","str","True","False","None",
        # words present in the generated python dataset
        "open","read","write","strip","split","join","format",
        "self","__init__","__str__","enumerate","map","filter"
    ],
    "java": [
        "public","class","static","void","main","String","if","else","for",
        "while","return","int","float","boolean","try","catch","import","package",
        # words present in the generated java dataset
        "new","this","extends","implements","throws","interface","System.out.println"
    ],
    "c": [
        "int","float","if","else","for","while","return","printf","scanf",
        "define","char","void","main","switch","case","break",
        # words present in the generated c dataset
        "include","#include","malloc","free","sizeof","struct","typedef"
    ]
}


def nearest_keyword(word, lang_keywords, max_dist=2):
    """
    Snap `word` to the closest entry of `lang_keywords`.

    Distances are case-insensitive edit distances. The first keyword with
    the smallest distance wins; it is returned only if that distance is at
    most `max_dist`, otherwise `word` is returned unchanged.
    """
    lowered = word.lower()
    closest, closest_dist = word, 999
    for candidate in lang_keywords:
        dist = edit_distance(lowered, candidate.lower())
        if dist >= closest_dist:
            continue
        closest, closest_dist = candidate, dist
    if closest_dist > max_dist:
        return word
    return closest


def build_model(vocab_size, max_len, embedding_dim=256, lstm_units=384):
    """
    Build and compile the LSTM encoder-decoder typo corrector.

    Inputs (both of shape (batch, max_len), token ids):
      * "encoder_input": the encoded typo
      * "decoder_input": the <sos>-prefixed target (teacher forcing)

    The decoder LSTM is initialised with the encoder's final states; a Keras
    `Attention` layer (query = decoder outputs, value = encoder outputs) is
    concatenated with the decoder outputs before a per-timestep softmax over
    `vocab_size`. The model is compiled with Adam and sparse categorical
    cross-entropy.
    """
    # ----- encoder -----
    enc_in = Input(shape=(max_len,), name="encoder_input")
    enc_x = Embedding(vocab_size, embedding_dim)(enc_in)
    enc_x = Dropout(0.3)(enc_x)
    encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    enc_seq, final_h, final_c = encoder_lstm(enc_x)
    enc_seq = CastToFloat32()(enc_seq)

    # ----- decoder -----
    dec_in = Input(shape=(max_len,), name="decoder_input")
    dec_x = Embedding(vocab_size, embedding_dim)(dec_in)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    dec_seq, _, _ = decoder_lstm(dec_x, initial_state=[final_h, final_c])
    dec_seq = CastToFloat32()(dec_seq)

    # ----- attention + output projection -----
    attended = Attention()([dec_seq, enc_seq])
    merged = Concatenate()([dec_seq, attended])
    probs = TimeDistributed(Dense(vocab_size, activation="softmax"))(merged)

    model = Model([enc_in, dec_in], probs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model


def predict_typo(model, word, stoi, itos, max_len, lang_keywords):
    """
    Greedy-decode a correction for `word` and snap it to a known keyword.

    The decoder input starts as [<sos>] and grows by the arg-max token of
    each step until <eos> is produced or `max_len` steps have run. <pad>
    predictions are fed back to the decoder but left out of the text. The
    decoded string is passed through nearest_keyword() before returning.
    """
    pad_id = stoi["<pad>"]
    encoder_batch = np.array([pad_sequence(encode_string(word, stoi), max_len, pad_id)])

    generated_ids = [stoi["<sos>"]]
    pieces = []

    for _ in range(max_len):
        decoder_batch = np.array([pad_sequence(generated_ids, max_len, pad_id)])
        probs = model.predict([encoder_batch, decoder_batch], verbose=0)

        # The prediction for the next token sits at the last filled position.
        next_id = np.argmax(probs[0, len(generated_ids) - 1])
        symbol = itos[next_id]

        if symbol == "<eos>":
            break
        if symbol != "<pad>":
            pieces.append(symbol)

        generated_ids.append(int(next_id))

    return nearest_keyword(''.join(pieces), lang_keywords)


LANGS = ["python", "java", "c"]

# Train, save and evaluate one LSTM encoder-decoder model per language using
# the arrays written to "processed/" by the preprocessing cell.
for lang in LANGS:
    print(f"TRAINING LSTM MODEL FOR: {lang.upper()}")

    X = np.load(f"processed/{lang}_X.npy")
    Y_in = np.load(f"processed/{lang}_Yin.npy")
    Y_out = np.load(f"processed/{lang}_Yout.npy")

    # The vocab tables were saved as pickled dicts, hence allow_pickle + .item().
    stoi = np.load(f"processed/{lang}_stoi.npy", allow_pickle=True).item()
    itos = np.load(f"processed/{lang}_itos.npy", allow_pickle=True).item()

    vocab_size = len(stoi)
    max_len = X.shape[1]

    # Recover the text of both sides: the typo (model input) and the correct
    # word (expected output). Both are needed for the word-level evaluation.
    typos_raw = [decode_sequence(seq, itos) for seq in X]
    words_raw = [decode_sequence(seq, itos) for seq in Y_out]

    # All sequences are split with the same shuffle, so rows stay aligned.
    (
        X_train, X_test,
        Y_in_train, Y_in_test,
        Y_out_train, Y_out_test,
        typos_raw_train, typos_raw_test,
        words_raw_train, words_raw_test,
    ) = train_test_split(
        X, Y_in, Y_out, typos_raw, words_raw, test_size=0.15, random_state=42
    )

    # sparse_categorical_crossentropy expects targets with a trailing axis.
    Y_out_train_exp = np.expand_dims(Y_out_train, -1)
    Y_out_test_exp = np.expand_dims(Y_out_test, -1)

    model = build_model(vocab_size, max_len)

    early = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
    lr_reduce = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6)

    model.fit(
        [X_train, Y_in_train],
        Y_out_train_exp,
        validation_split=0.2,
        batch_size=32,
        epochs=30,
        callbacks=[early, lr_reduce],
        verbose=1
    )

    os.makedirs("models", exist_ok=True)
    save_path = f"models/{lang}_lstm_typo_corrector.keras"
    model.save(save_path)
    print(f"Saved model: {save_path}")

    loss, acc = model.evaluate([X_test, Y_in_test], Y_out_test_exp, verbose=0)
    print(f"Token Accuracy ({lang}): {acc:.4f}")

    # Word accuracy: feed the *typo* and compare against the correct word.
    # (Feeding the already-correct word would only measure whether the model
    # leaves correct input alone, not whether it fixes typos.)
    correct = 0
    for typo, expected in zip(typos_raw_test, words_raw_test):
        if predict_typo(model, typo, stoi, itos, max_len, LANG_KEYWORDS[lang]) == expected:
            correct += 1

    print(f"Word Accuracy ({lang}): {correct/len(words_raw_test):.4f}")

    print("\nSAMPLE PREDICTIONS:")
    test_words = ["pritn", "whiel", "retunr", "flase", "ture", "inptu"]
    for w in test_words:
        try:
            print(f"{w:12s} → {predict_typo(model, w, stoi, itos, max_len, LANG_KEYWORDS[lang])}")
        except Exception as exc:
            # Keep going with the remaining samples, but say what went wrong
            # (a bare except would also swallow KeyboardInterrupt).
            print(f"{w:12s} → ERROR ({exc})")


In [None]:
%pip install pandas
%pip install matplotlib

In [None]:
import os
# Dataset location and the languages handled by the cells below.
DATASET_DIR = "typo_datasets"
LANGUAGES = ["python", "java", "c"]
# NOTE(review): a later cell rebinds MODEL_DIR to "./models", so the
# "keras_models" directory created here appears to stay unused — confirm
# whether this cell is still needed.
MODEL_DIR = "keras_models"

os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras import Model
from sklearn.model_selection import train_test_split
import json

# Where the typo CSVs are read from and where transformer artifacts
# (model, metadata, settings) are written.
DATASET_DIR = "./typo_datasets"
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Language name -> CSV file name inside DATASET_DIR.
LANG_FILES = {
    "c": "typo_data_c.csv",
    "java": "typo_data_java.csv",
    "python": "typo_data_python.csv",
}

# Seed the NumPy and TensorFlow global RNGs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


def positional_encoding(position, d_model):
    """
    Build a sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: embedding width (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model). Even columns hold
        sin(pos / 10000^(2i/d_model)), odd columns the matching cos.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    # Columns 2i and 2i+1 share the same frequency, hence dims // 2.
    angle_rates = 1 / np.power(10000, (2 * (dims // 2)) / d_model)
    # Broadcasting yields the full (position, d_model) table directly; no
    # separate zero-filled buffer is needed.
    angle_rads = positions * angle_rates
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    return tf.cast(angle_rads[np.newaxis, ...], tf.float32)


def build_vocab(df):
    """
    Build character lookup tables from the 'typo' and 'correct' columns.

    Returns (char2idx, idx2char). Indices 0, 1, 2 are <pad>, <start>, <end>;
    all characters found in either column follow in sorted order.
    """
    alphabet = set("".join(df['typo'])) | set("".join(df['correct']))
    tokens = ["<pad>", "<start>", "<end>"] + sorted(alphabet)
    char2idx = {tok: pos for pos, tok in enumerate(tokens)}
    idx2char = dict(enumerate(tokens))
    return char2idx, idx2char


def encode_texts(texts, char2idx, max_len):
    """
    Encode each item of `texts` as a fixed-length row of character ids.

    Every item is converted with str(), framed as <start> ... <end>, then
    truncated or right-padded with <pad> to exactly `max_len` ids. Characters
    missing from `char2idx` are encoded as <pad>. Returns a NumPy array with
    one row per input item.
    """
    pad_id = char2idx["<pad>"]
    start_id = char2idx["<start>"]
    end_id = char2idx["<end>"]

    rows = []
    for text in texts:
        ids = [start_id, *(char2idx.get(ch, pad_id) for ch in str(text)), end_id]
        ids = ids[:max_len]
        ids.extend([pad_id] * (max_len - len(ids)))
        rows.append(ids)
    return np.array(rows)


def decode(ids, idx2char):
    """Join the characters for `ids`, leaving out <pad>, <start> and <end>."""
    control_tokens = ("<pad>", "<start>", "<end>")
    pieces = []
    for token_id in ids:
        symbol = idx2char[token_id]
        if symbol not in control_tokens:
            pieces.append(symbol)
    return "".join(pieces)


class TransformerBlock(Layer):
    """
    One post-norm transformer encoder block.

    Self-attention and a two-layer feed-forward network, each followed by
    dropout (rate 0.1), a residual connection and layer normalisation.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        """
        Args:
            embed_dim: width of the input/output features; also used as the
                per-head key dimension of the attention layer.
            num_heads: number of attention heads.
            ff_dim: hidden width of the feed-forward network.
        """
        super().__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward network projects back to embed_dim so it can be added
        # to the residual stream.
        self.ffn = tf.keras.Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.l1 = LayerNormalization()
        self.l2 = LayerNormalization()
        self.d1 = Dropout(0.1)
        self.d2 = Dropout(0.1)

    def call(self, x, training=False):
        """Apply the block to `x`; `training` toggles the dropout layers."""
        # Self-attention: query and value are both `x`. No attention mask is
        # passed, so padded positions take part in attention.
        att_out = self.att(x, x)
        att_out = self.d1(att_out, training=training)
        out1 = self.l1(x + att_out)
        ffn_out = self.ffn(out1)
        ffn_out = self.d2(ffn_out, training=training)
        return self.l2(out1 + ffn_out)


class CharacterTransformer(Model):
    """
    Character-level transformer that predicts one output token per input
    position (no separate decoder): embedding + positional encoding, a stack
    of TransformerBlock layers and a linear projection to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim=128, heads=4, ff_dim=256, layers=2, max_pos=100):
        """
        Args:
            vocab_size: number of tokens; size of the embedding table and of
                the output logits.
            embed_dim: embedding / model width.
            heads: attention heads per block.
            ff_dim: feed-forward hidden width per block.
            layers: number of TransformerBlock layers.
            max_pos: number of positions in the positional-encoding table;
                inputs longer than this are not covered by the table.
        """
        super().__init__()
        # Constructor arguments are kept so get_config() can reproduce them.
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.heads = heads
        self.ff_dim = ff_dim
        self.num_layers = layers
        self.max_pos = max_pos

        self.embedding = Embedding(vocab_size, embed_dim)
        # Precomputed (1, max_pos, embed_dim) sinusoidal table.
        self.pos = positional_encoding(max_pos, embed_dim)

        self.blocks = [TransformerBlock(embed_dim, heads, ff_dim) for _ in range(layers)]
        # No activation: the model outputs raw logits.
        self.final = Dense(vocab_size)

    def call(self, x, training=False):
        """Return per-position vocabulary logits for the token-id batch `x`."""
        length = tf.shape(x)[1]
        # Slice the positional table to the actual sequence length.
        x = self.embedding(x) + self.pos[:, :length, :]
        for blk in self.blocks:
            x = blk(x, training=training)
        return self.final(x)

    def get_config(self):
        """Return the constructor arguments as a dict (base config is not included)."""
        return {
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
            "heads": self.heads,
            "ff_dim": self.ff_dim,
            "layers": self.num_layers,
            "max_pos": self.max_pos,
        }

    @classmethod
    def from_config(cls, cfg):
        """Rebuild the model from a dict produced by get_config()."""
        return cls(**cfg)

def train_language(lang, file_name):
    """
    Train a CharacterTransformer on one language's typo CSV and save it.

    Args:
        lang: language name, used in log output and in the artifact file names.
        file_name: CSV file inside DATASET_DIR with 'typo' and 'correct' columns.

    Writes three files to MODEL_DIR:
        <lang>_typer.keras            the trained model
        <lang>_typer.metadata.npz     vocabulary (in index order) and max_len
        <lang>_model_settings.json    char2idx mapping and max_len
    """
    print(f"\nTraining {lang}...")

    # Read every cell as a plain string. With pandas' default NA handling,
    # tokens such as "None", "null" or "NA" would be parsed as missing values
    # and later turn into the literal text "nan".
    df = pd.read_csv(
        os.path.join(DATASET_DIR, file_name),
        dtype=str,
        keep_default_na=False,
    )
    df['typo'] = df['typo'].astype(str)
    df['correct'] = df['correct'].astype(str)

    char2idx, idx2char = build_vocab(df)
    vocab = len(char2idx)

    # +2 leaves room for the <start> and <end> markers.
    max_len = max(df['typo'].str.len().max(), df['correct'].str.len().max()) + 2

    X = encode_texts(df["typo"], char2idx, max_len)
    Y = encode_texts(df["correct"], char2idx, max_len)

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(500).batch(32)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

    model = CharacterTransformer(vocab, max_pos=max_len)
    # The model emits raw logits, hence from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    optim = tf.keras.optimizers.Adam()

    train_acc = tf.keras.metrics.SparseCategoricalAccuracy()
    test_acc = tf.keras.metrics.SparseCategoricalAccuracy()

    for epoch in range(1, 31):
        train_acc.reset_state()
        test_acc.reset_state()
        total_loss = 0
        batches = 0

        for inp, tar in train_ds:
            with tf.GradientTape() as tape:
                pred = model(inp, training=True)
                L = loss(tar, pred)

            grads = tape.gradient(L, model.trainable_variables)
            optim.apply_gradients(zip(grads, model.trainable_variables))

            train_acc.update_state(tar, pred)
            total_loss += float(L)
            batches += 1

        # Accuracy on the held-out split, measured after every epoch.
        for inp, tar in test_ds:
            pred = model(inp, training=False)
            test_acc.update_state(tar, pred)

        print(f"Epoch {epoch:02d} | Loss:{total_loss/batches:.4f} | Train:{train_acc.result()*100:.2f}% | Test:{test_acc.result()*100:.2f}%")

    # The arrows in the messages below were stored as mis-decoded UTF-8
    # ("â†’"); they are restored to the intended character.
    model_path = os.path.join(MODEL_DIR, f"{lang}_typer.keras")
    model.save(model_path)
    print(f"Saved model → {model_path}")

    # char2idx was built with enumerate(), so its key order is index order.
    meta_path = os.path.join(MODEL_DIR, f"{lang}_typer.metadata.npz")
    np.savez(meta_path, vocab=np.array(list(char2idx.keys())), max_len=max_len)
    print(f"Saved metadata → {meta_path}")

    settings_path = os.path.join(MODEL_DIR, f"{lang}_model_settings.json")
    with open(settings_path, "w", encoding="utf-8") as f:
        # int() because max_len is a NumPy integer, which json cannot encode.
        json.dump({"char2idx": char2idx, "max_len": int(max_len)}, f, indent=4)

    print(f"Saved settings → {settings_path}")


# Train and save one transformer model per entry of LANG_FILES.
for lang, file_name in LANG_FILES.items():
    train_language(lang, file_name)


def predict_language_word(lang, word):
    """
    Correct `word` with the saved transformer model for `lang`.

    Loads "<lang>_typer.keras" and "<lang>_typer.metadata.npz" from MODEL_DIR,
    runs a single forward pass and decodes the arg-max token of every
    position. Prints "<word> → <result>" and returns the result string.
    Returns None (after printing a message) when the model file is missing.

    The model is reloaded from disk on every call.
    """
    model_path = f"{MODEL_DIR}/{lang}_typer.keras"
    meta_path = f"{MODEL_DIR}/{lang}_typer.metadata.npz"

    if not os.path.exists(model_path):
        print("Model not found.")
        return

    model = tf.keras.models.load_model(
        model_path,
        custom_objects={"CharacterTransformer": CharacterTransformer}
    )

    # The metadata holds a plain string array and an integer, so pickle
    # support is not needed; the context manager closes the archive handle.
    with np.load(meta_path) as meta:
        chars = [str(c) for c in meta["vocab"]]
        max_len = int(meta["max_len"])

    # The vocabulary was saved in index order, so enumerate() restores the
    # original char <-> index mapping.
    char2idx = {c: i for i, c in enumerate(chars)}
    idx2char = {i: c for i, c in enumerate(chars)}

    inp = encode_texts([word], char2idx, max_len)
    logits = model(inp, training=False)

    pred = tf.argmax(logits[0], axis=-1).numpy()
    result = decode(pred, idx2char)

    # The arrow was stored as mis-decoded UTF-8 ("â†’"); restored here.
    print(f"{word} → {result}")
    return result

In [None]:
import tensorflow as tf
# Show the devices (CPU/GPU/...) TensorFlow can see in this environment.
print(tf.config.list_physical_devices())

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, SimpleRNN, Dense, Dropout,
    Bidirectional, LayerNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

# Languages for which an RNN model is trained below.
LANGS = ["python", "java", "c"]
def pad_sequence(seq, max_len, pad_idx):
    """Return `seq` right-padded with `pad_idx` or truncated so it has `max_len` items."""
    missing = max_len - len(seq)
    if missing <= 0:
        return seq[:max_len]
    return seq + missing * [pad_idx]


def encode_string(text, stoi):
    """
    Convert `text` to token ids framed by <sos> and <eos>.

    Characters without an entry in `stoi` fall back to the <pad> id.
    """
    return [
        stoi["<sos>"],
        *(stoi.get(ch, stoi["<pad>"]) for ch in text),
        stoi["<eos>"],
    ]


def decode_sequence(id_list, itos):
    """Translate token ids to text via `itos`, skipping <pad>, <sos> and <eos>."""
    control_tokens = ("<pad>", "<sos>", "<eos>")
    return ''.join(
        [itos[token_id] for token_id in id_list if itos[token_id] not in control_tokens]
    )

# Train, save and spot-check one bidirectional SimpleRNN tagger per language.
# The model maps each input position of the encoded typo to one output token.
for lang in LANGS:

    print("\n" + "="*80)
    print(f"TRAINING RNN MODEL FOR LANGUAGE: {lang.upper()}")
    print("="*80)

    X = np.load(f"processed/{lang}_X.npy")
    Y_out = np.load(f"processed/{lang}_Yout.npy")
    # The vocab tables were saved as pickled dicts, hence allow_pickle + .item().
    stoi = np.load(f"processed/{lang}_stoi.npy", allow_pickle=True).item()
    itos = np.load(f"processed/{lang}_itos.npy", allow_pickle=True).item()

    vocab_size = len(stoi)
    max_seq_len = X.shape[1]

    # sparse_categorical_crossentropy expects targets with a trailing axis.
    Y_expanded = np.expand_dims(Y_out, -1)

    # The arrows in the messages of this cell were stored as mis-decoded
    # UTF-8 ("â†’"); they are restored to the intended character.
    print(f"Loaded shapes → X: {X.shape}, Y: {Y_expanded.shape}")

    embedding_dim = 128
    hidden_units = 256
    dropout_rate = 0.25
    learning_rate = 0.0005

    inputs = Input(shape=(max_seq_len,), name="input_layer")

    # mask_zero=True treats id 0 (<pad>) as padding for downstream layers.
    x = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=True,
        name="embedding_layer"
    )(inputs)

    x = Dropout(dropout_rate)(x)

    x = Bidirectional(SimpleRNN(
        hidden_units,
        return_sequences=True,
        recurrent_dropout=0.2,
    ), name="bidir_rnn_1")(x)

    x = LayerNormalization()(x)
    x = Dropout(dropout_rate)(x)

    x = SimpleRNN(
        hidden_units,
        return_sequences=True,
        recurrent_dropout=0.2,
        name="rnn_layer_2"
    )(x)

    x = LayerNormalization()(x)
    x = Dropout(dropout_rate)(x)

    x = Dense(128, activation='relu')(x)
    x = Dropout(dropout_rate)(x)

    outputs = Dense(vocab_size, activation='softmax')(x)

    rnn_model = Model(inputs, outputs, name=f"RNN_{lang}")

    rnn_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    rnn_model.summary()

    callbacks = [
        EarlyStopping(
            monitor="val_loss",
            patience=8,
            restore_best_weights=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.5,
            patience=4,
            min_lr=1e-6,
            verbose=1
        )
    ]

    history = rnn_model.fit(
        X, Y_expanded,
        batch_size=64,
        epochs=30,
        validation_split=0.15,
        callbacks=callbacks,
        verbose=1
    )

    os.makedirs("models", exist_ok=True)
    save_path = f"models/{lang}_rnn_typo_corrector.keras"
    rnn_model.save(save_path)

    print(f"\nSaved model to {save_path}")

    def predict_correction(typo):
        """Run the current language's model on `typo` and decode the arg-max tokens."""
        encoded = pad_sequence(encode_string(typo, stoi), max_seq_len, stoi["<pad>"])
        X_input = np.array([encoded])

        preds = rnn_model.predict(X_input, verbose=0)
        pred_ids = np.argmax(preds[0], axis=-1)

        out_chars = [
            itos[i] for i in pred_ids
            if itos[i] not in ["<pad>", "<sos>", "<eos>"]
        ]
        return "".join(out_chars)

    # The magnifier emoji was stored as mis-decoded UTF-8 ("ðŸ”Ž"); restored.
    print("\n🔎 SAMPLE TESTS")
    test_typos = ["pritn", "whiel", "reutrn", "fo", "esle", "inpt", "rnage"]
    for typo in test_typos:
        print(f"{typo:<12} → {predict_correction(typo)}")

    # Re-evaluate on the last 15% of the rows, i.e. the same tail slice that
    # fit(validation_split=0.15) held out for validation.
    val_split = int(len(X) * 0.85)
    X_val = X[val_split:]
    Y_val = Y_expanded[val_split:]

    val_loss, val_acc = rnn_model.evaluate(X_val, Y_val, verbose=0)

    print("\nVALIDATION RESULTS")
    print(f"Loss:      {val_loss:.4f}")
    print(f"Accuracy:  {val_acc:.4f}")


print("\n" + "="*80)
print("ALL LANGUAGES TRAINED SUCCESSFULLY!")
print("="*80)