In [None]:
import tensorflow as tf
import numpy as np
import pickle
import os
import re
import math  # For ceil

# --- USER: SET YOUR PRETRAINING DATA FILE PATH AND EPOCHS HERE ---
# Plain-text corpus; it is split into paragraphs on blank lines ("\n\n").
PRETRAINING_DATA_FILE_PATH = "/content/Physics_Dataset.txt"  # <--- EDIT THIS
# Upper bound on epochs; the EarlyStopping callback in run_pretraining() may stop sooner.
NUM_EPOCHS_PRETRAIN = 26                                   # <--- EDIT THIS (optional)
# --- END USER SETTINGS ---

# --- Hyperparameters (Shared) ---
# Passed to the Keras Tokenizer as num_words; the model layers use VOCAB_SIZE + 1.
VOCAB_SIZE = 8000
# Output dimension of the Embedding layer.
EMBEDDING_DIM = 128
# Number of units in the single LSTM layer.
LSTM_UNITS = 256
# Dropout applied between the LSTM and the softmax output layer.
DROPOUT_RATE = 0.2
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# BATCH_SIZE now refers to the number of sequences per training step,
# processed by the generator.
BATCH_SIZE = 64
# Paragraphs are truncated to this many tokens; model inputs are padded to
# MAX_PARAGRAPH_LEN - 1 tokens.
MAX_PARAGRAPH_LEN = 250

# --- Fixed File Paths (Shared) ---
# The model and the pickled tokenizer are read from / written to these paths.
MODEL_SAVE_PATH = "paragraph_lm.keras"
TOKENIZER_SAVE_PATH = "paragraph_tokenizer.pkl"

# --- Utility Functions (Shared) ---
def clean_text(text):
    """Normalise raw text before it is handed to the tokenizer.

    Steps, in order: lower-case, drop ``[...]`` spans, collapse whitespace
    runs to a single space and trim the ends, then remove every character
    that is not a-z, 0-9, whitespace or one of ``. , ! ? '``.
    """
    lowered = text.lower()
    without_brackets = re.sub(r"\[.*?\]", "", lowered)
    # Trimming happens before the character filter, exactly as listed above.
    single_spaced = re.sub(r"\s+", " ", without_brackets).strip()
    return re.sub(r"[^a-z0-9\s\.\,\!\?\']", "", single_spaced)

def load_tokenizer(path):
    """Return the object pickled at *path*, or None when no such file exists.

    NOTE: unpickling executes arbitrary code, so only load tokenizer files
    that this project wrote itself.
    """
    if not os.path.exists(path):
        return None

    with open(path, 'rb') as pickled_file:
        loaded = pickle.load(pickled_file)
    print(f"Tokenizer loaded from {path}")
    return loaded

def save_tokenizer(tokenizer, path):
    """Pickle *tokenizer* to *path* (highest pickle protocol) and report it."""
    with open(path, 'wb') as out_file:
        pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Tokenizer saved to {path}")

def build_model(model_vocab_size, embedding_dim, lstm_units, dropout_rate, max_len_for_embedding_layer):
    """Build the (uncompiled) next-word prediction model.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax).

    Args:
        model_vocab_size: Size of the embedding table and of the softmax output.
        embedding_dim: Output dimension of the Embedding layer.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout rate applied to the LSTM output.
        max_len_for_embedding_layer: Maximum paragraph length in tokens; the
            model input holds one token fewer because the last token of a
            sequence is the prediction target.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    sequence_length = max_len_for_embedding_layer - 1
    model = tf.keras.Sequential([
        # An explicit Input layer fixes the sequence length. The Embedding
        # `input_length` argument used previously is deprecated in Keras 3,
        # where it only produces a warning.
        tf.keras.Input(shape=(sequence_length,), dtype="int32"),
        tf.keras.layers.Embedding(
            input_dim=model_vocab_size,
            output_dim=embedding_dim
        ),
        tf.keras.layers.LSTM(lstm_units),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(model_vocab_size, activation='softmax')
    ])
    return model

def load_or_initialize_model(
    model_path,
    model_vocab_size,
    embedding_dim,
    lstm_units,
    dropout_rate,
    max_len_for_embedding_layer,
    learning_rate
):
    """Return a compiled model, loaded from *model_path* when that file exists.

    If no file exists at *model_path*, a fresh model is created with
    build_model() from the architecture arguments. Either way the model is
    compiled with Adam at *learning_rate*, categorical cross-entropy loss and
    the accuracy metric.
    """
    if os.path.exists(model_path):
        print(f"Loading existing model from {model_path}")
        model = tf.keras.models.load_model(model_path)
        print(f"Re-compiling loaded model with learning rate: {learning_rate}")
    else:
        print("Initializing new model.")
        model = build_model(
            model_vocab_size,
            embedding_dim,
            lstm_units,
            dropout_rate,
            max_len_for_embedding_layer
        )

    # Both branches compile identically, so it is done once here.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# --- DATA GENERATOR ---
class PretrainingDataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that yields next-word training batches.

    The text is split into paragraphs on blank lines. Each paragraph is
    cleaned, tokenised and truncated to ``max_paragraph_len`` tokens, and
    every prefix of it becomes one sample: (prefix tokens, following token).
    Padding and one-hot encoding are done per batch in ``__getitem__`` so the
    full padded / one-hot arrays never have to sit in memory at once.

    Args:
        text_data_str: Whole training corpus as one string.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        batch_size: Number of samples per batch.
        max_paragraph_len: Maximum number of tokens kept per paragraph.
        model_vocab_size: Number of output classes (VOCAB_SIZE + 1).
        shuffle: Whether to reshuffle the samples at the end of each epoch.
        **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, text_data_str, tokenizer, batch_size, max_paragraph_len, model_vocab_size,
                 shuffle=True, **kwargs):
        # The base constructor must run; Keras 3 warns when it is skipped
        # (see the `_warn_if_super_not_called` message in the training log).
        super().__init__(**kwargs)
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_paragraph_len = max_paragraph_len  # Max length of original paragraph part to consider
        self.model_vocab_size = model_vocab_size    # VOCAB_SIZE + 1
        self.shuffle = shuffle

        self.raw_paragraphs = [
            p.strip()
            for p in text_data_str.strip().split('\n\n')
            if p.strip()
        ]

        # Create all (input_token_list, target_token_idx) pairs upfront.
        self.samples = []
        print("Preprocessing data to generate all samples for the generator...")
        for para_idx, raw_paragraph in enumerate(self.raw_paragraphs):
            if (para_idx + 1) % 500 == 0:
                print(f"  Processing paragraph {para_idx + 1}/{len(self.raw_paragraphs)} for sample creation")

            cleaned_paragraph = clean_text(raw_paragraph)
            token_list = self.tokenizer.texts_to_sequences([cleaned_paragraph])[0]

            # At least two tokens are needed to form one (input, target) pair.
            if not token_list or len(token_list) < 2:
                continue

            # Truncate paragraph tokens if longer than max_paragraph_len
            token_list = token_list[:self.max_paragraph_len]

            for i in range(1, len(token_list)):
                input_seq_tokens = token_list[:i]  # Input is up to word i-1
                target_token_id = token_list[i]    # Target is word i

                # Ensure target_token_id is valid
                if target_token_id < self.model_vocab_size:
                    self.samples.append((input_seq_tokens, target_token_id))

        print(f"DataGenerator initialized with {len(self.samples)} total training samples.")
        # Shuffle once up front so the first epoch is not in corpus order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        return math.ceil(len(self.samples) / self.batch_size)

    def __getitem__(self, batch_idx):
        """Return batch *batch_idx* as (padded inputs, one-hot targets)."""
        batch_start_idx = batch_idx * self.batch_size
        batch_end_idx = (batch_idx + 1) * self.batch_size
        batch_samples = self.samples[batch_start_idx:batch_end_idx]

        batch_input_token_lists = [sample[0] for sample in batch_samples]
        batch_target_token_ids = [sample[1] for sample in batch_samples]

        # Pad input sequences for this batch
        padded_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            batch_input_token_lists,
            maxlen=self.max_paragraph_len - 1,
            padding='pre',
            truncating='pre'
        )

        # One-hot encode target words for this batch
        categorical_target_words = tf.keras.utils.to_categorical(
            batch_target_token_ids,
            num_classes=self.model_vocab_size
        )

        return padded_input_sequences, categorical_target_words

    def on_epoch_end(self):
        """Reshuffle the samples when ``shuffle`` is enabled."""
        if self.shuffle:
            indices = np.arange(len(self.samples))
            np.random.shuffle(indices)
            self.samples = [self.samples[i] for i in indices]

# --- END DATA GENERATOR ---

def run_pretraining():
    """Fit the tokenizer, then train (or continue training) the language model.

    Reads the corpus at PRETRAINING_DATA_FILE_PATH, fits and saves the
    tokenizer, builds the sample generator, loads or creates the model, trains
    it for up to NUM_EPOCHS_PRETRAIN epochs and saves it to MODEL_SAVE_PATH.
    Returns None; it returns early, after printing a message, when the data
    file is missing or no training samples could be generated.
    """
    print("\n--- Starting Pretraining ---")

    # Read the entire text data for tokenizer fitting and generator sample creation
    try:
        with open(PRETRAINING_DATA_FILE_PATH, 'r', encoding='utf-8') as f:
            text_data = f.read()
    except FileNotFoundError:
        # Same style as run_sft(): report the problem instead of a traceback.
        print(f"Error: Pretraining data file not found at '{PRETRAINING_DATA_FILE_PATH}'")
        return

    # --- Tokenizer Initialization and Fitting ---
    paragraphs_for_tokenizer = [
        clean_text(p)
        for p in text_data.strip().split('\n\n')
        if p.strip()
    ]
    tokenizer = load_tokenizer(TOKENIZER_SAVE_PATH)
    if tokenizer:
        print("Updating existing tokenizer with new pretraining data.")
        if tokenizer.num_words != VOCAB_SIZE:
            print(
                f"Warning: Loaded tokenizer num_words ({tokenizer.num_words}) "
                f"!= VOCAB_SIZE ({VOCAB_SIZE}). Re-initializing."
            )
            tokenizer = None
    else:
        print("Creating new tokenizer for pretraining.")

    if tokenizer is None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, oov_token="<unk>")
    # NOTE(review): fitting an already-fitted tokenizer adds to its word
    # counts and rebuilds word_index, so token ids may shift relative to the
    # ids an existing model at MODEL_SAVE_PATH was trained with. Confirm this
    # is intended before continuing training on a different corpus.
    tokenizer.fit_on_texts(paragraphs_for_tokenizer)

    # One extra class beyond the VOCAB_SIZE tokenizer ids (id 0 is padding).
    model_vocab_size = VOCAB_SIZE + 1
    print(f"Effective vocabulary size for model layers: {model_vocab_size}")
    print(f"Tokenizer actual word_index size: {len(tokenizer.word_index)}")
    save_tokenizer(tokenizer, TOKENIZER_SAVE_PATH)

    # --- Initialize Data Generator ---
    print("Initializing data generator...")
    pretrain_generator = PretrainingDataGenerator(
        text_data_str=text_data,
        tokenizer=tokenizer,
        batch_size=BATCH_SIZE,
        max_paragraph_len=MAX_PARAGRAPH_LEN,
        model_vocab_size=model_vocab_size
    )

    if len(pretrain_generator.samples) == 0:
        print("No training samples were generated. Check your data or preprocessing. Aborting.")
        return

    # --- Load or Initialize Model ---
    model = load_or_initialize_model(
        MODEL_SAVE_PATH,
        model_vocab_size,
        EMBEDDING_DIM,
        LSTM_UNITS,
        DROPOUT_RATE,
        MAX_PARAGRAPH_LEN,
        LEARNING_RATE
    )
    model.summary()

    # --- Train the Model using the Generator ---
    # There is no validation split, so early stopping watches the training loss.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

    print(f"\nStarting pretraining for {NUM_EPOCHS_PRETRAIN} epochs using data generator...")
    model.fit(
        pretrain_generator,
        epochs=NUM_EPOCHS_PRETRAIN,
        callbacks=[early_stopping]
    )

    model.save(MODEL_SAVE_PATH)
    print(f"Pretrained model saved to {MODEL_SAVE_PATH}")
    print("--- Pretraining Finished ---")

# Start pretraining when this code runs as the main module.
if __name__ == "__main__":
    run_pretraining()



--- Starting Pretraining ---
Creating new tokenizer for pretraining.
Effective vocabulary size for model layers: 8001
Tokenizer actual word_index size: 12443
Tokenizer saved to paragraph_tokenizer.pkl
Initializing data generator...
Preprocessing data to generate all samples for the generator...
  Processing paragraph 500/4232 for sample creation
  Processing paragraph 1000/4232 for sample creation
  Processing paragraph 1500/4232 for sample creation
  Processing paragraph 2000/4232 for sample creation
  Processing paragraph 2500/4232 for sample creation
  Processing paragraph 3000/4232 for sample creation
  Processing paragraph 3500/4232 for sample creation
  Processing paragraph 4000/4232 for sample creation
DataGenerator initialized with 480349 total training samples.
Initializing new model.




  self._warn_if_super_not_called()



Starting pretraining for 6 epochs using data generator...
Epoch 1/6
7506/7506 ━━━━━━━━━━━━━━━━━━━━ 154s 20ms/step - accuracy: 0.1587 - loss: 5.9311
Epoch 2/6
7506/7506 ━━━━━━━━━━━━━━━━━━━━ 149s 20ms/step - accuracy: 0.2811 - loss: 4.4765
Epoch 3/6
7506/7506 ━━━━━━━━━━━━━━━━━━━━ 150s 20ms/step - accuracy: 0.3247 - loss: 3.9062
Epoch 4/6
7506/7506 ━━━━━━━━━━━━━━━━━━━━ 149s 20ms/step - accuracy: 0.3543 - loss: 3.5471
Epoch 5/6
7506/7506 ━━━━━━━━━━━━━━━━━━━━ 150s 20ms/step - accuracy: 0.3775 - loss: 3.2947
Epoch 6/6
7506/7506 ━━━━━━━━━━━━━━━━━━━━ 151s 20ms/step - accuracy: 0.3988 - loss: 3.0852
Pretrained model saved to paragraph_lm.keras
--- Pretraining Finished ---


In [None]:
import tensorflow as tf
import numpy as np
import pickle
import os
import re
import json # For SFT data
# import tensorflow as tf
# ... other imports ...

# Switch the global Keras dtype policy to mixed_float16. This is done
# unconditionally; the code does not check whether a GPU is present.
# NOTE(review): a model loaded from MODEL_SAVE_PATH presumably keeps the dtype
# policy it was saved with - confirm this setting affects the loaded model.
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
print('Mixed precision enabled')

# --- USER: SET YOUR SFT DATA FILE PATH AND EPOCHS HERE ---
# The file is read line by line with json.loads, i.e. one JSON object per line.
SFT_DATA_FILE_PATH = "/content/new.json"  # <--- EDIT THIS (JSONL format: {"prompt": "...", "response": "..."})
# Upper bound on epochs; the EarlyStopping callback in run_sft() may stop sooner.
NUM_EPOCHS_SFT = 5                          # <--- EDIT THIS (optional)
# --- END USER SETTINGS ---

# --- Hyperparameters (Shared) ---
VOCAB_SIZE = 8000        # Must match pretraining. Determines model layer sizes.
# Architecture settings, kept identical to the pretraining cell.
EMBEDDING_DIM = 128
LSTM_UNITS = 256
DROPOUT_RATE = 0.2
SFT_LEARNING_RATE = 0.0001 # Typically smaller for fine-tuning
# Number of training sequences per batch of the tf.data pipeline in run_sft().
BATCH_SIZE = 128
MAX_PARAGRAPH_LEN = 250  # Must match pretraining.

# --- Fixed File Paths (Shared) ---
# run_sft() loads the pretrained model from MODEL_SAVE_PATH and overwrites it
# with the fine-tuned model; the tokenizer file is only read.
MODEL_SAVE_PATH = "paragraph_lm.keras"
TOKENIZER_SAVE_PATH = "paragraph_tokenizer.pkl"

# --- Utility Functions (Shared - Copied for script independence) ---
def clean_text(text):
    """Return *text* normalised the same way as in the pretraining cell.

    The text is lower-cased, ``[...]`` spans are removed, whitespace runs are
    collapsed to one space and the ends trimmed, and finally every character
    outside a-z, 0-9, whitespace and ``. , ! ? '`` is dropped.
    """
    result = re.sub(r"\[.*?\]", "", text.lower())
    result = re.sub(r"\s+", " ", result)
    # Trim before the character filter runs.
    result = result.strip()
    return re.sub(r"[^a-z0-9\s\.\,\!\?\']", "", result)

def load_tokenizer(path):
    """Return the object pickled at *path*; print an error and return None if absent.

    NOTE: unpickling executes arbitrary code, so only load tokenizer files
    that this project wrote itself.
    """
    if not os.path.exists(path):
        print(f"ERROR: Tokenizer not found at {path}. Pretrain first.")
        return None

    with open(path, 'rb') as pickled_file:
        loaded = pickle.load(pickled_file)
    print(f"Tokenizer loaded from {path}")
    return loaded

def save_tokenizer(tokenizer, path): # Might be used if tokenizer is updated during SFT
    """Pickle *tokenizer* to *path* (highest pickle protocol) and report it."""
    with open(path, 'wb') as out_file:
        pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Tokenizer saved to {path}")

def build_model(model_vocab_size, embedding_dim, lstm_units, dropout_rate, max_len_for_embedding_layer):
    """Build the (uncompiled) next-word prediction model.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax). SFT normally
    loads an existing model; this builder is kept so the cell has the same
    helpers as the pretraining cell.

    Args:
        model_vocab_size: Size of the embedding table and of the softmax output.
        embedding_dim: Output dimension of the Embedding layer.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout rate applied to the LSTM output.
        max_len_for_embedding_layer: Maximum paragraph length in tokens; the
            model input holds one token fewer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    sequence_length = max_len_for_embedding_layer - 1
    model = tf.keras.Sequential([
        # An explicit Input layer fixes the sequence length. The Embedding
        # `input_length` argument used previously is deprecated in Keras 3,
        # where it only produces a warning.
        tf.keras.Input(shape=(sequence_length,), dtype="int32"),
        tf.keras.layers.Embedding(input_dim=model_vocab_size,
                                  output_dim=embedding_dim),
        tf.keras.layers.LSTM(lstm_units),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(model_vocab_size, activation='softmax')
    ])
    return model




def preprocess_sft_data(sft_pairs, tokenizer, max_len, model_vocab_size_for_output_layer):
    """Turn (prompt, response) pairs into next-word training arrays.

    Each pair is joined as "prompt response", cleaned, tokenised and truncated
    to *max_len* tokens. Every prefix of the token list becomes one input and
    the token that follows it becomes the target.

    Returns:
        (padded_inputs, one_hot_targets), with inputs pre-padded to
        ``max_len - 1`` tokens, or (None, None) when no sequence was produced.
    """
    total_pairs = len(sft_pairs)
    print(f"Found {total_pairs} SFT pairs for preprocessing.")

    contexts = []
    next_tokens = []
    for pair_number, (prompt, response) in enumerate(sft_pairs, start=1):
        if pair_number % 100 == 0:
            print(f"Processing SFT pair {pair_number}/{total_pairs}")

        # Prompt and response are trained on as one continuous text.
        combined = clean_text(prompt + " " + response)
        tokens = tokenizer.texts_to_sequences([combined])[0]
        if len(tokens) < 2:
            # Too short to form a single (context, next token) pair.
            continue

        tokens = tokens[:max_len]
        for split_at in range(1, len(tokens)):
            next_token = tokens[split_at]
            if next_token < model_vocab_size_for_output_layer:
                contexts.append(tokens[:split_at])
                next_tokens.append(next_token)

    if not contexts:
        print("No valid SFT sequences generated.")
        return None, None

    padded_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        contexts, maxlen=max_len-1, padding='pre'
    )
    categorical_target_words = tf.keras.utils.to_categorical(
        next_tokens, num_classes=model_vocab_size_for_output_layer
    )
    print(f"Generated {len(padded_input_sequences)} SFT training sequences.")
    return padded_input_sequences, categorical_target_words
# --- End Utility Functions ---

def run_sft():
    """Fine-tune the pretrained model on prompt/response pairs.

    Reads JSONL records from SFT_DATA_FILE_PATH, loads the tokenizer and the
    pretrained model, trains for up to NUM_EPOCHS_SFT epochs at
    SFT_LEARNING_RATE and overwrites MODEL_SAVE_PATH with the result.
    Returns None; each failure path prints a message and returns early.
    """
    print("\n--- Starting Supervised Fine-Tuning ---")

    if SFT_DATA_FILE_PATH == "your_sft_data.jsonl":
        print("ERROR: Please edit 'SFT_DATA_FILE_PATH' in the script with your actual data file path.")
        return

    sft_pairs = []
    try:
        with open(SFT_DATA_FILE_PATH, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, start=1):
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue
                try:
                    data = json.loads(line)
                    prompt, response = data['prompt'], data['response']
                except json.JSONDecodeError:
                    print(f"Warning: SFT data, line {line_num}: Could not parse JSON: {line.strip()}")
                    continue
                except (KeyError, TypeError):
                    # TypeError: the line parsed, but not to a JSON object.
                    print(f"Warning: SFT data, line {line_num}: Missing 'prompt' or 'response': {line.strip()}")
                    continue
                if not isinstance(prompt, str) or not isinstance(response, str):
                    # Non-string fields would fail later when the texts are joined.
                    print(f"Warning: SFT data, line {line_num}: 'prompt' and 'response' must be strings: {line.strip()}")
                    continue
                sft_pairs.append((prompt, response))
    except FileNotFoundError:
        print(f"Error: SFT data file not found at '{SFT_DATA_FILE_PATH}'")
        return

    if not sft_pairs:
        print("No SFT data loaded. Aborting SFT.")
        return

    tokenizer = load_tokenizer(TOKENIZER_SAVE_PATH)
    if not tokenizer:
        return # Error message already printed by load_tokenizer

    # Important: VOCAB_SIZE hyperparameter must be consistent with the pretraining phase
    # as it determines the model's layer dimensions.
    # The tokenizer.num_words should reflect this.
    if tokenizer.num_words != VOCAB_SIZE:
        print(f"CRITICAL WARNING: Loaded tokenizer has num_words={tokenizer.num_words}, "
              f"but SFT script's VOCAB_SIZE is {VOCAB_SIZE}. These must match the settings "
              f"used during pretraining when the model was created/last saved. Aborting.")
        return

    # The pre-trained model MUST exist for SFT. Check before the expensive
    # preprocessing so a missing model fails fast instead of silently
    # training a freshly initialised network.
    if not os.path.exists(MODEL_SAVE_PATH):
        print(f"ERROR: Pretrained model not found at {MODEL_SAVE_PATH}. Pretrain first.")
        print("SFT aborted due to model loading issues.")
        return

    model_vocab_size = VOCAB_SIZE + 1 # Consistent with pretraining
    print(f"Using effective vocabulary size for SFT model layers: {model_vocab_size}")

    # Note: We are NOT refitting the tokenizer on SFT data here by default.
    # If SFT data has many new words critical for learning, consider:
    # 1. Adding SFT text to pretraining data and re-pretraining.
    # 2. Carefully updating tokenizer and potentially resizing model embedding/output layers (complex).
    # For this setup, new words in SFT data will be treated as <unk> if not in pre-trained tokenizer.

    input_seqs, target_words_cat = preprocess_sft_data(
        sft_pairs, tokenizer, MAX_PARAGRAPH_LEN, model_vocab_size
    )

    if input_seqs is None:
        print("SFT aborted: No sequences generated.")
        return

    # Load and re-compile here, so this cell does not depend on a helper
    # defined only in the pretraining cell.
    print(f"Loading existing model from {MODEL_SAVE_PATH}")
    model = tf.keras.models.load_model(MODEL_SAVE_PATH)
    print(f"Re-compiling loaded model with learning rate: {SFT_LEARNING_RATE}")
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=SFT_LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.summary()

    dataset = tf.data.Dataset.from_tensor_slices((input_seqs, target_words_cat))
    dataset = dataset.shuffle(buffer_size=len(input_seqs)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    # There is no validation split, so early stopping watches the training loss.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

    print(f"\nStarting fine-tuning for {NUM_EPOCHS_SFT} epochs...")
    model.fit(dataset, epochs=NUM_EPOCHS_SFT, callbacks=[early_stopping])

    model.save(MODEL_SAVE_PATH) # Overwrites the model with the fine-tuned one
    print(f"Fine-tuned model saved to {MODEL_SAVE_PATH}")
    print("--- Supervised Fine-Tuning Finished ---")

# Start fine-tuning when this code runs as the main module.
if __name__ == "__main__":
    run_sft()

Mixed precision enabled

--- Starting Supervised Fine-Tuning ---
Tokenizer loaded from paragraph_tokenizer.pkl
Using effective vocabulary size for SFT model layers: 8001
Found 4232 SFT pairs for preprocessing.
Processing SFT pair 100/4232
Processing SFT pair 200/4232
Processing SFT pair 300/4232
Processing SFT pair 400/4232
Processing SFT pair 500/4232
Processing SFT pair 600/4232
Processing SFT pair 700/4232
Processing SFT pair 800/4232
Processing SFT pair 900/4232
Processing SFT pair 1000/4232
Processing SFT pair 1100/4232
Processing SFT pair 1200/4232
Processing SFT pair 1300/4232
Processing SFT pair 1400/4232
Processing SFT pair 1500/4232
Processing SFT pair 1600/4232
Processing SFT pair 1700/4232
Processing SFT pair 1800/4232
Processing SFT pair 1900/4232
Processing SFT pair 2000/4232
Processing SFT pair 2100/4232
Processing SFT pair 2200/4232
Processing SFT pair 2300/4232
Processing SFT pair 2400/4232
Processing SFT pair 2500/4232
Processing SFT pair 2600/4232
Processing SFT pair

In [None]:
import tensorflow as tf
import numpy as np
import pickle
import os
import re

# --- Hyperparameters (Must match model training) ---
VOCAB_SIZE = 8000        # Must match the VOCAB_SIZE used for training the loaded model
# Prompts are padded to MAX_PARAGRAPH_LEN - 1 tokens before prediction.
MAX_PARAGRAPH_LEN = 250  # Must match the MAX_PARAGRAPH_LEN used for training

# --- Fixed File Paths (Shared) ---
# Both files are only read by this cell.
MODEL_SAVE_PATH = "paragraph_lm.keras"
TOKENIZER_SAVE_PATH = "paragraph_tokenizer.pkl"

# --- Utility Functions (Shared - Copied for script independence) ---
def clean_text(text):
    """Normalise a prompt exactly as the training cells normalise their text.

    Inference must apply the same cleaning as training, otherwise characters
    that were stripped from the training data (bracketed spans, symbols,
    non-ASCII letters) reach the tokenizer and produce tokens the model never
    saw in that position.

    Steps, in order: lower-case, drop ``[...]`` spans, collapse whitespace
    runs to a single space and trim the ends, then remove every character
    that is not a-z, 0-9, whitespace or one of ``. , ! ? '``.
    """
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[^a-z0-9\s\.\,\!\?\']", "", text)
    return text

def load_tokenizer(path):
    """Return the object pickled at *path*; print an error and return None if absent.

    NOTE: unpickling executes arbitrary code, so only load tokenizer files
    that this project wrote itself.
    """
    if not os.path.exists(path):
        print(f"ERROR: Tokenizer not found at {path}. Train a model first.")
        return None

    with open(path, 'rb') as pickled_file:
        loaded = pickle.load(pickled_file)
    print(f"Tokenizer loaded from {path}")
    return loaded

def sample_from_probs(probs, temp):
    """Pick an index from *probs* using temperature sampling.

    Args:
        probs: Sequence of non-negative scores, normally a softmax output.
        temp: Sampling temperature. Values <= 0 and non-finite values
            (inf, nan) select the most likely index (greedy decoding).

    Returns:
        The chosen index as a plain ``int``.
    """
    probs = np.asarray(probs).astype('float64')
    if not np.isfinite(temp) or temp <= 0:
        return int(np.argmax(probs))

    # The small epsilon avoids log(0).
    scaled_logits = np.log(probs + 1e-9) / temp
    # Shift so the largest logit is 0 before exponentiating. Without this, a
    # low temperature makes every exp() underflow to 0 and the normalisation
    # turns into 0/0.
    scaled_logits -= np.max(scaled_logits)
    weights = np.exp(scaled_logits)
    preds = weights / np.sum(weights)

    try:
        draw = np.random.multinomial(1, preds, 1)
        return int(np.argmax(draw))
    except ValueError:
        # Fall back to greedy if NumPy rejects the probability vector.
        return int(np.argmax(preds))

def generate_text_for_inference(model, tokenizer, seed_text, max_generated_len=50, temperature=1.0):
    """Extend *seed_text* word by word with the language model.

    Each step re-tokenises the cleaned text generated so far, pads it to the
    model input length, predicts the next-word distribution and samples from
    it with sample_from_probs(). Generation stops after *max_generated_len*
    words, or earlier when the text yields no tokens, when index 0 (padding)
    or the OOV token is sampled, when the sampled index has no word, or when
    the sampled word is '.', '!' or '?'.

    Returns:
        The original, uncleaned *seed_text* followed by the generated words,
        each preceded by a single space.
    """
    print(f"\nGenerating text from seed: '{seed_text}' with temp {temperature}, max_len {max_generated_len}")

    # Output width expected from the model's dense layer: padding id 0 plus
    # the VOCAB_SIZE tokenizer ids.
    expected_output_dim = VOCAB_SIZE + 1
    sentence_enders = ('.', '!', '?')

    context_text = clean_text(seed_text)  # the model only ever sees cleaned text
    new_words = []

    for _ in range(max_generated_len):
        context_tokens = tokenizer.texts_to_sequences([context_text])[0]
        if not context_tokens:
            break

        model_input = tf.keras.preprocessing.sequence.pad_sequences(
            [context_tokens], maxlen=MAX_PARAGRAPH_LEN-1, padding='pre'
        )
        next_word_probs = model.predict(model_input, verbose=0)[0]

        # Sanity check only: a mismatch is reported, generation continues.
        if len(next_word_probs) != expected_output_dim:
            print(f"Warning: Predicted_probs length ({len(next_word_probs)}) "
                  f"mismatches model_vocab_size ({expected_output_dim}).")

        next_index = sample_from_probs(next_word_probs, temperature)
        if next_index == 0:
            # Index 0 is the padding id.
            break

        next_word = tokenizer.index_word.get(next_index)
        if not next_word or next_word == tokenizer.oov_token:
            break

        new_words.append(next_word)
        context_text += " " + next_word  # feeds the next prediction step

        if next_word in sentence_enders:
            break

    # Every generated word is preceded by one space, then appended to the
    # original seed.
    return seed_text + "".join(" " + word for word in new_words)


def run_interactive_inference():
    """Load the tokenizer and model, then answer prompts typed on stdin.

    For each prompt the user is also asked for a temperature (default 1.0)
    and a maximum number of new words (default 50). Typing 'quit' or pressing
    Ctrl-C leaves the loop; any other exception is printed with its traceback
    and the loop continues.
    """
    print("\n--- Starting Interactive Inference Mode ---")

    tokenizer = load_tokenizer(TOKENIZER_SAVE_PATH)
    if not tokenizer:
        return

    # A mismatch between tokenizer.num_words and VOCAB_SIZE is only reported;
    # inference still proceeds.
    if tokenizer.num_words != VOCAB_SIZE:
        print(f"CRITICAL WARNING: Loaded tokenizer has num_words={tokenizer.num_words}, "
              f"but Inference script's VOCAB_SIZE is {VOCAB_SIZE}. These must match the settings "
              f"used during model training. Inference might be incorrect.")

    if not os.path.exists(MODEL_SAVE_PATH):
        print(f"Error: Model not found at {MODEL_SAVE_PATH}. Train a model first.")
        return

    try:
        model = tf.keras.models.load_model(MODEL_SAVE_PATH)
        print("Model loaded successfully.")
    except Exception as load_error:
        print(f"Error loading model: {load_error}")
        return

    default_temperature = 1.0
    default_max_words = 50

    print("\nEnter 'quit' to exit inference mode.")
    while True:
        try:
            user_prompt = input("Enter your prompt: ")
            if user_prompt.lower() == 'quit':
                break
            if not user_prompt.strip():
                print("Prompt cannot be empty.")
                continue

            # Temperature: blank input keeps the default, bad input falls back to it.
            temperature_text = input("Enter temperature (e.g., 0.7, default 1.0, 0 for greedy): ")
            temperature = default_temperature
            if temperature_text.strip():
                try:
                    temperature = float(temperature_text)
                except ValueError:
                    print("Invalid temperature, using 1.0.")

            # Word budget: blank, non-numeric or non-positive input all end up at 50.
            max_words_text = input("Max new words to generate (e.g., 50, default 50): ")
            max_gen_len = default_max_words
            if max_words_text.strip():
                try:
                    max_gen_len = int(max_words_text)
                except ValueError:
                    print("Invalid max length, using 50.")
                else:
                    if max_gen_len <= 0:
                        max_gen_len = default_max_words

            response = generate_text_for_inference(
                model,
                tokenizer,
                user_prompt,
                max_generated_len=max_gen_len,
                temperature=temperature,
            )
            print(f"\nModel Response:\n{response}\n")

        except KeyboardInterrupt:
            print("\nInference interrupted by user.")
            break
        except Exception as inference_error:
            print(f"An error occurred during inference: {inference_error}")
            import traceback
            traceback.print_exc()

    print("--- Exiting Inference Mode ---")

# Start the interactive prompt loop when this code runs as the main module.
if __name__ == "__main__":
    run_interactive_inference()


--- Starting Interactive Inference Mode ---
Tokenizer loaded from paragraph_tokenizer.pkl
Model loaded successfully.

Enter 'quit' to exit inference mode.
Enter your prompt: Energy is the capacity to do work
Enter temperature (e.g., 0.7, default 1.0, 0 for greedy): 0.5
Max new words to generate (e.g., 50, default 50): 25

Generating text from seed: 'Energy is the capacity to do work' with temp 0.5, max_len 25

Model Response:
Energy is the capacity to do work is a fundamental property of waves and is governed by the maximum energy of the light but it is expressed as p t where p

Enter your prompt: quit
--- Exiting Inference Mode ---


In [None]:
!pip install streamlit as st
import tensorflow as tf
import numpy as np
import pickle, re

# --- 1. USER: VERIFY THESE CONSTANTS MATCH YOUR TRAINING SCRIPT ---
VOCAB_SIZE, MAX_PARAGRAPH_LEN = 8000, 250 # <<< EDIT if different from training
MODEL_INPUT_LEN = MAX_PARAGRAPH_LEN - 1   # Model expects input_length = MAX_PARAGRAPH_LEN - 1
MODEL_PATH, TOKENIZER_PATH = "paragraph_lm.keras", "paragraph_tokenizer.pkl"

@st.cache_resource # Load model & tokenizer once
def load_resources():
    try:
        tok = pickle.load(open(TOKENIZER_PATH, 'rb'))
        mod = tf.keras.models.load_model(MODEL_PATH)
        return mod, tok
    except Exception as e:
        st.error(f"🔴 Error loading files: {e}. Ensure '{MODEL_PATH}' & '{TOKENIZER_PATH}' are present.")
        return None, None
model, tokenizer = load_resources()

st.set_page_config(page_title="Nano LLM", layout="centered") # <<< EDIT Page Tab Title
st.title("✍️ Nano Script Spark") # <<< EDIT Main Title

if model and tokenizer:
    prompt = st.text_area("🎬 Your Script Idea:", "In a world filled with magic...", height=80, placeholder="Start typing your script idea here...") # <<< EDIT Prompt Label & Placeholder

    col1, col2 = st.columns([0.6, 0.4]) # Adjust column ratios if needed
    with col1: temp = st.slider("🌡️ Creativity (Temp)", 0.0, 2.0, 0.7, 0.05, help="0: more predictable, >1: more random") # <<< EDIT Slider Label
    with col2: max_new = st.number_input("📝 Max New Words", 10, MODEL_INPUT_LEN - 10, 50, 5) # <<< EDIT Number Input Label

    if st.button("✨ Generate Script!", type="primary", use_container_width=True): # <<< EDIT Button Text
        if prompt.strip():
            with st.spinner("⏳ AI is crafting..."): # <<< EDIT Spinner Text
                try:
                    current_text = re.sub(r"\s+", " ", prompt.lower().strip()) # Basic cleaning
                    gen_suffix = ""
                    for _ in range(max_new):
                        tokens = tokenizer.texts_to_sequences([current_text])[0]
                        if not tokens: break
                        padded = tf.keras.preprocessing.sequence.pad_sequences([tokens], maxlen=MODEL_INPUT_LEN, padding='pre')
                        probs = model.predict(padded, verbose=0)[0].astype('float64')

                        if temp <= 0: idx = np.argmax(probs) # Greedy
                        else: # Sampling with temperature
                            log_probs = np.log(probs + 1e-9) / temp; exp_probs = np.exp(log_probs)
                            norm_probs = exp_probs / (np.sum(exp_probs) + 1e-9)
                            try: idx = np.argmax(np.random.multinomial(1, norm_probs / np.sum(norm_probs), 1))
                            except: idx = np.argmax(norm_probs) # Fallback

                        if idx == 0: break # Predicted padding token
                        word = tokenizer.index_word.get(idx)
                        if not word or word == tokenizer.oov_token: break # Predicted OOV or unknown

                        gen_suffix += " " + word
                        current_text += " " + word
                        if word in ['.', '!', '?'] and len(gen_suffix.strip()) > 1: break # Stop on sentence end

                    st.subheader("📜 Generated Snippet:") # <<< EDIT Output Header
                    st.markdown(f"> {prompt.strip()}{gen_suffix.strip()}") # Display with quote style
                except Exception as e: st.error(f"⚠️ Generation error: {e}")
        else: st.warning("💡 Please enter a script idea to start.") # <<< EDIT Empty Prompt Warning
else:
    st.error("🔴 Critical: Model or Tokenizer could not be loaded. Check file paths and console.")

# To run: streamlit run app.py

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
ERROR: Ignored the following versions that require a different python version: 0.55.2 Requires-Python <3.5
ERROR: Could not find a version that satisfies the requirement as (from versions: none)
ERROR: No matching distribution found for as

NameError: name 'st' is not defined