<a href="https://colab.research.google.com/github/Sharuk-baba/Research/blob/main/Koopman%20Operator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers sentence-transformers numpy scipy scikit-learn matplotlib torch

import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# Hugging Face checkpoint name used for the causal language model and its tokenizer.
MODEL_NAME = "gpt2-medium"  # or "mistralai/Mistral-7B-Instruct" if on A100 Colab Pro+

# Module-level tokenizer and model loaded from MODEL_NAME; generate_samples() and
# generate_greedy() read these globals directly.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
# Module-level sentence-embedding model; embed_text() encodes text with this object.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embed_text(samples):
    """Encode ``samples`` with the module-level ``embedder``.

    Args:
        samples: Passed unchanged to ``embedder.encode``.

    Returns:
        The value returned by ``embedder.encode`` when called with
        ``convert_to_tensor=True``.
    """
    encoded = embedder.encode(samples, convert_to_tensor=True)
    return encoded

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
def generate_samples(prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.9):
    """Sample one continuation of ``prompt`` from the module-level ``model``.

    Args:
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k cutoff forwarded to ``model.generate``.
        top_p: Nucleus (top-p) cutoff forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded to a string with special tokens
        removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        encoded["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=encoded.get("attention_mask"),
        max_length=max_length,
        # Without do_sample=True generate() decodes greedily and the
        # temperature / top_k / top_p arguments have no effect.
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        # Models without a dedicated pad token (e.g. GPT-2) fall back to EOS;
        # stating it here avoids the per-call warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [5]:
def koopman_operator(X, reg=1e-3):
    """Fit a ridge-regularised linear one-step map from a sequence of snapshots.

    Consecutive rows of ``X`` are paired as (current, next) and the matrix ``K``
    is obtained by solving ``(X1.T @ X1 + reg * I) @ K = X1.T @ X2`` where
    ``X1 = X[:-1]`` and ``X2 = X[1:]``, so that ``X[t] @ K`` approximates
    ``X[t + 1]``.

    Args:
        X: 2-D array-like of snapshots, one row per step. Tensor-like objects
            exposing ``detach().cpu().numpy()`` are converted to NumPy first.
        reg: Coefficient of the identity added to ``X1.T @ X1``.

    Returns:
        ``numpy.ndarray`` of shape ``(X.shape[1], X.shape[1])``.

    Raises:
        ValueError: If ``X`` is not 2-D or has fewer than two rows (no
            consecutive pair exists, so the fit would silently be all zeros).
        numpy.linalg.LinAlgError: If the regularised system is singular.
    """
    # Move tensor-like inputs to host memory so the algebra below never mixes
    # tensor and ndarray operands.
    if hasattr(X, "detach"):
        X = X.detach().cpu().numpy()
    X = np.asarray(X)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (steps, features); got {X.ndim} dimension(s)")
    if X.shape[0] < 2:
        raise ValueError(
            f"X must contain at least two snapshots to form a transition; got {X.shape[0]}"
        )

    X1, X2 = X[:-1], X[1:]
    gram = X1.T @ X1 + reg * np.eye(X1.shape[1])
    return np.linalg.solve(gram, X1.T @ X2)

In [6]:
def predict_trajectory(X, K, steps=3):
    """Roll the last row of ``X`` forward by repeated right-multiplication with ``K``.

    Args:
        X: Sequence of states, one per row; only ``X[-1]`` is used as the start.
        K: Matrix applied as ``state @ K`` at every step.
        steps: Number of multiplications to perform.

    Returns:
        ``numpy.ndarray`` whose first row is ``X[-1]`` followed by one row per
        step (``steps + 1`` rows when ``steps`` is non-negative).

    Tensor-like ``X`` or ``K`` (objects exposing ``detach().cpu().numpy()``) are
    converted to NumPy first so that ``@`` and ``np.array`` never receive a mix
    of tensors and ndarrays.
    """
    if hasattr(X, "detach"):
        X = X.detach().cpu().numpy()
    if hasattr(K, "detach"):
        K = K.detach().cpu().numpy()
    X = np.asarray(X)
    K = np.asarray(K)

    trajectory = [X[-1]]
    for _ in range(steps):
        trajectory.append(trajectory[-1] @ K)
    return np.array(trajectory)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def rerank_samples(samples, koopman_predicted):
    """Order ``samples`` by cosine similarity to a predicted embedding.

    Args:
        samples: Texts to rank; they are embedded with ``embed_text``.
        koopman_predicted: Target vector. It is flattened to a single row with
            ``reshape(1, -1)``, so it must hold exactly one vector whose length
            matches the embedding width.

    Returns:
        A list of the samples, most similar first. Samples with equal
        similarity keep their input order.
    """
    embeddings = embed_text(samples)
    # embed_text returns tensors (possibly on an accelerator); cosine_similarity
    # needs host arrays, so convert explicitly rather than rely on implicit casts.
    if hasattr(embeddings, "detach"):
        embeddings = embeddings.detach().cpu().numpy()
    if hasattr(koopman_predicted, "detach"):
        koopman_predicted = koopman_predicted.detach().cpu().numpy()
    target = np.asarray(koopman_predicted).reshape(1, -1)

    similarities = cosine_similarity(target, embeddings)
    # Sort on the score only: comparing whole (score, sample) tuples would fall
    # back to comparing the samples themselves whenever two scores are equal.
    ranked = sorted(zip(similarities[0], samples), key=lambda pair: pair[0], reverse=True)
    return [sample for _, sample in ranked]

In [8]:
def generate_greedy(prompt, max_length=50):
    """Generate a deterministic (non-sampled) continuation of ``prompt``.

    Uses the module-level ``tokenizer`` and ``model`` with ``do_sample=False``.

    Args:
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded to a string with special tokens
        removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        encoded["input_ids"],
        # Pass the mask and pad id explicitly so generate() does not have to
        # infer them (and warn) on every call.
        attention_mask=encoded.get("attention_mask"),
        max_length=max_length,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [9]:
def plot_trajectory(real, predicted):
    """Draw ``real`` and ``predicted`` on the current pyplot figure and show it.

    Each argument is passed to ``plt.plot`` as the only data argument, so it is
    plotted against its index. A legend is added before the figure is shown.

    Args:
        real: Data plotted under the label "Real Trajectory".
        predicted: Data plotted under the label "Predicted Trajectory".
    """
    labelled_series = (
        (real, "Real Trajectory"),
        (predicted, "Predicted Trajectory"),
    )
    for values, label in labelled_series:
        plt.plot(values, label=label)
    plt.legend()
    plt.show()

In [10]:
class KoopmanDecodingPipeline:
    """Generate text, embed it, fit a linear one-step operator and rerank.

    Every method works with the tokenizer, language model and embedder loaded by
    this instance, so ``model_name`` / ``embedder_name`` actually determine what
    the pipeline runs.
    """

    def __init__(self, model_name="gpt2-medium", embedder_name="all-MiniLM-L6-v2"):
        """Load the tokenizer and causal LM for ``model_name`` and the sentence embedder."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.embedder = SentenceTransformer(embedder_name)

    @staticmethod
    def _to_numpy(values):
        """Return ``values`` as a host ndarray; tensor-like inputs are detached and moved to CPU."""
        if hasattr(values, "detach"):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    def generate_candidates(self, prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.9):
        """Sample one continuation of ``prompt`` with this instance's model.

        Args:
            prompt: Text to continue.
            max_length: Forwarded to ``self.model.generate`` as ``max_length``.
            temperature: Sampling temperature.
            top_k: Top-k cutoff.
            top_p: Nucleus (top-p) cutoff.

        Returns:
            The first generated sequence decoded to a string with special
            tokens removed.
        """
        encoded = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(
            encoded["input_ids"],
            attention_mask=encoded.get("attention_mask"),
            max_length=max_length,
            # Required for temperature / top_k / top_p to take effect.
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def compute_embeddings(self, samples):
        """Encode ``samples`` with this instance's embedder (``convert_to_tensor=True``)."""
        return self.embedder.encode(samples, convert_to_tensor=True)

    def fit_koopman(self, embeddings):
        """Fit the one-step operator on ``embeddings`` via ``koopman_operator``."""
        # Hand NumPy to the solver so tensors are never mixed with ndarrays.
        return koopman_operator(self._to_numpy(embeddings))

    def predict_optimal(self, X, K):
        """Roll the last row of ``X`` forward with ``K`` via ``predict_trajectory``."""
        return predict_trajectory(self._to_numpy(X), self._to_numpy(K))

    def rerank_samples(self, samples, koopman_predicted):
        """Order ``samples`` by cosine similarity to ``koopman_predicted``.

        ``koopman_predicted`` is flattened to a single row with
        ``reshape(1, -1)``, so it must hold exactly one vector whose length
        matches the embedding width. Samples with equal similarity keep their
        input order.
        """
        embeddings = self._to_numpy(self.compute_embeddings(samples))
        target = self._to_numpy(koopman_predicted).reshape(1, -1)
        scores = cosine_similarity(target, embeddings)[0]
        # Sort on the score only so ties never fall back to comparing samples.
        ranked = sorted(zip(scores, samples), key=lambda pair: pair[0], reverse=True)
        return [sample for _, sample in ranked]