# Rephrasing Chemistry  

### The Hurst parameter could allow us to predict downstream performance.  

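To calculate it: run a language model (LM) over the text. For every token, take the probability the model assigned to the actual next token and convert it to the number of bits required to encode that token (−log₂ p). Use the resulting per-token sequence of bits as the time series from which the Hurst parameter is estimated.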

Create the dataset

In [None]:
import torch
from torch.utils.data import Dataset
import os

# NOTE(review): not referenced by the visible cells (the model-loading cell
# defines its own ``batch_size``) — confirm whether this is still needed.
BATCH_SIZE = 4


class TextDataset(Dataset):
    """Dataset of fixed-size word chunks read from the ``.mmd`` files of a folder.

    Every ``.mmd`` file is split on whitespace and re-joined with single
    spaces into consecutive chunks of at most ``chunk_size`` words; the last
    chunk of a file may be shorter. Files are visited in sorted name order so
    that chunk indices are reproducible between runs.

    Args:
        markdown_folder: Directory that is scanned (non-recursively) for
            files whose name ends with ``.mmd``.
        chunk_size: Maximum number of words per chunk. Defaults to 1000.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """

    # Class-level default so ``load_paragraphs`` also works on an instance
    # whose ``__init__`` did not run.
    chunk_size = 1000

    def __init__(self, markdown_folder, chunk_size=1000):
        super().__init__()
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
        self.chunk_size = chunk_size
        self.paragraphs = self.load_paragraphs(markdown_folder)

    def load_paragraphs(self, folder):
        """Return the list of word chunks found in ``folder``.

        A file that cannot be opened or read (``OSError``) is reported on
        stdout and skipped; the remaining files are still loaded.
        """
        size = self.chunk_size
        chunks = []
        # os.listdir() returns entries in arbitrary order; sort them so the
        # dataset index of a chunk is stable.
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith('.mmd'):
                continue
            file_path = os.path.join(folder, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    words = file.read().split()
            except OSError as e:
                print(f"Failed to read {file_path}: {e}")
                continue
            chunks.extend(
                " ".join(words[start:start + size])
                for start in range(0, len(words), size)
            )
        return chunks

    def __len__(self):
        return len(self.paragraphs)

    def __getitem__(self, idx):
        return self.paragraphs[idx]

In [None]:
from torch.utils.data import Dataset
import json

class InputDataset(Dataset):
    """Dataset over the ``input_text`` entries of a JSON results file.

    The file is parsed as JSON and must contain a ``results`` list whose
    items each provide an ``input_text`` key.
    """

    def __init__(self, filename):
        super().__init__()
        self.input_texts = self.load_paragraphs(filename)

    def load_paragraphs(self, filename):
        """Collect every ``input_text`` stored in ``filename``.

        An ``IOError`` raised while opening or reading the file is printed
        and results in an empty list; other errors (malformed JSON, missing
        keys) propagate to the caller.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
                return [entry['input_text'] for entry in payload['results']]
        except IOError as e:
            print(f"Failed to read {filename}: {e}")
        return []

    def __len__(self):
        # One sample per loaded input text.
        return len(self.input_texts)

    def __getitem__(self, idx):
        return self.input_texts[idx]

In [None]:
from torch.utils.data import Dataset
import json

class OutputDataset(Dataset):
    """Dataset over the ``output_text`` entries of a JSON results file.

    The file is parsed as JSON and must contain a ``results`` list whose
    items each provide an ``output_text`` key.
    """

    def __init__(self, filename):
        super().__init__()
        self.output_texts = self.load_paragraphs(filename)

    def load_paragraphs(self, filename):
        """Collect every ``output_text`` stored in ``filename``.

        An ``IOError`` raised while opening or reading the file is printed
        and results in an empty list; other errors (malformed JSON, missing
        keys) propagate to the caller.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
                return [entry['output_text'] for entry in payload['results']]
        except IOError as e:
            print(f"Failed to read {filename}: {e}")
        return []

    def __len__(self):
        # One sample per loaded output text.
        return len(self.output_texts)

    def __getitem__(self, idx):
        return self.output_texts[idx]

Load the model for the paper

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
from torch.nn.functional import softmax
from torch.utils.data import DataLoader
from tqdm import tqdm
import pprint as pp

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at ``model.to(device)``.
device = "cuda" if torch.cuda.is_available() else "cpu"
models_path = {"qwen": "Qwen/Qwen1.5-7B-Chat", "mistral-inst": "mistralai/Mistral-7B-Instruct-v0.1", "zephyr": "HuggingFaceH4/zephyr-7b-alpha"}
# Model used to score the texts; switch the key to evaluate another model.
MODELPATH = models_path["qwen"]

batch_size = 4
# Input and output texts are read from the same results file, so both
# datasets are built from one path.
RESULTS_FILE = "./wikipedia/10-Papers-Mistral-7B-Instruct-v0.1.json"
dataset_input = InputDataset(RESULTS_FILE)
dataset_output = OutputDataset(RESULTS_FILE)
# shuffle=False keeps both loaders in file order, so the i-th input and the
# i-th output come from the same ``results`` entry.
dataloader_out = DataLoader(dataset_output, batch_size=batch_size, shuffle=False)
dataloader_in = DataLoader(dataset_input, batch_size=batch_size, shuffle=False)

model = AutoModelForCausalLM.from_pretrained(MODELPATH)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(MODELPATH)
# The config is loaded for ``max_position_embeddings``, used as the
# truncation length when tokenizing.
config = AutoConfig.from_pretrained(MODELPATH)
# Reuse EOS as the pad token so tokenizer calls with ``padding=True`` always
# have a pad token available.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
        
import torch
import torch.nn.functional as F

# Accumulators for the per-chunk scoring records: one list for the input
# texts and a separate one for the output texts.
results_in, results_out = [], []

def calculate_probability_and_perplexity(input_ids, model):
    """Score a token sequence with a causal language model.

    Args:
        input_ids: Integer tensor of token ids. The indexing below
            (``gather`` along dimension 2 of the logits) requires it to be
            two-dimensional, ``(batch, seq_len)``.
        model: Callable invoked as ``model(input_ids, labels=input_ids)``
            whose result exposes ``.logits`` (indexed here as
            ``(batch, seq_len, vocab)``) and a scalar ``.loss``.

    Returns:
        A tuple ``(actual_next_token_probs, perplexity)``:

        * ``actual_next_token_probs`` — tensor of shape
          ``(batch, seq_len - 1)``. Despite the name these are not
          probabilities but the surprisal of each actual next token in
          bits, i.e. ``-log2 p(token_t | tokens_<t)``.
        * ``perplexity`` — ``exp(loss)`` as a Python float, using the loss
          the model reports for ``labels=input_ids``.
    """
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

        # Position t of the logits predicts token t + 1, so drop the last
        # prediction and the first label to line the two up.
        shift_logits = outputs.logits[..., :-1, :]
        shift_labels = input_ids[..., 1:]

        # Natural-log probabilities over the vocabulary at every position.
        log_probs = F.log_softmax(shift_logits, dim=-1)

        # Pick the log-probability of the token that actually came next,
        # then convert only those values from nats to bits and flip the sign
        # (gathering first avoids rescaling the whole vocabulary axis).
        token_log_probs = torch.gather(log_probs, 2, shift_labels.unsqueeze(-1)).squeeze(-1)
        actual_next_token_probs = token_log_probs / torch.log(torch.tensor(2.0)) * -1

        # The model's loss is the mean negative log-likelihood in nats.
        perplexity = torch.exp(outputs.loss).item()

        return actual_next_token_probs, perplexity

results_probs_next_token_original = []


def _score_chunks(dataloader, results):
    """Score every text yielded by ``dataloader`` and append a record to ``results``.

    Each record is a dict with the keys ``"input_text"`` (the scored text —
    the same key is used for both the input and the output dataset),
    ``"actual_next_token_probs"`` (per-token tensor returned by
    ``calculate_probability_and_perplexity``) and ``"perplexity"``.
    """
    for batch in tqdm(dataloader):
        # Texts are scored one at a time, so each sequence keeps its own
        # length and the per-token series of different texts never mix.
        for chunk in batch:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=config.max_position_embeddings)
            input_ids = inputs.input_ids.to(device)

            actual_next_token_probs, perplexity = calculate_probability_and_perplexity(input_ids, model)

            results.append({
                "input_text": chunk,
                # Keep the stored tensor on the CPU: holding one device
                # tensor per scored text would keep accelerator memory
                # allocated for the whole run.
                "actual_next_token_probs": actual_next_token_probs.cpu(),
                "perplexity": perplexity,
            })


_score_chunks(dataloader_in, results_in)
_score_chunks(dataloader_out, results_out)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from hurst import compute_Hc

# Report how many texts were scored rather than dumping every record (each
# record carries a full per-token tensor).
print(f"Scored {len(results_in)} input texts and {len(results_out)} output texts")


def _bits_series(record):
    """Return the record's per-token series (first batch row) as a NumPy array."""
    return record["actual_next_token_probs"][0].detach().cpu().numpy()


# Step 1: keep only the input/output pairs where BOTH series are longer than
# 100 tokens, so every series passed to compute_Hc has enough points.
# zip() pairs the i-th input with the i-th output and stops at the shorter
# list instead of raising an IndexError.
results_in_filtered = []
results_out_filtered = []

for record_in, record_out in zip(results_in, results_out):
    if len(record_in["actual_next_token_probs"][0]) > 100 and len(record_out["actual_next_token_probs"][0]) > 100:
        results_in_filtered.append(record_in)
        results_out_filtered.append(record_out)

# Step 2: estimate the Hurst exponent of every remaining series.
hurst_in = []
hurst_out = []

for record_in, record_out in zip(results_in_filtered, results_out_filtered):
    H_in, _, _ = compute_Hc(_bits_series(record_in), kind='change', simplified=True)
    hurst_in.append(H_in)
    H_out, _, _ = compute_Hc(_bits_series(record_out), kind='change', simplified=True)
    hurst_out.append(H_out)

# Step 3: keep only the pairs where both exponents exceed 0.5. The record
# lists are filtered together with the exponent lists so that index i refers
# to the same text in results_*_filtered, hurst_in, hurst_out, hurst_ratio
# and hurst_diff.
filtered_hurst_in = []
filtered_hurst_out = []
kept_results_in = []
kept_results_out = []

hurst_ratio = []
hurst_diff = []

for record_in, record_out, H_in, H_out in zip(results_in_filtered, results_out_filtered, hurst_in, hurst_out):
    if H_in > 0.5 and H_out > 0.5:
        filtered_hurst_in.append(H_in)
        filtered_hurst_out.append(H_out)
        kept_results_in.append(record_in)
        kept_results_out.append(record_out)
        hurst_ratio.append(H_in / H_out)
        hurst_diff.append(H_out - H_in)

hurst_in = filtered_hurst_in
hurst_out = filtered_hurst_out
results_in_filtered = kept_results_in
results_out_filtered = kept_results_out

# Plot for Hurst exponents
f, ax = plt.subplots()
ax.plot(hurst_in, label='Input', color="purple")
ax.plot(hurst_out, label='Output', color="deepskyblue")
ax.set_xlabel('Sample Index')
ax.set_ylabel('Hurst Exponent')
ax.grid(True)
ax.legend()
plt.show()

# Plot for Hurst ratio
f, ax = plt.subplots()
ax.plot(hurst_ratio, label='Hurst Ratio (Input/Output)', color="orange")
ax.set_xlabel('Sample Index')
ax.set_ylabel('Hurst Ratio')
ax.grid(True)
ax.legend()
plt.show()

# Plot for Hurst difference (this previously re-plotted hurst_ratio under the
# "Hurst Diff" label).
f, ax = plt.subplots()
ax.plot(hurst_diff, label='Hurst Diff (Output - Input)', color="orange")
ax.set_xlabel('Sample Index')
ax.set_ylabel('Hurst Diff')
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# NOTE(review): the lookups below are only meaningful if results_in_filtered /
# results_out_filtered have the same length and order as hurst_in / hurst_out
# — confirm.

# Sample whose input text has the lowest Hurst exponent.
incomprehensible_text_id = np.argmin(hurst_in)
print(incomprehensible_text_id)
for records, exponents in ((results_in_filtered, hurst_in), (results_out_filtered, hurst_out)):
    print(records[incomprehensible_text_id]["input_text"])
    print(exponents[incomprehensible_text_id])

# Sample with the smallest input/output Hurst ratio.
bestter_text = np.argmin(hurst_ratio)
for records, exponents in ((results_in_filtered, hurst_in), (results_out_filtered, hurst_out)):
    print(records[bestter_text]["input_text"])
    print(exponents[bestter_text])