In [None]:
!pip install outlines
!pip install transformers bitsandbytes>=0.39.0 -q
!pip install accelerate

In [1]:
!huggingface-cli login --token ''

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import matplotlib.pyplot as plt
from dataclasses import dataclass
from transformers import AutoModelForCausalLM, AutoTokenizer
import seaborn as sns
from datetime import datetime
import pandas as pd
import numpy as np
import os
import torch
import outlines
from outlines import models
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KernelDensity
from transformers import BitsAndBytesConfig

# Only fall back to an empty value when the variable is not already set, so a
# token supplied by the surrounding environment is not overwritten.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", '')

In [2]:
# Download OpenML dataset 41187 as a DataFrame and build a datetime column
# from its year/month/day columns (added to the fetched frame in place).
co2 = fetch_openml(data_id=41187, as_frame=True, parser='auto')
co2.frame["date"] = pd.to_datetime(co2.frame[["year", "month", "day"]])

# Chronologically ordered "co2" readings indexed by date, squeezed to a Series.
co2_data = (
    co2.frame
    .sort_values(by="date")
    [["date", "co2"]]
    .set_index("date")
    .squeeze()
)

# Chronological 70/30 split: the earliest 70% of the points form the training set.
split_at = int(0.7 * len(co2_data))
train = co2_data[:split_at]
test = co2_data[split_at:]
print(train.shape,test.shape,co2_data.shape)

(1557,) (668,) (2225,)


In [4]:
def _identity(x):
    """Return the argument unchanged; the default (no-op) scaling function."""
    return x


@dataclass
class Scaler:
    """Pair of callables that map values into a scaled space and back.

    Both fields default to the identity, so a bare ``Scaler()`` leaves values
    untouched in either direction.
    """

    # Forward mapping: original values -> scaled values.
    transform: callable = _identity
    # Reverse mapping: scaled values -> original values.
    inv_transform: callable = _identity

In [5]:
# Notebook-wide cache keyed by model name; each value is a [model, tokenizer]
# pair stored by LLMTime.zero_shot so repeated samples reuse the loaded weights.
# Re-running this cell empties the cache.
model_cache = dict()

In [9]:
class LLMTime:
    """Few-shot time-series forecasting by prompting a causal language model.

    The training series is rescaled, serialised as comma-separated integers
    (scaled value * 1000, rounded) and cut into fixed-width character windows
    that serve as in-context examples. The model then continues the most
    recent window, constrained to digits and commas, until at least as many
    characters as the serialised test series have been produced.

    Note: ``temperature``, ``do_sample``, ``repetition_penalty``,
    ``all_in_one`` and ``pretrain`` are stored and listed in the plot's
    parameter box, but nothing in this class passes them to the generator.
    """

    # Pattern every generated chunk has to follow.
    # NOTE(review): the origin of the length 954 is not evident from this file;
    # generation is also capped by max_tokens in zero_shot -- confirm intent.
    GENERATION_REGEX = r"[0-9,]{954}"

    def __init__(self, train, test, alpha=0.95, beta=0.3, batch_length=400, basic=False, temperature=0.5, do_sample=True, model_name="mistralai/Mistral-7B-v0.1", repetition_penalty=1.0, all_in_one=False, pretrain=False, load_in_4bit=True, num_samples=10):
        """Store the configuration and serialise ``train``/``test`` right away.

        Args:
            train: training series (a ``pd.Series``, an array-like, or a list
                of them; only the first list element is serialised).
            test: held-out continuation of ``train``, same container type.
            alpha: quantile used by the scaler.
            beta: margin below the minimum used by the non-basic scaler.
            batch_length: width, in characters, of one prompt window; the
                answer window is ``batch_length // 5`` characters/tokens.
            basic: select the quantile-only scaler instead of the shifted one.
            temperature, do_sample, repetition_penalty, all_in_one, pretrain:
                stored and reported only (see class docstring).
            model_name: Hugging Face model identifier.
            load_in_4bit: forwarded to ``BitsAndBytesConfig``.
            num_samples: number of independent forecasts drawn by ``run``.
        """
        self.model_name = model_name
        self.train = train
        self.test = test
        self.alpha = alpha
        self.beta = beta
        self.batch_length = batch_length
        self.basic = basic
        self.temperature = temperature
        self.do_sample = do_sample
        self.repetition_penalty = repetition_penalty
        self.scalers = None
        self.input_str = None
        self.test_str = None
        self.tokenizer = None
        self.model = None
        self.good_tokens = None
        self.bad_tokens = None
        self.output = None
        self.transformed_output_arr = None
        self.all_in_one = all_in_one
        self.pretrain = pretrain
        self.load_in_4bit = load_in_4bit
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_samples = num_samples
        self.preprocess_data()

    @staticmethod
    def get_scaler(history, alpha=0.95, beta=0.3, basic=False):
        """Build a ``Scaler`` fitted on the non-NaN values of ``history``.

        With ``basic=True`` values are divided by the ``alpha`` quantile of
        ``|history|`` (floored at 0.01). Otherwise values are first shifted by
        ``min - beta * (max - min)`` and then divided by the ``alpha`` quantile
        of the shifted history (1 is used when that quantile is 0).

        Returns:
            Scaler whose ``inv_transform`` undoes ``transform``.
        """
        # Accept any array-like; boolean-mask indexing below needs an ndarray.
        history = np.asarray(history)
        history = history[~np.isnan(history)]
        if basic:
            q = np.maximum(np.quantile(np.abs(history), alpha), .01)

            def transform(x):
                return x / q

            def inv_transform(x):
                return x * q
        else:
            lowest = np.min(history)
            min_ = lowest - beta * (np.max(history) - lowest)
            q = np.quantile(history - min_, alpha)
            if q == 0:
                # Constant history: avoid dividing by zero.
                q = 1

            def transform(x):
                return (x - min_) / q

            def inv_transform(x):
                return x * q + min_
        return Scaler(transform=transform, inv_transform=inv_transform)

    def convert_array_to_string(self, arr):
        """Serialise values as comma-separated integers of ``round(value * 1000)``."""
        return ",".join(str(round(val * 1000)) for val in arr)

    def preprocess_data(self):
        """Scale and serialise the train and test series.

        Sets ``self.scalers``, ``self.input_str`` and ``self.test_str``. One
        scaler is fitted per training series, but only the first series is
        serialised.

        Returns:
            Tuple ``(input_str, test_str, scalers)``.
        """
        train, test = self.train, self.test
        if isinstance(train, list):
            # Work on shallow copies so the caller's lists are not modified
            # when elements are replaced by Series below.
            train, test = list(train), list(test)
        else:
            train, test = [train], [test]

        for i in range(len(train)):
            if not isinstance(train[i], pd.Series):
                train[i] = pd.Series(train[i], index=pd.RangeIndex(len(train[i])))
                # The test index continues where the train index stops.
                test[i] = pd.Series(test[i], index=pd.RangeIndex(len(train[i]), len(test[i]) + len(train[i])))

        self.scalers = [
            self.get_scaler(series.values, alpha=self.alpha, beta=self.beta, basic=self.basic)
            for series in train
        ]
        # Kept as plain lists: series of different lengths cannot be stacked
        # into one rectangular array.
        scaled_train = [scaler.transform(series.values) for series, scaler in zip(train, self.scalers)]
        scaled_test = [scaler.transform(series.values) for series, scaler in zip(test, self.scalers)]

        self.input_str = self.convert_array_to_string(scaled_train[0])
        self.test_str = self.convert_array_to_string(scaled_test[0])
        return self.input_str, self.test_str, self.scalers

    def _load_model(self):
        """Take the model/tokenizer pair from ``model_cache``, loading it on first use."""
        cached = model_cache.get(self.model_name)
        if cached is not None:
            self.model, self.tokenizer = cached[0], cached[1]
            return
        quantization_config = BitsAndBytesConfig(load_in_4bit=self.load_in_4bit)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map='auto', quantization_config=quantization_config)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left")
        model_cache[self.model_name] = [self.model, self.tokenizer]

    def zero_shot(self):
        """Generate one forecast string by repeatedly continuing the series.

        Despite the name, the prompt contains in-context examples: windows of
        ``batch_length`` characters of the training string, each paired with
        the ``batch_length // 5`` characters that follow it. Windows are cut
        by character position, so they may start or end inside a number.

        Returns:
            The concatenated generated text, at least ``len(self.test_str)``
            characters long unless the generator stops producing text.
        """
        self._load_model()
        input_str = self.input_str
        batch_length = self.batch_length
        answer_length = batch_length // 5

        examples = [
            (input_str[i - batch_length:i], input_str[i:i + answer_length])
            for i in range(batch_length, len(input_str) - answer_length, batch_length)
        ]
        print(f"{len(examples)} in-context examples")

        # The docstring below is the prompt template rendered by outlines.
        @outlines.prompt
        def labelling(to_label, examples):
            """
            {% for example in examples %}
            {{ example[0] }} -> {{ example[1] }}
            {% endfor %}
            {{ to_label }} ->
            """

        model = models.Transformers(self.model, self.tokenizer)
        generator = outlines.generate.regex(
            model,
            self.GENERATION_REGEX,
        )

        target_length = len(self.test_str)
        out = ""
        k = 0
        while len(out) < target_length:
            print(f"batch {k}: {len(out)}/{target_length} characters generated")
            k += 1
            prompt = labelling(input_str[-batch_length:], examples)
            answer = generator(prompt, max_tokens=answer_length)
            if not answer:
                # Without this guard an empty answer would loop forever.
                print("Generator returned no text; stopping early.")
                break
            # The next prompt window slides over what was just generated.
            input_str += answer
            out += answer
        return out

    @staticmethod
    def invert_string_to_array(string_values):
        """Parse a comma-separated integer string back into scaled floats.

        Spaces are removed, one trailing comma is ignored, blank fields become
        0.0 and every number is divided by 1000.

        Returns:
            List of floats; an empty list for an empty (or all-space) string.
        """
        string_values = string_values.replace(" ", "")
        if not string_values:
            return []
        if string_values.endswith(','):
            string_values = string_values[:-1]
        return [
            float(token) / 1000 if token.strip() != '' else 0.0
            for token in string_values.split(',')
        ]

    def get_original_array(self, output_str):
        """Parse ``output_str`` and map it back to original units.

        Uses the first scaler's ``inv_transform`` and also stores the result
        in ``self.transformed_output_arr``.
        """
        scaled_values = np.array(self.invert_string_to_array(string_values=output_str))
        transformed_output_arr = self.scalers[0].inv_transform(scaled_values)
        self.transformed_output_arr = transformed_output_arr
        return transformed_output_arr

    def plot(self, input, outputs, m):
        """Plot the sampled forecasts against the actual series and save a PNG.

        Args:
            input: actual values (train followed by test) in original units.
            outputs: 2-D array-like, one row per sampled trajectory, each
                covering the same positions as ``input``.
            m: position of the train/test boundary; error metrics are computed
                on positions ``m`` onwards using the per-position median.
        """
        model_parameters = {
            "alpha": self.alpha,
            "beta": self.beta,
            "batch_length": self.batch_length,
            "basic": self.basic,
            "temperature": self.temperature,
            "do_sample": self.do_sample,
            "repetition_penalty": self.repetition_penalty,
            "all_in_one": self.all_in_one,
            "pretrain": self.pretrain,
            "load_in_4bit": self.load_in_4bit,
            "num_samples": self.num_samples
        }
        k = min(len(input), len(outputs[0]))
        actual = input[:k]

        sns.set(style="whitegrid")
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.set_title(f"Few Shot Outlines {self.model_name}")
        ax.axvline(x=m, color='k', linestyle='--', label='Train-Test Split')

        min_output = np.min(outputs, axis=0)[:k]
        max_output = np.max(outputs, axis=0)[:k]
        median_output = np.median(outputs, axis=0)[:k]
        mean_output = np.mean(outputs, axis=0)[:k]

        # Individual samples in the background, summary curves on top.
        for sample in outputs:
            ax.plot(sample[:k], color='darkgray', alpha=0.5, label='_nolegend_')

        ax.plot(min_output, color='green', linestyle='--', label='Min Predicted')
        ax.plot(max_output, color='orange', linestyle='--', label='Max Predicted')
        ax.plot(median_output, color='red', label='Median Predicted')
        ax.plot(mean_output, color='violet', label='Mean Predicted')

        ax.plot(actual, color='blue', label='Actual')
        ax.legend()

        ax.grid(True)
        ax.set_xlabel('Time')
        sns.set_palette("husl")

        ax.set_ylim(int(0.99 * np.min(actual)), int(1.01 * np.max(actual)))

        if k > m:
            # Score only the forecast region (everything after the split).
            mae = mean_absolute_error(actual[m:], median_output[m:])
            rmse = np.sqrt(mean_squared_error(actual[m:], median_output[m:]))
        else:
            # No forecast points survived truncation; nothing to score.
            mae = rmse = float("nan")
        std_dev = np.std(outputs, axis=0)
        error_text = f"\n\nMedian MAE: {mae:.2f}\nMedian RMSE: {rmse:.2f}\nStandard Deviation: {np.mean(std_dev):.2f}"
        ax.text(1.05, 0.4, error_text, transform=ax.transAxes, fontsize=10, verticalalignment='center')
        textstr = '\n'.join([f"{key}: {value}" for key, value in model_parameters.items()])
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(1.05, 0.6, textstr, transform=ax.transAxes, fontsize=10, verticalalignment='center', bbox=props)

        # Last path component, so names without an organisation prefix work too.
        model_name = self.model_name.split('/')[-1]
        current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        random_number = np.random.randint(0, 1000)
        filename = f"{model_name}-few-shot-outline-{current_time}-{random_number}.png"

        fig.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"Plot saved as {filename}")

    def run(self):
        """Draw ``num_samples`` forecasts, plot them, and return the raw strings.

        Returns:
            List with the unprocessed generated string of every sample.
        """
        outputs = [self.zero_shot() for _ in range(self.num_samples)]
        self.plot_and_return(list(outputs))
        return outputs

    def plot_and_return(self, outputs):
        """Convert generated strings to original units, plot and return them.

        Each string is cut to the text between its first and last comma,
        because generation starts and stops at arbitrary character positions
        and the outermost numbers may be incomplete. The remainder is appended
        to the training string, parsed and unscaled; all trajectories and the
        actual series are truncated to the shortest trajectory.

        Args:
            outputs: sequence of generated strings (not modified).

        Returns:
            2-D array with one row per sample, in original units.

        Raises:
            ValueError: if ``outputs`` is empty.
        """
        trimmed = []
        for output in outputs:
            first_comma = output.find(',')
            last_comma = output.rfind(',')
            trimmed.append(output[first_comma + 1:last_comma])
        if not trimmed:
            raise ValueError("plot_and_return needs at least one generated output")

        actual = self.get_original_array(self.input_str + ',' + self.test_str)
        m = len(self.get_original_array(self.input_str))

        trajectories = [
            self.get_original_array(self.input_str + ',' + text) for text in trimmed
        ]
        min_len = min(len(trajectory) for trajectory in trajectories)
        forecasts = np.array([trajectory[:min_len] for trajectory in trajectories])

        self.plot(actual[:min_len], forecasts, m)
        return forecasts

In [None]:
# Settings for this experiment, gathered in one place and forwarded to the
# constructor unchanged. Building the object already scales and serialises
# the train and test series.
llm_settings = dict(
    alpha=1,
    beta=0.1,
    batch_length=1000,
    basic=False,
    temperature=0.7,
    do_sample=True,
    repetition_penalty=1,
    all_in_one=False,
    pretrain=False,
    load_in_4bit=True,
    num_samples=10,
)
llm = LLMTime(train, test, **llm_settings)

In [None]:
# Generation is expensive, so keep the raw generated strings (one per sample)
# under a name instead of only echoing them as cell output.
generated_strings = llm.run()