In [1]:
# Install runtime dependencies into the kernel's own environment (%pip, not !pip).
# The version specifier must be quoted: unquoted, the shell parses `>` as an
# output redirect, so the constraint is dropped and a file named "=0.39.0" is created.
%pip install "bitsandbytes>=0.39.0" -q
%pip install accelerate



In [1]:
from langchain import FewShotPromptTemplate, PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import Ollama
import matplotlib.pyplot as plt
from dataclasses import dataclass
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain import HuggingFaceHub
import seaborn as sns
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import re

In [2]:
import os

# Export the Hugging Face Hub API token through the process environment.
# NOTE(review): the value is blank in this copy of the notebook. Supply the real
# token from outside the notebook (shell environment, getpass, secrets manager)
# instead of typing it into this cell, so it is never committed.
# presumably read by langchain's HuggingFaceHub wrapper -- TODO confirm
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ''

In [3]:
# Log in with the Hugging Face CLI. The token argument is blank in this copy;
# NOTE(review): avoid pasting a real token here, since cell source and output are saved with the notebook.
!huggingface-cli login --token ''

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/shubhagarwal/.cache/huggingface/token
Login successful


In [4]:
@dataclass
class Scaler:
    """Pair of callables mapping values to a scaled space and back.

    Both fields default to the identity function, so a bare ``Scaler()``
    leaves values unchanged in either direction.
    """
    # Forward mapping (original values -> scaled values); identity by default.
    transform: callable = lambda x: x
    # Inverse mapping (scaled values -> original values); identity by default.
    inv_transform: callable = lambda x: x

In [63]:
class LLMTime:
    """Few-shot time-series forecaster that prompts an LLM with digit strings.

    The training series is rescaled, rendered as a comma-separated string of
    integers, and cut into ``input -> output`` examples for a LangChain
    few-shot prompt. ``run`` draws ``num_samples`` forecasts and plots them
    against the held-out test series.

    When ``train``/``test`` are lists of series, scalers are fitted for every
    series but only the first series is encoded, forecast and plotted.
    """

    def __init__(self, train, test, alpha=0.95, beta=0.3, batch_length=400, basic=False, temperature=0.5, do_sample=True, model_name="mistralai/Mistral-7B-v0.1", repetition_penalty=1.0, ollama = False, num_samples=10):
        """Store the configuration and pre-compute the encoded train/test strings.

        Args:
            train: Training series (``pd.Series``, array-like, or a list of them).
            test: Held-out series with the same structure as ``train``.
            alpha: Quantile used by ``get_scaler`` to choose the scale factor.
            beta: Fraction of the value range subtracted from the minimum when
                ``basic`` is False.
            batch_length: Number of *characters* of the encoded string used per
                prompt input; one fifth of it is used per example output and
                as the generation length.
            basic: If True, scale by a quantile of absolute values only.
            temperature: Sampling temperature passed to the Hub model.
            do_sample: Sampling flag passed to the Hub model.
            model_name: Hugging Face repo id, or Ollama model name.
            repetition_penalty: Repetition penalty passed to the Hub model.
            ollama: If True, use the ``Ollama`` backend instead of ``HuggingFaceHub``.
            num_samples: Number of forecasts drawn by ``run``.
        """
        self.ollama = ollama
        self.model_name = model_name
        self.train = train
        self.test = test
        self.alpha = alpha
        self.beta = beta
        self.batch_length = batch_length
        self.basic = basic
        self.temperature = temperature
        self.do_sample = do_sample
        self.repetition_penalty = repetition_penalty
        self.scalers = None
        self.input_str = None
        self.test_str = None
        self.tokenizer = None
        self.model = None
        self.good_tokens = None
        self.bad_tokens = None
        self.output = None
        self.transformed_output_arr = None
        self.num_samples = num_samples
        self.preprocess_data()

    @staticmethod
    def get_scaler(history, alpha=0.95, beta=0.3, basic=False):
        """Fit a ``Scaler`` on ``history`` (NaNs are ignored).

        Args:
            history: 1-D numeric array of past values.
            alpha: Quantile used as the scale factor.
            beta: Offset factor; the shift is ``min - beta * (max - min)``.
            basic: If True, only divide by the ``alpha``-quantile of ``|history|``
                (floored at 0.01); otherwise shift by the offset and divide by
                the ``alpha``-quantile of the shifted values.

        Returns:
            A ``Scaler`` whose ``transform``/``inv_transform`` are inverses of each other.
        """
        history = history[~np.isnan(history)]
        if basic:
            q = np.maximum(np.quantile(np.abs(history), alpha),.01)
            def transform(x):
                return x / q
            def inv_transform(x):
                return x * q
        else:
            min_ = np.min(history) - beta*(np.max(history)-np.min(history))
            q = np.quantile(history-min_, alpha)
            # A constant series gives q == 0; fall back to 1 to avoid dividing by zero.
            if q == 0:
                q = 1
            def transform(x):
                return (x - min_) / q
            def inv_transform(x):
                return x * q + min_
        return Scaler(transform=transform, inv_transform=inv_transform)

    def convert_array_to_string(self, arr):
        """Encode values as comma-separated integers (each value times 1000, rounded)."""
        return ",".join(str(round(val * 1000)) for val in arr)

    def preprocess_data(self):
        """Fit the scalers and build the encoded train/test strings.

        Sets ``self.scalers``, ``self.input_str`` and ``self.test_str``.

        Returns:
            Tuple ``(input_str, test_str, scalers)``.
        """
        # Work on copies so the caller's lists are never modified in place.
        if isinstance(self.train, list):
            train = list(self.train)
            test = list(self.test)
        else:
            train = [self.train]
            test = [self.test]

        for i in range(len(train)):
            if not isinstance(train[i], pd.Series):
                train[i] = pd.Series(train[i], index=pd.RangeIndex(len(train[i])))
            if not isinstance(test[i], pd.Series):
                # Continue the positional index where the training series ends.
                start = len(train[i])
                test[i] = pd.Series(test[i], index=pd.RangeIndex(start, start + len(test[i])))

        self.scalers = [
            self.get_scaler(series.values, alpha=self.alpha, beta=self.beta, basic=self.basic)
            for series in train
        ]
        # Keep plain lists: series may differ in length, and wrapping ragged
        # arrays in np.array raises on recent NumPy versions.
        transformed_train = [scaler.transform(series.values) for series, scaler in zip(train, self.scalers)]
        transformed_test = [scaler.transform(series.values) for series, scaler in zip(test, self.scalers)]

        # Only the first series is encoded.
        self.input_str = self.convert_array_to_string(transformed_train[0])
        self.test_str = self.convert_array_to_string(transformed_test[0])
        return self.input_str, self.test_str, self.scalers

    def _build_prompt(self):
        """Build the few-shot prompt from character windows of the encoded training string."""
        input_str = self.input_str
        batch_length = self.batch_length
        # Each example pairs a batch_length-character window with the
        # batch_length // 5 characters that follow it.
        examples = [
            {
                "input": input_str[i-batch_length:i],
                "output": input_str[i:i+batch_length//5]
            }
            for i in range(batch_length, len(input_str) - batch_length, batch_length)
        ]
        # Written as an explicit literal so the template's whitespace does not
        # depend on how this method happens to be indented.
        example_template = "\n        {{input}} -> {{output}}\n        "
        prompt = PromptTemplate.from_template(example_template, template_format='jinja2')
        return FewShotPromptTemplate(
            examples=examples,
            example_prompt=prompt,
            prefix="Predict the next token value, outputting only digits or commas, use the examples given as reference: ",
            suffix="{input} ->",
            input_variables=["input"],
            example_separator="",
        )

    def _build_llm(self):
        """Create the LLM backend and store it on ``self.model``.

        For the Hub backend this also sets ``self.tokenizer``, ``self.good_tokens``
        (ids of the digit/comma tokens) and ``self.bad_tokens`` (every other id).
        """
        if self.ollama:
            self.model = Ollama(model=self.model_name)
            return
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left")
        allowed_ids = {self.tokenizer.convert_tokens_to_ids(token) for token in "0123456789,"}
        vocab_size = len(self.tokenizer)
        self.bad_tokens = [i for i in range(vocab_size) if i not in allowed_ids]
        self.good_tokens = [i for i in range(vocab_size) if i in allowed_ids]
        self.model = HuggingFaceHub(repo_id=self.model_name, model_kwargs={
            "temperature": self.temperature,
            "do_sample": self.do_sample,
            "repetition_penalty": self.repetition_penalty,
            "max_new_tokens": self.batch_length//5,
            "min_new_tokens": self.batch_length//5,
            "force_words_ids": [[t] for t in self.good_tokens],
            "bad_words_ids": [[t] for t in self.bad_tokens],
            "num_beams": 2,
            "top_k": 20
        })

    def few_shot(self):
        """Generate one forecast string by repeatedly querying the LLM.

        The backend is built on the first call and reused afterwards; set
        ``self.model = None`` to force a rebuild after changing the configuration.

        Generation stops once the accumulated output is at least as long as
        ``self.test_str``, after 1000 rounds, or as soon as a reply is not made
        purely of digits and commas or contains fewer than two commas.

        Returns:
            The accumulated output string (each accepted chunk is prefixed with
            a comma); also stored on ``self.output``.
        """
        few_shot_prompt = self._build_prompt()
        if self.model is None:
            self._build_llm()
        chain = LLMChain(llm=self.model, prompt=few_shot_prompt)

        digits_and_commas = re.compile(r'^[0-9,]+$')
        max_rounds = 1000
        output = ""
        inp = self.input_str
        i = 0
        while len(output) < len(self.test_str) and i < max_rounds:
            print(len(output), len(self.test_str))
            i+=1
            out = chain.predict(input=inp[-self.batch_length:])
            # Keep only the text after the last '>' (the prompt ends with "->").
            sub = out.split('>')[-1]
            sub = sub.strip()
            sub = sub.replace('\n', '').replace('\r', '')
            print(f'Output {i} : {sub}')
            if not digits_and_commas.match(sub):
                break
            firstComma = sub.find(',')
            lastComma = sub.rfind(',')
            if firstComma == -1 or lastComma == -1 or firstComma == lastComma:
                break
            # Drop the pieces before the first and after the last comma.
            sub = sub[firstComma+1:lastComma]
            if len(sub) == 0:
                break
            output = output + ',' + sub
            # Replace the last value of the running context with the new chunk.
            inp = inp[:inp.rfind(',')]
            inp = inp + ',' + sub
            print(output)
            print(len(output), len(self.test_str))
        self.output = output
        return output

    @staticmethod
    def invert_string_to_array(string_values):
        """Decode a comma-separated integer string back to floats (value / 1000).

        Spaces are ignored, one trailing comma is dropped, and empty fields
        decode to 0.0. An empty (or all-space) string yields an empty list.
        """
        string_values = string_values.replace(" ", "")
        if not string_values:
            return []
        if string_values.endswith(','):
            string_values = string_values[:-1]
        return [
            (float(val) if val.strip() != '' else 0.0) / 1000
            for val in string_values.split(',')
        ]

    def get_original_array(self, output_str):
        """Decode ``output_str`` and map it back to original units with the first scaler.

        The result is also stored on ``self.transformed_output_arr``.
        """
        output_arr = np.array(self.invert_string_to_array(string_values=output_str))
        transformed_output_arr = self.scalers[0].inv_transform(output_arr)
        self.transformed_output_arr = transformed_output_arr
        return transformed_output_arr

    def plot(self, input, outputs, m):
        """Plot the forecasts against the actual series and save the figure.

        Args:
            input: Actual values (train followed by test), in original units.
            outputs: 2-D array, one row per sampled forecast (same layout as ``input``).
            m: Index of the train/test split; metrics are computed on ``[m:]``.

        The figure is written to ``plts/<model>-few-shot-<timestamp>-<rand>.png``.
        """
        import os  # local import so this method does not depend on another cell having run

        model_parameters = {
            "alpha": self.alpha,
            "beta": self.beta,
            "batch_length": self.batch_length,
            "basic": self.basic,
            "temperature": self.temperature,
            "do_sample": self.do_sample,
            "repetition_penalty": self.repetition_penalty,
            "num_samples": self.num_samples
        }

        sns.set(style="whitegrid")
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.set_title(f"Few Shot {self.model_name}")
        ax.axvline(x=m, color='k', linestyle='--', label='Train-Test Split')

        # Truncate everything to the common length before comparing.
        k = min(len(input), np.shape(outputs)[1])
        input = input[:k]
        min_output = np.min(outputs, axis=0)[:k]
        max_output = np.max(outputs, axis=0)[:k]
        median_output = np.median(outputs, axis=0)[:k]
        mean_output = np.mean(outputs, axis=0)[:k]

        for sample in outputs:
            ax.plot(sample[:k], color='darkgray', alpha=0.5, label='_nolegend_')

        ax.plot(min_output, color='green', linestyle='--', label='Min Predicted')
        ax.plot(max_output, color='orange', linestyle='--', label='Max Predicted')
        ax.plot(median_output, color='red', label='Median Predicted')
        # Distinct colour/style so the mean can be told apart from the median.
        ax.plot(mean_output, color='purple', linestyle=':', label='Mean Predicted')

        min_input = np.min(input)
        max_input = np.max(input)
        ax.set_ylim(int(0.99*min_input), int(1.01*max_input))

        ax.plot(input, color='blue', label='Actual')
        ax.legend(loc='upper left')

        ax.grid(True)
        ax.set_xlabel('Time')
        sns.set_palette("husl")

        # Score only the forecast horizon. Slicing with [m:] (rather than a
        # negative offset) stays empty when no forecast points are available,
        # instead of silently selecting the whole series.
        actual_tail = input[m:]
        median_tail = median_output[m:]
        print(len(actual_tail), len(median_tail))
        std_dev = np.std(outputs, axis=0)
        if len(actual_tail) > 0:
            mae = mean_absolute_error(actual_tail, median_tail)
            rmse = np.sqrt(mean_squared_error(actual_tail, median_tail))
            error_text = f"\n\nMedian MAE: {mae:.2f}\nMedian RMSE: {rmse:.2f}\nStandard Deviation: {np.mean(std_dev):.2f}"
        else:
            error_text = f"\n\nMedian MAE: n/a\nMedian RMSE: n/a\nStandard Deviation: {np.mean(std_dev):.2f}"

        ax.text(1.05, 0.4, error_text, transform=ax.transAxes, fontsize=10, verticalalignment='center')
        textstr = '\n'.join([f"{key}: {value}" for key, value in model_parameters.items()])
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(1.05, 0.6, textstr, transform=ax.transAxes, fontsize=10, verticalalignment='center', bbox=props)

        # [-1] also works for names without an organisation prefix (e.g. Ollama models).
        model_name = self.model_name.split('/')[-1]
        current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        random_number = np.random.randint(0, 1000)
        filename = f"plts/{model_name}-few-shot-{current_time}-{random_number}.png"

        # savefig does not create missing directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"Plot saved as {filename}")

    def run(self):
        """Draw ``num_samples`` forecasts, plot them, and return them.

        Returns:
            NumPy array of the raw output strings, one per sample.
        """
        outputs = [self.few_shot() for _ in range(self.num_samples)]
        r = np.array(outputs)
        self.plot_and_return(r)
        return r

    def plot_and_return(self, outputs):
        """Decode forecast strings, align them with the actual series, and plot.

        Args:
            outputs: Iterable of forecast strings as produced by ``few_shot``.

        Raises:
            ValueError: If ``outputs`` is empty.
        """
        input_arr = self.get_original_array(self.input_str + ',' + self.test_str)
        m = len(self.get_original_array(self.input_str))

        forecasts = []
        for output in outputs:
            output = str(output)
            # few_shot prefixes every chunk with a comma; an empty sample
            # (generation aborted on the first round) has nothing to strip.
            if output.startswith(','):
                output = output[1:]
            forecasts.append(self.get_original_array(self.input_str + ',' + output))
        if not forecasts:
            raise ValueError("no forecasts to plot")

        # Samples can differ in length; cut all of them (and the actuals) to the shortest.
        min_len = min(len(forecast) for forecast in forecasts)
        input_arr = input_arr[:min_len]
        forecasts = [forecast[:min_len] for forecast in forecasts]
        print(forecasts)
        self.plot(input_arr, np.array(forecasts), m)

In [56]:
from sklearn.datasets import fetch_openml

# Fetch OpenML dataset 41187 as a DataFrame and derive a date column
# from its year/month/day columns.
co2 = fetch_openml(data_id=41187, as_frame=True, parser='auto')
co2_data = co2.frame
co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])

# Order chronologically, keep only the co2 readings indexed by date,
# and collapse the single-column frame.
co2_data = (
    co2_data
    .sort_values(by="date")[["date", "co2"]]
    .set_index("date")
    .squeeze()
)

# 70/30 chronological split.
co2_split = int(0.7*len(co2_data))
train_2 = co2_data[:co2_split]
test_2 = co2_data[co2_split:]
print(train_2.shape,test_2.shape,co2_data.shape)

(1557,) (668,) (2225,)


In [64]:
# Load AirPassengers.csv (path is relative to the notebook's working directory)
# and preview the first rows.
data = pd.read_csv('AirPassengers.csv')
data.head()

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121


In [65]:
# Build a Month-indexed passenger series. `set_index` returns a new frame,
# so its result must be used; the bare call had no effect. `data` itself is
# left untouched so this cell can be re-run safely.
passengers = data.set_index('Month')['#Passengers']

# 70/30 chronological split; .iloc makes the positional slicing explicit.
passengers_split = int(0.7*len(passengers))
train, test = passengers.iloc[:passengers_split], passengers.iloc[passengers_split:]

In [66]:
# Configure the forecaster for the AirPassengers split: a single sample
# from the Mistral instruct model through the Hugging Face Hub backend.
llm_time = LLMTime(
    train,
    test,
    alpha=0.95,
    beta=0.3,
    batch_length=50,
    basic=False,
    temperature=0.7,
    do_sample=True,
    ollama=False,
    repetition_penalty=1,
    num_samples=1,
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
)

In [None]:
# Generate the forecast sample(s) and plot them; `out` holds the value returned by run().
out = llm_time.run()