In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass

In [None]:
from sklearn.datasets import fetch_openml

# Fetch OpenML dataset 41187 as a DataFrame.
co2 = fetch_openml(data_id=41187, as_frame=True, parser='auto')
co2_data = co2.frame
# Assemble a datetime column from the year/month/day parts.
co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])
# Order chronologically, keep only the co2 readings indexed by date, and
# collapse the single-column frame into a Series.
co2_data = (
    co2_data.sort_values(by="date")[["date", "co2"]]
    .set_index("date")
    .squeeze()
)

# Chronological 70/30 split: first 70% of observations train, the rest test.
split_idx = int(0.7 * len(co2_data))
train, test = co2_data.iloc[:split_idx], co2_data.iloc[split_idx:]
print(train.shape, test.shape, co2_data.shape)

(1557,) (668,) (2225,)


In [None]:
# Downstream cells iterate over lists of series, so wrap a lone
# train/test pair into one-element lists.
if not isinstance(train, list):
    train, test = [train], [test]
n_val = len(train)

In [None]:
# Coerce any non-Series entry to a pandas Series. The test series gets a
# RangeIndex that continues right after the end of its training series.
for i in range(len(train)):
    if isinstance(train[i], pd.Series):
        continue
    n_train, n_test = len(train[i]), len(test[i])
    train[i] = pd.Series(train[i], index=pd.RangeIndex(n_train))
    test[i] = pd.Series(test[i], index=pd.RangeIndex(n_train, n_train + n_test))

In [None]:
@dataclass
class Scaler:
    """Pair of mutually inverse callables used to rescale a series.

    Both fields default to the identity function.
    """
    # Maps raw values into the scaled space.
    transform: callable = lambda x: x
    # Maps scaled values back into the raw space.
    inv_transform: callable = lambda x: x

def get_scaler(history, alpha=0.95, beta=0.3, basic=False):
    """Build a Scaler fitted on the non-NaN values of ``history``.

    Args:
        history: Array-like of numeric observations (list, ndarray, ...).
            NaN entries are ignored when fitting.
        alpha: Quantile level used to pick the scale factor ``q``.
        beta: Fraction of the value range subtracted from the minimum to
            form the offset (only used when ``basic`` is False).
        basic: If True, scale by the ``alpha``-quantile of ``|history|``
            (floored at 0.01) with no offset. If False, shift by
            ``min - beta * (max - min)`` and scale by the ``alpha``-quantile
            of the shifted values (falling back to 1 when that is 0).

    Returns:
        A ``Scaler`` whose ``transform`` / ``inv_transform`` are inverses.

    Raises:
        ValueError: If ``history`` has no non-NaN values.
    """
    # Accept any array-like: boolean-mask indexing below needs an ndarray.
    history = np.asarray(history, dtype=float)
    history = history[~np.isnan(history)]
    if history.size == 0:
        raise ValueError("get_scaler requires at least one non-NaN value in `history`")
    if basic:
        # Floor the scale so an all-zero history cannot divide by zero.
        q = np.maximum(np.quantile(np.abs(history), alpha), .01)
        def transform(x):
            return x / q
        def inv_transform(x):
            return x * q
    else:
        lo = np.min(history)
        min_ = lo - beta * (np.max(history) - lo)
        q = np.quantile(history - min_, alpha)
        if q == 0:
            # Constant history with beta == 0: avoid division by zero.
            q = 1
        def transform(x):
            return (x - min_) / q
        def inv_transform(x):
            return x * q + min_
    return Scaler(transform=transform, inv_transform=inv_transform)

In [None]:
# Scaling configuration handed to get_scaler for every training series.
alpha, beta, basic = 0.95, 0.3, False
scalers = [
    get_scaler(series.values, alpha=alpha, beta=beta, basic=basic)
    for series in train
]

In [None]:
# Raw value arrays of each training series, then each rescaled by its own scaler.
input_arrs = [series.values for series in train]
transformed_input_arrs = np.array(
    [scaler.transform(values) for values, scaler in zip(input_arrs, scalers)]
)

In [None]:
# Display the scaled training array(s) as the cell output.
transformed_input_arrs

array([[0.31754135, 0.34204615, 0.34817235, ..., 1.07923218, 1.07310598,
        1.07923218]])

In [None]:
def convert_array_to_string(arr):
    """Serialise numeric values as a comma-separated string of integers.

    Each value is multiplied by 1000 and rounded to the nearest integer
    (built-in ``round``) before being joined with ``","``.
    """
    return ",".join(str(round(value * 1000)) for value in arr)

In [None]:
# Inspect the first scaled value of the first series.
transformed_input_arrs[0][0]

0.31754135184807064

In [None]:
# Serialise the first scaled training series into the comma-separated
# integer string that is later fed to the tokenizer.
input_str = convert_array_to_string(transformed_input_arrs[0])

In [None]:
# Display the full serialised training string as the cell output.
input_str

'318,342,348,346,324,334,346,354,311,311,303,305,307,297,295,277,264,254,258,264,275,285,283,289,299,299,305,307,311,303,334,328,328,332,330,330,350,338,348,362,360,371,356,365,367,358,352,350,332,332,324,318,307,293,295,277,283,273,264,264,254,256,262,262,277,283,291,299,297,295,307,311,309,309,324,330,326,328,328,334,344,336,334,350,356,350,369,383,377,377,391,395,393,397,397,385,397,385,377,358,369,365,354,342,328,330,297,289,289,285,279,260,266,260,273,279,279,285,297,297,303,311,315,320,324,328,330,336,336,336,336,350,354,356,356,367,375,371,385,387,375,391,389,409,405,409,403,397,397,389,385,381,377,365,338,354,354,342,330,299,299,293,285,295,307,293,303,303,309,311,309,320,326,326,336,334,338,344,354,358,356,350,356,362,375,383,381,385,395,391,401,401,418,411,403,414,411,418,420,416,414,407,399,401,399,395,377,371,369,340,344,328,309,313,297,303,309,309,313,324,334,336,338,342,348,358,367,371,375,373,371,375,383,383,391,393,403,401,403,416,438,436,436,442,442,444,438,434,414,405

In [None]:
# The version specifier must be quoted: unquoted, the shell treats `>` as an
# output redirect (writing to a file named "=0.39.0") and the constraint is lost.
# `%pip` installs into the environment of the running kernel.
%pip install -q transformers "bitsandbytes>=0.39.0"
%pip install accelerate

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint shared by the model and its tokenizer.
MODEL_ID = "mistralai/Mistral-7B-v0.1"

# Load the causal LM with 4-bit weights; `device_map="auto"` lets the
# library decide where to place the layers.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    load_in_4bit=True,
)
# Matching tokenizer, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

In [None]:
import torch
# Ask PyTorch's CUDA allocator to release cached memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# Candidate generation settings.
# NOTE(review): none of the `model.generate(...)` calls visible in this notebook
# pass this dict (they spell out their own keyword arguments), so these values
# currently have no effect — confirm whether it should be used or removed.
hyperparameters = {
    'num_beams': 5,                    # Number of beams for beam search
    'temperature': 0.8,                # Temperature for controlling randomness in sampling
    'top_k': 50,                       # Number of highest probability tokens to keep for top-k sampling
    'top_p': 0.9,                      # Probability threshold for nucleus sampling
    'no_repeat_ngram_size': 2,        # Avoid repeating n-grams of this size in the generated sequence
    'repetition_penalty': 1.4,         # Penalty for repeated tokens in the generated sequence
    'output_scores': False,            # Whether or not to return prediction scores
    'output_attentions': False,        # Whether or not to return attention tensors
    'output_hidden_states': False,     # Whether or not to return hidden states
    'return_dict_in_generate': False,  # Whether to return a ModelOutput instead of a plain tuple
    'use_cache': True,                 # Whether or not to use past key/values attentions for decoding
}

In [None]:
# Characters the model is allowed to emit: the ten digits and the comma separator.
good_tokens_str = list("0123456789,")
good_tokens = [tokenizer.convert_tokens_to_ids(token) for token in good_tokens_str]
# Every other id in the tokenizer is banned during generation.
_allowed_ids = set(good_tokens)
bad_tokens = [token_id for token_id in range(len(tokenizer)) if token_id not in _allowed_ids]

In [None]:
from tokenizers import Tokenizer
from tokenizers.tools import EncodingVisualizer

# Load the checkpoint's tokenizer through the `tokenizers` library and wrap it
# in an EncodingVisualizer.
tk = Tokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
visualizer = EncodingVisualizer(tk)

# Render how a short comma-separated number string is split into tokens.
text = "123, 124, 125"
visualizer(text)

In [None]:
from tqdm import tqdm

batch_size = 50
# Single-token ban list; it does not change between iterations, so build it once.
banned_word_ids = [[t] for t in bad_tokens]

# Slide over the training string: condition on the previous 4000 characters
# and compare the continuation with the next `batch_size` characters.
for i in tqdm(range(4000, len(input_str) - batch_size, batch_size)):
    batch_str = input_str[i-4000: i]
    model_inputs = tokenizer(batch_str, return_tensors='pt').to('cuda')
    generated_ids = model.generate(
        **model_inputs,
        do_sample=True,
        max_new_tokens=batch_size,
        temperature=0.8,
        repetition_penalty=2.0,
        bad_words_ids=banned_word_ids,
    )
    output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Batch: {batch_str}")
    print(f"Actual: {input_str[i: i+batch_size]}")
    print(f"Total: {input_str}")
    # NOTE(review): this slices the last `batch_size` *characters* although the
    # model was asked for `batch_size` new *tokens* — presumably one character
    # per token for digits/commas; confirm against the tokenizer.
    print(f"Pred: {output[-batch_size:]}")

In [None]:
# Continue the series from the last 4000 characters of the training string,
# forcing exactly 1000 new tokens restricted to digits and commas.
inp = input_str[-4000:]
model_inputs = tokenizer(inp, return_tensors='pt').to('cuda')
# do_sample=True is needed for `temperature` and `top_k` to take effect; without
# it generation is greedy and those arguments are ignored. This also matches
# the other generate calls in this notebook.
generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=1000, min_new_tokens=1000, temperature=0.9, bad_words_ids=[[t] for t in bad_tokens], top_k=5, repetition_penalty=2.0)
output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
# Fallback special tokens, used only when the tokenizer does not define them.
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

_fallback_tokens = {
    "eos_token": DEFAULT_EOS_TOKEN,
    "bos_token": DEFAULT_BOS_TOKEN,
    "unk_token": DEFAULT_UNK_TOKEN,
}
# Keep only the entries the tokenizer is actually missing.
special_tokens_dict = {
    attr: token
    for attr, token in _fallback_tokens.items()
    if getattr(tokenizer, attr) is None
}

tokenizer.add_special_tokens(special_tokens_dict)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Autoregressive roll-out: repeatedly generate `batch_size` new tokens from the
# last 4000 characters of the running string and append them.
inp = input_str[-4000:]
output_str = input_str
test_len = 2000
batch_size = 100
# Single-token ban list; loop-invariant, so build it once.
banned_word_ids = [[t] for t in bad_tokens]
for i in tqdm(range(test_len//batch_size)):
    model_inputs = tokenizer(inp, return_tensors='pt').to('cuda')
    generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=batch_size, min_new_tokens=batch_size, temperature=0.9, bad_words_ids=banned_word_ids, top_k=5)
    output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Take the tail by `batch_size` rather than a hard-coded 100 so the slice
    # stays in sync if the batch size is changed.
    # NOTE(review): this is a character count while generation is measured in
    # tokens — presumably one character per token for digits/commas; confirm.
    output_str += output[-batch_size:]
    inp = output_str[-4000:]

In [None]:
# NOTE(review): this overwrites `output_str` with `output`, the decoded text of
# the most recent generate call only. If the roll-out loop that accumulates
# into `output_str` was run just before, that accumulated string (training
# data plus all generated batches) is discarded here — confirm this is intended.
output_str = output

In [None]:
# Display the string that will be parsed back into numbers.
output_str

In [None]:
def invert_string_to_array(string_values):
    """Parse a comma-separated string of integers back into scaled floats.

    Spaces are removed, a single trailing comma (if any) is dropped, and each
    remaining field is converted to ``float`` and divided by 1000. Fields that
    are empty after stripping whitespace become ``0.0``.

    Args:
        string_values: Text such as ``"318,342,348"``.

    Returns:
        A list of floats; an empty list when the input contains no characters
        other than spaces.

    Raises:
        ValueError: If a non-empty field is not a valid number.
    """
    compact = string_values.replace(" ", "")
    # Guard the empty case explicitly instead of indexing [-1] on an empty string.
    if not compact:
        return []
    if compact.endswith(','):
        compact = compact[:-1]

    return [
        float(field) / 1000 if field.strip() != '' else 0.0
        for field in compact.split(',')
    ]

In [None]:
# Parse the comma-separated string back into a list of scaled float values.
output_arr = invert_string_to_array(output_str)

In [None]:
# Convert the parsed values to a NumPy array so the scaler's arithmetic applies element-wise.
output_arr = np.array(output_arr)

In [None]:
# Apply the first series' inverse transform to map the scaled values back to
# the original scale.
transformed_output_arr = scalers[0].inv_transform(output_arr)

In [None]:
# Display the values after the inverse transform.
transformed_output_arr

In [None]:
import matplotlib.pyplot as plt

# Plot the reconstructed series in the original scale.
plt.plot(transformed_output_arr)
# Mark where the training data ends; derive the position from the training
# series length instead of hard-coding it.
plt.axvline(x=len(train[0]), color='r', linestyle='--', label='train - test')
# Without a legend the `label` given to axvline is never rendered.
plt.legend()
plt.show()

In [None]:
# Zero out, in place, every value above 1000 using a boolean mask.
transformed_output_arr[transformed_output_arr > 1000] = 0