In [1]:
# Install required packages
!pip install torch transformers peft datasets sentence-transformers einops triton

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m00:01[0m:02[0m
[?25hCollecting peft
  Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 KB[0m [31m103.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 KB[0m [31m128.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 KB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)

In [3]:
!pip install tf-keras

Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting tensorflow<2.19,>=2.18
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tensorboard<2.19,>=2.18
  Downloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m185.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ml-dtypes<0.5.0,>=0.4.0
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.

In [5]:
# Required imports
import json
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments
from datetime import datetime

# Load processed JSON dataset
json_file = 'Processed_History.json'

with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert JSON to DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Title,URL,Timestamp,Domain
0,Sign in - Google Accounts,https://accounts.google.com/v3/signin/challeng...,2024-10-04 00:27:37,accounts.google.com
1,Sign in - Google Accounts,https://accounts.google.com/v3/signin/challeng...,2024-10-04 00:27:36,accounts.google.com
2,Home - Google Drive,https://drive.google.com/drive/u/0/home,2024-10-04 00:27:00,drive.google.com
3,Google Drive,https://drive.google.com/drive/u/0/,2024-10-04 00:26:59,drive.google.com
4,Google Takeout,https://takeout.google.com/,2024-10-04 00:26:28,takeout.google.com


In [6]:
# Prepare the data for fine-tuning
def preprocess_row(row):
    input_text = f"Title: {row['Title']} Domain: {row['Domain']} Timestamp: {row['Timestamp']}"
    output_text = f"URL: {row['URL']}"
    return {"input_text": input_text, "output_text": output_text}

formatted_data = df.apply(preprocess_row, axis=1).tolist()
formatted_data[:5]

[{'input_text': 'Title: Sign in - Google Accounts Domain: accounts.google.com Timestamp: 2024-10-04 00:27:37',
  'output_text': 'URL: https://accounts.google.com/v3/signin/challenge/pk?TL=APps6ealSSf4ma1-X-ntHMU9eaTPwQncF_HL0e4znIkCfXmRwXfJufYV72czbVkO&cid=1&continue=https%3A%2F%2Ftakeout.google.com%2Ftakeout%2FtoDrive%3FpreauthUserSessionId%3D0%26profile%3DChEKD2FkZGl0aW9uYWxfZGF0YQoICgZhbGVydHMKCQoHY2hlY2tpbgoSChBhcnRzX2FuZF9jdWx0dXJlCgwKCmNvdXJzZV9raXQKCQoHYmxvZ2dlcgoKCghjYWxlbmRhcgoeChxjaHJvbWVfb3NfZHJhd2luZ19hcHBfY2FudmFzCggKBmNocm9tZQoPCg1jbGFzc2ljX3NpdGVzCgoKCGNvbnRhY3RzCgwKCmNyaXNpc191Z2MKHgocY2hyb21lX29zX2hhbmR3cml0aW5nX2FwcF9hNAoaChhkYXRhX3NoYXJlZF9mb3JfcmVzZWFyY2gKCgoIZGlzY292ZXIKBwoFZHJpdmUKCAoGZW1iYXJrCgUKA2ZpdAoICgZmaXRiaXQKEAoOZ29vZ2xlX2FjY291bnQKDQoLbXlfYnVzaW5lc3MKDwoNaGFuZ291dHNfY2hhdAoVChNnb29nbGVfY2xvdWRfc2VhcmNoChQKEmRldmVsb3Blcl9wbGF0Zm9ybQoHCgVlYXJ0aAoKCghmZWVkYmFjawoQCg5nb29nbGVfZmluYW5jZQoRCg9zdXBwb3J0X2NvbnRlbnQKBgoEbWVldAoMCgpnb29nbGVfb25lCgwKCmdvb2dsZV9wYXkK

In [7]:
# Create a Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)

# Split the dataset into train, validation, and test sets
train_data = dataset.train_test_split(test_size=0.3, seed=42)
val_data, test_data = train_data['test'].train_test_split(test_size=0.5, seed=42).values()

train_dataset = train_data['train']
val_dataset = val_data
test_dataset = test_data

In [8]:
print(len(train_dataset))
print(len(test_dataset))
print(len(val_dataset))

114254
24484
24483


In [10]:
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the pretrained model and tokenizer
model_id = "OpenNLPLab/TransNormerLLM-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

A new version of the following files was downloaded from https://huggingface.co/OpenNLPLab/TransNormerLLM-1B:
- configuration_transnormer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenNLPLab/TransNormerLLM-1B:
- srmsnorm_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenNLPLab/TransNormerLLM-1B:
- norm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenNLPLab/TransNormerLLM-1B:
- utils.py
- norm.py
. Make sure to double-check

In [20]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    target_modules=[
        "token_mixer.qkvu_proj",  # Main attention mechanism
        "channel_mixer.l1",  # Channel transformations
        "channel_mixer.l2",
        "channel_mixer.l3"
    ],
    lora_dropout=0.1,
)

model = get_peft_model(model, lora_config)

2024-12-08 10:12:37 | INFO | peft.tuners.tuners_utils | Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


In [21]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt",
    pad_to_multiple_of=8  # Aligns tensors to multiples of 8 for better performance on H100
)

In [15]:
# Check if the tokenizer has a pad token, and set it if not
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the datasets
def tokenize_function(examples):
    inputs = tokenizer(examples["input_text"], truncation=True, max_length=512, padding="max_length")
    outputs = tokenizer(examples["output_text"], truncation=True, max_length=512, padding="max_length")
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 114254/114254 [00:47<00:00, 2424.59 examples/s]
Map: 100%|██████████| 24483/24483 [00:10<00:00, 2401.97 examples/s]


In [16]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))  # Should display "NVIDIA H100"

True
NVIDIA H100 80GB HBM3


In [17]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [23]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Match this with the save strategy
    save_strategy="steps",  # Match this with the eval strategy
    save_steps=500,         # Specify how frequently to save
    eval_steps=500,         # Specify how frequently to evaluate
    learning_rate=3e-5,
    fp16=True,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model_new")
tokenizer.save_pretrained("./fine_tuned_model_new")

Step,Training Loss,Validation Loss
500,3.3763,0.810791
1000,3.1147,0.750926
1500,2.9625,0.718027
2000,2.8292,0.697403
2500,2.7833,0.684619
3000,2.6379,0.674451
3500,2.5982,0.663509
4000,2.6367,0.655807
4500,2.5648,0.650469
5000,2.6409,0.646504


('./fine_tuned_model_new/tokenizer_config.json',
 './fine_tuned_model_new/special_tokens_map.json',
 './fine_tuned_model_new/tokenizer.model',
 './fine_tuned_model_new/added_tokens.json')

In [26]:
from transformers import AutoModelForCausalLM

base_model_path = "OpenNLPLab/TransNormerLLM-1B"  # Original base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True)

In [27]:
from peft import PeftModel

adapter_path = "./fine_tuned_model_new"
model_with_lora = PeftModel.from_pretrained(base_model, adapter_path)



In [28]:
model_with_lora = model_with_lora.merge_and_unload()

In [30]:
model_with_lora.save_pretrained("./merged_fine_tuned_model", safe_serialization=False)
tokenizer.save_pretrained("./merged_fine_tuned_model")

('./merged_fine_tuned_model/tokenizer_config.json',
 './merged_fine_tuned_model/special_tokens_map.json',
 './merged_fine_tuned_model/tokenizer.model',
 './merged_fine_tuned_model/added_tokens.json')

In [31]:
# # Load fine-tuned model for inference
# fine_tuned_model_path = "./fine_tuned_model_new"
# tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_path, trust_remote_code=True)

In [33]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model
model_path = "./merged_fine_tuned_model"
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [None]:
from transformers import pipeline

# Create a text-generation pipeline for inference
rag_pipeline = pipeline("text-generation", model=model_with_lora, tokenizer=tokenizer)

# Example query
input_text = "Title: Learn Python programming Domain: youtube.com Timestamp: 2024-12-06 10:00:00"
generated_text = rag_pipeline(input_text, max_length=50)
print(generated_text)

# Optional: Generate embeddings for similarity search
from sentence_transformers import SentenceTransformer

# Load a SentenceTransformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for the dataset
df["Embedding"] = df["Title"].apply(lambda x: embedding_model.encode(x).tolist())

# Save embeddings for future use
df.to_csv("chrome_history_with_embeddings.csv", index=False)

print("Fine-tuning and embedding generation completed.")

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=2048) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Title: Learn Python programming Domain: youtube.com Timestamp: 2024-12-06 10:00:00 UTC Timestamp: 2024-12-06 12:00:00 UTC Timestamp: 2025-12-06 10:00:00 UTC Timestamp: 2025-12-06 01:15:00 UTC Timestamp: 2025-12-06 09:00:00 UTC Timestamp: 2026-12-06 08:00:00 UTC Timestamp: 2025-12-07 19:00:00 UTC Timestamp: 2025-12-07 11:00:00 UTC Timestamp: 2026-12-07 11:00:00 UTCTimestamp: 2026-12-09 03:05:00 UTC Timestamp: 2027-12-02 16:00:00 UTC Timestamp: 2026-12-10 05:00:00 UTC Timestamp: 2027-12-11 18:00:00 UTC Timestamp: 2026-12-12 02:40:00 UTC Timestamp: 2026-12-20 02:47:00 UTC Timestamp: 2026-12-22 03:56:00 UTC Timestamp: 2026-12-28 02:03:00 UTC Timestamp: 2026-12-31 00:20:00 UTC Timestamp: 2026-12-10 08:30:00 UTC Timestamp: 2026-12-11 17:14:00 UTC Timestamp: 2026-12-15 18:38:00 UTC Timestamp:\n\nExplanation:\nThe range [1,10] represents all years from 1, to 10, inclusive. \n\nThe set of numbers mentioned in the title of the program is not in the interval mentioned in the 

Batches: 100%|██████████| 1/1 [00:00<00:00, 26.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 337.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 331.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 335.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 337.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 341.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 256.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 307.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 343.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 312.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 310.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 313.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 354.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 350.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 305.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 313.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 311.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 314.9