In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
!pip install -U accelerate
!pip install -U deepspeed  # for example
!pip install -U datasets
!pip install -U huggingface_hub
!pip install -U transformers

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
# Authenticate with the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably required for the gated Llama checkpoint and for the
# `push_to_hub` calls at the end of the notebook — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Dataset

In [None]:
import torch
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from itertools import combinations

import numpy as np
from datasets import load_dataset, concatenate_datasets
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist

# Markup and link matchers, compiled once at import time.
TAG_RE = re.compile(r"<[^>]+>")       # a "<", anything that is not ">", then ">"
URL_RE = re.compile(r"https?://\S+")  # http:// or https:// up to the next whitespace


def clean_text_url(text: str) -> str:
    """Strip HTML tags and bare URLs from `text` and tidy the whitespace.

    Tags are replaced first, so the visible text between an opening and a
    closing tag survives (``<a href="...">experience report</a>`` becomes
    ``experience report``). Any ``http(s)://`` URL still present afterwards is
    replaced as well. Every removed span becomes a single space, and runs of
    whitespace are finally collapsed and trimmed.

    Args:
        text: raw report text, possibly containing markup and links.

    Returns:
        The cleaned text.
    """
    stripped = text
    # Order matters: tags go first, then whatever raw URLs are left over.
    for matcher in (TAG_RE, URL_RE):
        stripped = matcher.sub(" ", stripped)

    return re.sub(r"\s+", " ", stripped).strip()



# Slang / alternative substance names that get scrubbed from the report text.
# - Entries are stripped and lower-cased when `slang_terms` is built, so the
#   leading spaces and the upper-case 'DMX' duplicate below make no difference.
# - The compiled pattern only matches whole tokens, but that still means the
#   very short entries ('l', 'x', 'g', 'k', '1') delete every standalone
#   occurrence of those characters, and everyday words such as 'coffee', 'tea',
#   'beer', 'ice', 'speed', 'unknown' or 'etc.' are removed as well.
# - NOTE(review): '4-ho-met; metocin; methylcybin' is one single entry and only
#   matches that exact string; it looks like three names that were meant to be
#   separate entries — confirm.
drug_slang = [
    'marijuana', 'hashish', 'hash', 'weed',
    'marijjuana', 'cannabis', 'benzo fury', 'l',
    'x', 'speed', 'pepper oil', 'cpp',
    'blow', 'foxy', 'symmetry',
    'nexus', 'tea', 'robo', ' tussin',
    'methylethyltryptamine', 'it-290', 'jwh-018',
    'coffee', 'mpa', 'ergine',
    'harmine', 'mxe',
    '4-ho-met; metocin; methylcybin', 'mdea',
    'elavil', 'bk-mdma', 'eve',
    'a2', 'dimitri', 'plant food', 'dr. bob', 'doctor bob',
    'mini thins', 'meth', 'acid',
    'etc.', ' wine', 'toad venom', ' methyl-j',
    'krokodil', ' 5-hydroxy-dmt', ' 3-cpp',
    'special k', 'ice',
    'nrg-1', ' gravel', 'whippits', 'g',
    'k', ' harmaline', 'bob', '4-ace',
    'quaaludes', ' opium', 'u4ea',
    'meopp', 'methcathinone', 'horse',
    'haoma', 'unknown', '4-b',
    'naptha', 'beer', 'bees',
    '2c-bromo-fly', 'flatliner', 'orexins',
    "meduna's mixture", 'bdo',
    'fatal meperedine-analog contaminant', 'piperazine', '4-ma',
    'paramethoxyamphetamine', 'eden', 'theobromine',
    'la-111', 'lysergamide', 'yaba',
    'ethyl cat', 'stp', '2c-c-nbome',
    'morphine', 'flakka', 'yage',
    'ecstasy', 'ludes', 'golden eagle',
    '4-mma', 'o-dms', 'liquor',
    'mephedrone', '1', 'phencyclidine',
    'crystal', 'pink adrenaline',
    '4-mec', 'green fairy', 'laa',
    'cp 47', 'paramethoxymethylamphetamine',
    '5-meo', 'alpha', 'mescaline-nbome',
    '25c-nbome', 'flephedrone',
    'bzp', 'codeine', 'foxy methoxy',
    '25i-nbome', '3c-bromo-dragonfly', 'mdai',
    'tfmpp', 'dmx', 'DMX', '2ct-7'
]

# Substance labels (already lower-cased) whose reports are kept; the dataset's
# `Substance` value is stripped and lower-cased before being looked up here.
# Every entry carries its own comma: Python concatenates adjacent string
# literals, so a missing comma silently fuses two labels into one that can
# never match a real `Substance` value.
# NOTE(review): "dmx" sits next to "dxm" and may be a transposition typo — it is
# kept because the source data might really contain that spelling; confirm.
valid_drugs = [
    "dmx", "mushrooms", "salvia", "lsd", "cannabis", "2c-t-7",
    "methamphetamine", "mdma", "ketamine", "caffeine", "mushrooms - p. cubensis",
    "mushrooms (c. cyanescens)", "mushrooms (p. subaeruginosa)",
    "salvia divinorum", "dxm", "dmt", "dpt",
    "salvia divinorum (various methods)", "cocaine",
    "ecstasy", "ghb", "nitrous oxide",
]

# Everything that is scrubbed from the text: the normalised slang entries
# followed by the accepted substance labels themselves.
slang_terms = [entry.strip().lower() for entry in drug_slang]
slang_terms += valid_drugs


def build_slang_pattern(terms):
    """Compile one case-insensitive regex matching any whole term in `terms`.

    Terms are stripped, lower-cased and de-duplicated; blank terms are ignored.
    A term only matches when it is not directly preceded or followed by a word
    character, so it is still found when glued to punctuation ('lsd/', 'dxm.')
    but not inside a longer word ('lsdx').

    Args:
        terms: iterable of strings to match.

    Returns:
        A compiled ``re.Pattern``. When no usable term is given, the pattern
        never matches anything.
    """
    unique_terms = {term.strip().lower() for term in terms}
    unique_terms.discard("")

    if not unique_terms:
        # An empty alternation would match the empty string between non-word
        # characters; an empty negative lookahead matches nothing at all.
        return re.compile(r"(?!)")

    # Longest first so that a multi-word term ('foxy methoxy') wins over its
    # prefix ('foxy'). Ties are broken alphabetically so the pattern text is
    # the same on every run instead of depending on set iteration order.
    ordered = sorted(unique_terms, key=lambda term: (-len(term), term))
    alternation = "|".join(re.escape(term) for term in ordered)
    return re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)",
                      flags=re.IGNORECASE)

# Compile **once** and re-use.
SLANG_PATTERN = build_slang_pattern(slang_terms)


def clean_text(text, pattern=SLANG_PATTERN):
    """Delete every match of `pattern` from `text` and normalise whitespace.

    Args:
        text: the string to scrub.
        pattern: compiled regex whose matches are removed. The default is bound
            when this function is defined, so re-assigning ``SLANG_PATTERN``
            afterwards does not change it — pass the new pattern explicitly.

    Returns:
        `text` without the matched tokens, with runs of whitespace collapsed to
        a single space and leading/trailing whitespace stripped.
    """
    without_slang = pattern.sub('', text)
    return re.sub(r'\s+', ' ', without_slang).strip()

# ------------------------------------------------------------------
# 1.  Ingest datasets
# ------------------------------------------------------------------
def train_test_dataset(min_words=100, test_size=0.2, seed=42):
    """Load, filter, scrub and split the trip-report corpora.

    Two Hub datasets are combined:

    * ``MottaCC/scrapped-psy-dataset`` — rows are kept when their stripped,
      lower-cased ``Substance`` is listed in ``valid_drugs``.
    * ``MottaCC/340-dmt`` — ``report_content`` is copied into ``text`` and
      every row is labelled with ``Substance = "dmt"``.

    In both, reports shorter than `min_words` whitespace-separated words are
    dropped (the count is taken before scrubbing). The merged set is scrubbed
    with ``clean_text`` followed by ``clean_text_url``, sorted by
    ``Substance`` and split.

    Args:
        min_words: minimum word count of the raw text for a report to be kept.
        test_size: forwarded to ``Dataset.train_test_split``.
        seed: forwarded to ``Dataset.train_test_split``.

    Returns:
        ``(train_ds, test_ds)``.
    """
    allowed_substances = set(valid_drugs)

    def long_enough(example):
        # Shared length filter for both sources.
        return len(example["text"].split()) >= min_words

    # ----- scraped reports ----------------------------------------
    cw_dataset = load_dataset("MottaCC/scrapped-psy-dataset")["train"]

    def cw_prompts(examples):
        # NOTE(review): rewrites `text` with its own values, i.e. an identity
        # pass; kept to leave the pipeline untouched — candidate for removal.
        return {"text": list(examples["text"])}

    cw_dataset = (
        cw_dataset
        .map(cw_prompts, batched=True)
        .filter(lambda ex: ex["Substance"].strip().lower() in allowed_substances)
        .filter(long_enough)
    )

    # ----- 340-dmt -------------------------------------------------
    dmt_dataset = load_dataset("MottaCC/340-dmt")["train"]

    def dmt_prompts(examples):
        reports = examples["report_content"]
        return {"text": list(reports), "Substance": ["dmt"] * len(reports)}

    dmt_dataset = dmt_dataset.map(dmt_prompts, batched=True).filter(long_enough)

    # ----- merge & clean ------------------------------------------
    merged = concatenate_datasets([cw_dataset, dmt_dataset])

    def scrub(example):
        # Slang/substance names are removed first, then tags and URLs.
        example["text"] = clean_text_url(clean_text(example["text"]))
        return example

    merged = merged.map(scrub).sort("Substance")

    split = merged.train_test_split(test_size=test_size, seed=seed)
    return split["train"], split["test"]


# Build both splits once; the cells below reuse `train_ds` and `test_ds`.
train_ds, test_ds = train_test_dataset()

# Show the schema and row count of each split, one per line.
print(train_ds, test_ds, sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


erowid_all.jsonl:   0%|          | 0.00/24.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5809 [00:00<?, ? examples/s]

Map:   0%|          | 0/5809 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5809 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2118 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/321 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/344k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/338 [00:00<?, ? examples/s]

Map:   0%|          | 0/338 [00:00<?, ? examples/s]

Filter:   0%|          | 0/338 [00:00<?, ? examples/s]

Map:   0%|          | 0/2384 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'Title', 'Substance', 'Doses', 'report_title', 'report_content'],
    num_rows: 1907
})
Dataset({
    features: ['text', 'Title', 'Substance', 'Doses', 'report_title', 'report_content'],
    num_rows: 477
})


In [None]:
# Spot-check a single training example after cleaning.
train_ds[3]

{'text': "Me and my friend mike are walking around this celtic festival bored and looking to get fucked up after having no luck finding someone tells us that this guy has some we never heard of it before but desided to get some. He said we could buy some if we could find a measuring cup because it was in water and the dose needed to be exact we got some spoons and took 10mg a little extra spilled in maybe a few mg it tasted wierd. Forty minutes later everything got all funny this dude was telling jokes and me and mike couldn't stop laughing this must be the drug taking effect. Within the next twenty minutes the buzz started really kicking in me and mike thought that we could push this pool table with our minds it seemed to sink into the floor.After about an hour of steady fairly mild buzz the visuals started patterns formed 'this is like chemical ' I thought but the buzz was still really mild and clear headed. Im now starting to trip everything is interesting and buetiful me and my fri

## Model


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Alternative base model, currently disabled:
# model_name = "google/gemma-3-1b-pt"
# tokenizer = AutoTokenizer.from_pretrained(model_name, attn_implementation='eager') # attn_implementation='eager' for gemma
# model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation='eager')

############ Llama ############

model_name = "meta-llama/Llama-3.2-1B"

# Tokenizer first – no extra kwargs needed here
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quick fix: let EOS double as PAD (the tokenizer's pad token is set to its EOS token).
tokenizer.pad_token = tokenizer.eos_token

# Load the causal-LM weights, requesting the eager attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",   # ok to pass only to the model
)
# Mirror the tokenizer's pad id on the model config too, since some HF helpers
# read it from `model.config` rather than from the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
def tokenize_fn(examples):
    """Tokenize a batch of examples, truncating each text to 512 tokens.

    Args:
        examples: a batch mapping with a "text" entry holding the raw strings.

    Returns:
        Whatever the global `tokenizer` returns for that batch.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)


# Tokenize the train split first, then the test split, in batched mode.
tokenized_dataset_train, tokenized_dataset_test = (
    split.map(tokenize_fn, batched=True) for split in (train_ds, test_ds)
)

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/477 [00:00<?, ? examples/s]

## Training

In [None]:
import transformers

# Record which transformers build is active: version first, then install path.
print(transformers.__version__, transformers.__file__, sep="\n")

4.52.4
/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run. Evaluation and checkpointing use
# the same 5-step interval, and the checkpoint with the lowest eval loss is
# reloaded once training ends.
training_args = TrainingArguments(
    output_dir="finetuned-llama-psy_v1",
    overwrite_output_dir=True,

    # ---- control total training length ----
    max_steps=100,                  # stop after 100 optimizer update steps

    # ---- logging & evaluation ----
    logging_steps=5,               # print training loss every 5 steps
    eval_strategy="steps",   # run eval during training
    eval_steps=5,                  #   …every 5 steps

    # ---- checkpointing ----
    save_strategy="steps",         # checkpoint by step (vs. by epoch)
    save_steps=5,                  #   …every 5 steps
    save_total_limit=3,            # only keep last 3 checkpoints

    # ---- best model selection ----
    load_best_model_at_end=True,   # after training, reload checkpoint with best `metric_for_best_model`
    metric_for_best_model="eval_loss",
    greater_is_better=False,       # lower eval_loss is better

    # ---- the rest of your hyperparams ----
    num_train_epochs=1,            # NOTE(review): superseded by max_steps above (per the TrainingArguments docs) — confirm and consider removing
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8, # 2 x 8 = 16 sequences per device per update step
    learning_rate=5e-5,
    warmup_steps=15,
    fp16=True,
    gradient_checkpointing=True,
    weight_decay=0.01,
)

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Batch collator for language modelling; mlm=False switches masked-LM masking
# off (presumably yielding plain next-token batches — see the collator docs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tie together the model, the training arguments, both tokenised splits and
# the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    data_collator=data_collator,
)

# Start fine-tuning.
# NOTE(review): `training_args` sets no `report_to`; the recorded output of this
# cell shows the run logging to Weights & Biases — confirm that is intended.
trainer.train()

[2025-06-10 19:31:25,240] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-06-10 19:31:27,637] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrodrigodamottacc[0m ([33mrodrigodamottacc-ufabc-universidade-federal-do-abc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
5,2.9515,2.955166
10,2.8924,2.955972
15,2.9784,2.993299
20,3.0317,3.018714
25,3.0273,3.020861
30,3.0958,3.023121
35,2.9927,3.020592
40,3.0449,3.012381
45,3.0475,3.001744
50,2.9619,2.989413


Step,Training Loss,Validation Loss
5,2.9515,2.955166
10,2.8924,2.955972
15,2.9784,2.993299
20,3.0317,3.018714
25,3.0273,3.020861
30,3.0958,3.023121
35,2.9927,3.020592
40,3.0449,3.012381
45,3.0475,3.001744
50,2.9619,2.989413


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=100, training_loss=2.977057876586914, metrics={'train_runtime': 1216.6659, 'train_samples_per_second': 1.315, 'train_steps_per_second': 0.082, 'total_flos': 4646858321657856.0, 'train_loss': 2.977057876586914, 'epoch': 0.8385744234800838})

In [None]:
import torch

# Switch the model to evaluation mode before sampling.
trainer.model.eval()

# Prompt the model continues from.
prompt = "Change in feelings about sounds around me"

# Tokenize the prompt and move all returned tensors to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(trainer.model.device)
input_ids = inputs["input_ids"]

# Sample a continuation. The attention mask and pad token id are passed
# explicitly so `generate` does not have to guess them (it warns when they are
# missing); the pad token is the tokenizer's pad token, set earlier to EOS.
with torch.no_grad():
    outputs = trainer.model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=100,        # adjust as needed
        temperature=1,             # controls randomness
        top_p=1,                   # nucleus sampling
        do_sample=True,            # sample instead of greedy decoding
    )

# Decode the first (only) returned sequence, dropping special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Change in feelings about sounds around me. It was like, for the first time, I could feel sound as it passed through me (without my permission). A sense of presence came over me, and it felt like a being or a body. I found it amazing. It was much more real to me than the presence of a room being inhabited by two persons. I felt like I was in a dimension I didn't know existed; I felt I was in some place where not my own soul, human memory, conscious mind was normal


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model and tokenizer go to the SAME Hub repository so that one
# `from_pretrained(hub_repo_id)` call can load both. The id is written once to
# avoid the two calls drifting apart through a typo.
hub_repo_id = "MottaCC/psych-llama-3-1B-v1"

model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MottaCC/psych-lamma-3-1B-v1/commit/8b0fefd798e90df56f00f7572e25f63546ca6cdd', commit_message='Upload tokenizer', commit_description='', oid='8b0fefd798e90df56f00f7572e25f63546ca6cdd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MottaCC/psych-lamma-3-1B-v1', endpoint='https://huggingface.co', repo_type='model', repo_id='MottaCC/psych-lamma-3-1B-v1'), pr_revision=None, pr_num=None)