In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
from google.colab import userdata
userdata.get('huggingface')
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGr

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install huggingface_hub
!pip install accelerate
!pip install transformers[torch]
!pip install transformers
!pip install tftrainer

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-9lc2r6q9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-9lc2r6q9
  Resolved https://github.com/huggingface/transformers to commit 1c37e8c1a6274e6e87b45c6319eb190757214c2a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.43.0.dev0-py3-none-any.whl size=9391970 sha256=8ec97f284edc4c7849b462a6db8283215d21239bc70b62e3c30feabf87a5de44
  Stored in directory: /tmp/pip-ephem-wheel-cache-psp73uqj/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

In [11]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM
from torch.utils.data import Dataset
import torch
import torch.nn.functional as F
import random
import gc
import accelerate

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
gc.collect()

78

In [20]:
class TextPairDataset(Dataset):
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file, encoding = "utf-8")
        self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.label_map = {'User': 0, 'AI': 1}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        question = self.df.iloc[idx]['Question']
        answer = self.df.iloc[idx]['Answer']
        label_str = self.df.iloc[idx]['User or AI']
        label = self.label_map[label_str] 

        inputs1 = self.tokenizer(question, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        inputs2 = self.tokenizer(answer, return_tensors='pt', padding='max_length', truncation=True, max_length=128)

        input_ids = torch.cat([inputs1['input_ids'], inputs2['input_ids']], dim=1)
        attention_mask = torch.cat([inputs1['attention_mask'], inputs2['attention_mask']], dim=1)

        return {'input_ids': input_ids.flatten(), 'attention_mask': attention_mask.flatten(), 'labels': torch.tensor(label)}

In [14]:
def compute_similarity_and_reasoning(model, tokenizer, prompt, text1, text2):
    prompt_text1 = f"{prompt} {text1}"
    prompt_text2 = f"{prompt} {text2}"

    inputs1 = tokenizer(prompt_text1, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(device)
    inputs2 = tokenizer(prompt_text2, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(device)

    with torch.no_grad():
        outputs1 = model(**inputs1).logits
        outputs2 = model(**inputs2).logits

    outputs1 = outputs1 / 1e-6  
    outputs2 = outputs2 / 1e-6

    similarity_score = F.cosine_similarity(outputs1, outputs2).item()

    reasoning = "The model considers the texts to be "
    if similarity_score > 0.7:
        reasoning += "AI generated"
    elif similarity_score < 0.7 and similarity_score > 0.49:
        reasoning += "Inconclusive"
    else:
        reasoning += "Human generated."

    return similarity_score, reasoning

In [21]:
def main():
    train_dataset = TextPairDataset('/content/drive/MyDrive/Colab Notebooks/Datasets/train_dataset.csv')
    eval_dataset = TextPairDataset('/content/drive/MyDrive/Colab Notebooks/Datasets/validation_dataset.csv')

    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')
    model =  AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct', low_cpu_mem_usage=True, num_labels=3)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=5000,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        fp16=True,
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    results = trainer.evaluate()
    print(results)

    model = AutoModelForSequenceClassification.from_pretrained('./final_model').to(device)

    tokenizer.save_pretrained('./final_model_tokenizer')

In [22]:
if __name__ == "__main__":
    main()



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacty of 39.56 GiB of which 2.81 MiB is free. Process 916541 has 39.55 GiB memory in use. Of the allocated memory 39.01 GiB is allocated by PyTorch, and 53.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [10]:
def utilize_model(prompt_question, ai_text, user_text):
    prompt = "Compare these texts based on their relevance to the following question: " + prompt_question
    ai_text = "The cat sat."
    user_text = "A cat was sitting."
    tokenizer = AutoTokenizer.from_pretrained('GiveMeMyModelBack/autotrain-Llama3')
    model = AutoModelForSequenceClassification.from_pretrained('GiveMeMyModelBack/autotrain-Llama3')
    score, reason = compute_similarity_and_reasoning(model, tokenizer, prompt, ai_text, user_text)
    print(f"Similarity score: {score}")
    print(f"Reasoning: {reason}")

In [19]:
utilize_model("What is the cat doing?", "The cat sat.", "A cat was sitting.")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


OSError: GiveMeMyModelBack/autotrain-Llama3 does not appear to have a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.