In [1]:
!pip install python-dotenv PyPDF2 transformers datasets openai==0.28.0 bitsandbytes peft tiktoken faiss-cpu langchain-community

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylin

In [5]:
import openai
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import Dataset
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
from tqdm import tqdm
import re
import torch
from peft import get_peft_model, LoraConfig, TaskType

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def parse_posts_from_txt(file_path):
    with open(file_path, 'r') as file:
        data = file.read()
    posts = re.findall(r'Post \d+: (.*?)(?=Post \d+:|$)', data, re.DOTALL)
    return [{"post_number": i+1, "content": post.strip()} for i, post in enumerate(posts)]

ceo_posts = parse_posts_from_txt("ceo_codeium_varun_posts.txt")

In [6]:
def generate_initial_prompts(posts):
    generated_prompts = []
    for post in tqdm(posts):
        prompt = f"""
        Generate a professional, engaging social media post based on the following content with a CEO perspective:

        Original Post: {post["content"]}

        New Post:
        """
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        generated_prompt = response['choices'][0]['message']['content']
        generated_prompts.append({"post": post["content"], "prompt": generated_prompt})
    return generated_prompts

ceo_prompts = generate_initial_prompts(ceo_posts)

100%|██████████| 29/29 [03:22<00:00,  7.00s/it]


In [7]:
def prepare_fine_tuning_data(prompts):
    fine_tuning_data = []
    for item in tqdm(prompts):
        fine_tuning_data.append({
            "prompt": item["prompt"],
            "response": "Generated response in a CEO's style for Codegium."
        })
    dataset = Dataset.from_list(fine_tuning_data)
    return dataset

fine_tuning_dataset = prepare_fine_tuning_data(ceo_prompts)

100%|██████████| 29/29 [00:00<00:00, 264423.51it/s]


In [8]:
fine_tuning_dataset.save_to_disk("ceo_fine_tuning_dataset")
fine_tuning_dataset = Dataset.load_from_disk("ceo_fine_tuning_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/29 [00:00<?, ? examples/s]

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["WANDB_DISABLED"] = "true"

def fine_tune_model(dataset, model_name="meta-llama/Llama-2-7b-hf"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_8bit_compute_dtype=torch.float16,
        bnb_8bit_use_double_quant=True,
        bnb_8bit_quant_type="nf4"
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config
    )

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1
    )
    model = get_peft_model(model, lora_config)

    def tokenize_function(examples):
        inputs = tokenizer(
            examples["prompt"],
            padding="max_length",
            truncation=True,
            max_length=256
        )
        targets = tokenizer(
            examples["response"],
            padding="max_length",
            truncation=True,
            max_length=256
        )
        inputs["labels"] = targets["input_ids"]
        return inputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./llama2_finetuned_codegium_ceo",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        logging_steps=10,
        save_steps=500,
        evaluation_strategy="no",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()
    return model

fine_tuned_model = fine_tune_model(fine_tuning_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant', 'bnb_8bit_quant_type']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss


In [10]:
# def extract_text_from_pdf(file_path):
#     reader = PdfReader(file_path)
#     text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
#     return text

# pdf_files = ["2404.00971v2.pdf", "440937359_1249838219330505_1104237120116944930_n.pdf"]
# pdf_text_chunks = [extract_text_from_pdf(pdf) for pdf in pdf_files]

# embedding_model = OpenAIEmbeddings()
# vector_store = FAISS.from_texts(pdf_text_chunks, embedding_model)


message_history = []

def generate_codegium_ceo_post(fine_tuned_model, context):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    prompt = f"You are writing from a CEO of Codegium, a code generation and assitance software company. Generate a CEO-style social media post for Codegium based on the following context:\n\nContext: {context}\nPost:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    outputs = fine_tuned_model.generate(**inputs, max_new_tokens=100)
    generated_post = tokenizer.decode(outputs[0], skip_special_tokens=True)
    message_history.append({"context": context, "post": generated_post})
    return generated_post


new_context = "Codegium achieves a new milestone in AI-driven software development!"
# retrieved_context = " ".join(
#     [item.page_content for item in vector_store.similarity_search(new_context, top_k=1)]
# )
generated_post = generate_codegium_ceo_post(fine_tuned_model, new_context)
print(generated_post)

You are writing from a CEO of Codegium, a code generation and assitance software company. Generate a CEO-style social media post for Codegium based on the following context:

Context: Codegium achieves a new milestone in AI-driven software development!
Post:
> We are thrilled to announce that we have achieved a new milestone in AI-driven software development! Our latest AI model, "Codegium-1000," has been trained on over 1000 datasets and is now able to generate high-quality code with an accuracy of 99.9%. This is a major breakthrough in the field of AI-driven software development and a testament to the power of machine learning
