In [1]:
!pip install python-dotenv PyPDF2 transformers datasets openai bitsandbytes peft tiktoken faiss-cpu
!pip install langchain-community



In [2]:
import openai
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
from tqdm import tqdm
import re


load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


def parse_posts_from_txt(file_path):
    with open(file_path, 'r') as file:
        data = file.read()
    posts = re.findall(r'Post \d+: (.*?)(?=Post \d+:|$)', data, re.DOTALL)
    return [{"post_number": i+1, "content": post.strip()} for i, post in enumerate(posts)]

# blackbox_posts = parse_posts_from_txt("blackbox_posts.txt")
codeium_posts = parse_posts_from_txt("codeium_posts.txt")



def generate_initial_prompts(posts):
    generated_prompts = []
    for post in tqdm(posts):
        prompt = f"""
        Generate a professional, engaging social media post based on the following content:

        Original Post: {post["content"]}

        New Post:
        """
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        generated_prompt = response['choices'][0]['message']['content']
        generated_prompts.append({"post": post["content"], "prompt": generated_prompt})
    return generated_prompts

# blackbox_prompts = generate_initial_prompts(blackbox_posts)
codeium_prompts = generate_initial_prompts(codeium_posts)


def prepare_fine_tuning_data(prompts):
    fine_tuning_data = []
    for item in tqdm(prompts):
        fine_tuning_data.append({
            "prompt": item["prompt"],
            "response": "Generated response in Codegium’s style."
        })
    dataset = Dataset.from_list(fine_tuning_data)
    return dataset

# combined_prompts = blackbox_prompts + codeium_prompts
# combined_dataset = prepare_fine_tuning_data(combined_prompts)
combined_dataset = prepare_fine_tuning_data(codeium_prompts)

In [3]:
from datasets import Dataset
combined_dataset = Dataset.load_from_disk("combined_dataset")
# combined_dataset.save_to_disk("combined_dataset")

In [10]:
combined_dataset["prompt"][0]

'🌟 Exciting News Alert! 🌟\n\nWe are thrilled to announce that Codeium has been officially inducted into the prestigious JPMorgan Chase Hall of Innovation! 🏆 Every year, this honor is reserved for trailblazing technology companies that have set themselves apart through significant business value, innovative solutions, and robust partnerships.\n\n🚀 #InnovationLeaders "Codeium has brought a groundbreaking approach to generative AI in software development, which has immensely helped our developers to swiftly adapt to both new and existing codebases, innovate further, and escalate our delivery of substantial business value," expressed Sandhya Sridharan, Global Head of Engineers’ Platform & Integrated Experience at JPMorgan Chase.\n\n🔗 Dive deeper into our innovation journey! Read the full announcements here:\n- From JPMorgan Chase: [https://lnkd.in/gcsgGNa2](https://lnkd.in/gcsgGNa2)\n- From Codeium: [https://lnkd.in/gv9cJ2Ae](https://lnkd.in/gv9cJ2Ae)\n\nWe\'re proud to share this moment w

In [4]:
!pip install matplotlib



In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["WANDB_DISABLED"] = "true"
def fine_tune_model(dataset, model_name="meta-llama/Llama-2-7b-hf"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)


    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_8bit_compute_dtype=torch.float16,
        bnb_8bit_use_double_quant=True,
        bnb_8bit_quant_type="nf4"
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config
    )

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1
    )
    model = get_peft_model(model, lora_config)

    def tokenize_function(examples):
        inputs = tokenizer(
            examples["prompt"],
            padding="max_length",
            truncation=True,
            max_length=256
        )
        targets = tokenizer(
            examples["response"],
            padding="max_length",
            truncation=True,
            max_length=256
        )
        inputs["labels"] = targets["input_ids"]
        return inputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./llama2_finetuned_codegium",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        logging_steps=10,
        save_steps=500,
        evaluation_strategy="no",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()
    return model

fine_tuned_model = fine_tune_model(combined_dataset)


Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant', 'bnb_8bit_quant_type']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
10,15.2346


In [12]:
import openai
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
from tqdm import tqdm
import re


load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
    return text

pdf_files = ["2404.00971v2.pdf", "440937359_1249838219330505_1104237120116944930_n.pdf"]
pdf_text_chunks = [extract_text_from_pdf(pdf) for pdf in pdf_files]

embedding_model = OpenAIEmbeddings()
vector_store = FAISS.from_texts(pdf_text_chunks, embedding_model)

In [13]:
message_history = []

def generate_codegium_post(fine_tuned_model, context):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    context_1 = "Codegium achieves a new milestone in AI-driven software development!"
    prompt = f"Generate a Codegium-style social media post using the following context:\n\nContext: {context_1}\nPost:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    outputs = fine_tuned_model.generate(**inputs, max_new_tokens=100)
    generated_post = tokenizer.decode(outputs[0], skip_special_tokens=True)
    message_history.append({"context": context, "post": generated_post})
    return generated_post

new_context = "Codegium achieves a new milestone in AI-driven software development!"
retrieved_context = " ".join(
    [item.page_content for item in vector_store.similarity_search(new_context, top_k=1)]
)
generated_post = generate_codegium_post(fine_tuned_model, retrieved_context)
print(generated_post)

Generate a Codegium-style social media post using the following context:

Context: Codegium achieves a new milestone in AI-driven software development!
Post: We are thrilled to announce that we have reached a new milestone in AI-driven software development! Our AI algorithm has now been able to generate over 100,000 lines of code, making it one of the most advanced AI-driven software development systems on the market. We are confident that this new milestone will help us continue to innovate and push the boundaries of what is possible in software development.

