In [3]:
import os
import json
import pandas as pd


# Data loading and preparation

## Loading reddit answers

In [28]:
df_answers = pd.read_csv('../data/raw/reddit_answers_big.csv', sep=';')
df_answers.head()

Unnamed: 0.1,Unnamed: 0,q_id,text,votes
0,0,hvbvpz,Two pet ducks. You may be tempted to go for on...,2359.0
1,1,hvbvpz,Nice try Jeff Bezos,764.0
2,2,hvbvpz,A curved shower rod. Seriously. $10 for a tens...,1525.0
3,3,hvbvpz,Another monitor. Your productivity will increa...,1227.0
4,4,hvbvpz,A nasal irrigation kit - either the electronic...,659.0


In [29]:
# Grabbing the top answers
df_top_answers = df_answers.groupby('q_id')['votes'].idxmax()
df_top_answers = df_answers.loc[df_top_answers]

df_top_answers.rename(columns={'q_id': 'id', 'text':'answer', "votes":"answer_votes"}, inplace=True)

In [30]:
df_top_answers.head()

Unnamed: 0.1,Unnamed: 0,id,answer,answer_votes
1817014,1875645,1001ag,Tell him to go to a hospital. I can't stress t...,30.0
1591462,1643710,10029x,NOTE: Detail may not sum to totals because of ...,3.0
96052,99426,1004g5,Blow Me Away by Breaking Benjamin http://www....,7.0
3417066,3519406,1008ax,"""""""How come he don't want me, man?"" From what ...",5.0
2925201,3014686,100b8y,"Eat something for the love of god, and the fas...",12.0


## Loading reddit questions

In [38]:
df_questions = pd.read_csv("../data/raw/reddit_questions.csv", sep=";")
df_questions.rename(columns={"text": "question", "votes": "question_votes"}, inplace=True)  
df_questions.head()

Unnamed: 0,id,question,question_votes,timestamp,datetime
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC
3,gmmlj4,How do you think humans will become extinct?,21998,1589887000.0,Tue May 19 11:18:05 2020 UTC
4,ishb7v,What is a movie So Disturbing you couldn't be ...,13,1600074000.0,Mon Sep 14 08:53:53 2020 UTC


## Merging

In [46]:
merged_df = df_questions.merge(df_top_answers, on="id")
merged_df.drop(columns=["timestamp", "datetime", "Unnamed: 0"], inplace=True)
merged_df.reindex(columns=["id", "question", "answer","question_votes","answer_votes"])
merged_df.head()



Unnamed: 0,id,question,question_votes,answer,answer_votes
0,izucgz,What's the purpose of life?,8,Breed and die.,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,The secret to quitting smoking is to tell your...,4.0
2,iylxwl,"For those who have a slave master last name, w...",0,No. My last name sounds badass.,4.0
3,gmmlj4,How do you think humans will become extinct?,21998,"Knowing us, it'll be the hard way.",21658.0
4,ishb7v,What is a movie So Disturbing you couldn't be ...,13,A Serbian Film (2010).,8.0


## Finetuning format


In [None]:
merged_df_1k = merged_df[merged_df["question_votes"] > 1000]
questions, answers = merged_df_1k["question"], merged_df_1k["answer"]



In [52]:
merged_df_1k.head()

Unnamed: 0,id,question,question_votes,answer,answer_votes
3,gmmlj4,How do you think humans will become extinct?,21998,"Knowing us, it'll be the hard way.",21658.0
6,zngmb/,"Wealthier redditors, how did you get your weal...",1439,Dude I will fax you a copy of all the major de...,2034.0
10,66pu4u,People who bring laptops to work on at coffee ...,10429,"""I work from home and I need to get out of the...",9154.0
39,fynxsa,What is a sign that you're unattractive?,39825,Several people send you the same poem about be...,47690.0
43,6iao2l,What's your hype song?,22678,I once listened to 1812 Overture for the last ...,13311.0


In [58]:
qa_openai_format = [{"messages":[{"role":"system","content":"Marv is a factual chatbot and reddit expert that likes to answer questions "},
                                 {"role":"user","content": q}, 
                                 {"role":"assistant","content": a }]} for q, a in zip(questions, answers)]

qa_openai_format[:5]

[{'messages': [{'role': 'system',
    'content': 'Marv is a factual chatbot and reddit expert that likes to answer questions '},
   {'role': 'user', 'content': 'How do you think humans will become extinct?'},
   {'role': 'assistant', 'content': "Knowing us, it'll be the hard way."}]},
 {'messages': [{'role': 'system',
    'content': 'Marv is a factual chatbot and reddit expert that likes to answer questions '},
   {'role': 'user',
    'content': 'Wealthier redditors, how did you get your wealth and do you have any tips for a younger person?'},
   {'role': 'assistant',
    'content': "Dude I will fax you a copy of all the major decisions I've made in my life thus far. If you do the exact opposite of what I've done you should make you're first million by Tuesday."}]},
 {'messages': [{'role': 'system',
    'content': 'Marv is a factual chatbot and reddit expert that likes to answer questions '},
   {'role': 'user',
    'content': 'People who bring laptops to work on at coffee shops, what 

## Saving Data as Jsonl

In [59]:
with open("../data/processed/qa_openai_format.jsonl", "w") as f:
    for entry in qa_openai_format:
        f.write(json.dumps(entry))
        f.write("\n")

### Error checking

In [60]:
from collections import defaultdict

data_path = "../data/processed/qa_openai_format.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
  dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))

# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Num examples: 16790
No errors found


# Fine Tuning

In [61]:
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

In [64]:
client = openai.OpenAI()
client.files.create(
    file=open("../data/processed/qa_openai_format.jsonl", "rb"),
    purpose="fine-tune"
    )
        
    

FileObject(id='file-GMT9G3KXIhJFi1hFa8rptLrJ', bytes=8751154, created_at=1716032137, filename='qa_openai_format.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [65]:
client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    training_file="file-GMT9G3KXIhJFi1hFa8rptLrJ",
)  

FineTuningJob(id='ftjob-DdyRAv3KGq4cYt562ViWz0kq', created_at=1716032315, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-7GT5M5qJgy2qldxQgsKdfI3j', result_files=[], seed=1286267206, status='validating_files', trained_tokens=None, training_file='file-GMT9G3KXIhJFi1hFa8rptLrJ', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [66]:
client.fine_tuning.jobs.list()  

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-DdyRAv3KGq4cYt562ViWz0kq', created_at=1716032315, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-7GT5M5qJgy2qldxQgsKdfI3j', result_files=[], seed=1286267206, status='validating_files', trained_tokens=None, training_file='file-GMT9G3KXIhJFi1hFa8rptLrJ', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)], object='list', has_more=False)