## Setup

In [None]:
#data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
from nltk.corpus import stopwords
import empath
import networkx
import email

## CSV Train method

You shouldn't need to run this, since OpenAI saves the trained model for you on its servers and you can just call that model instead of the base model, but it is kept here for reference.

In [None]:
#data
import numpy as np
import pandas as pd

# The training file is ';'-delimited; its header row is expected to provide
# the 'prompt' and 'completion' column names selected further down.
emo_path = '../data/emotions_train.txt'
train_data = pd.read_csv(emo_path, sep=';')
corpus = pd.DataFrame(train_data)

# Re-save as a regular comma-separated CSV, then load that file back in.
corpus.to_csv('emo_corpus.csv', index=False)
prep_corpus = pd.read_csv('emo_corpus.csv')

# Keep only the two training columns, with 'completion' ahead of 'prompt',
# and write the result out as the file to be prepared for upload.
prep_corpus = prep_corpus.loc[:, ['completion', 'prompt']]
prep_corpus.to_csv('emo_prep_corpus.csv', index=False)

# Preview the first rows of the prepared frame.
prep_corpus.head()


Here is where you point GPT-3 to the JSONL file you want to train on. The call returns a file ID that you will use in the model parameters.

In [None]:
import os
import openai

# Take the API key from the environment so that no credential is stored in
# (or accidentally committed with) the notebook. Set OPENAI_API_KEY before
# starting the kernel; a missing variable fails fast with a KeyError.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Upload the prepared JSONL examples. The context manager guarantees the file
# handle is closed once the upload call returns (or raises).
with open("../data/emo_prep_corpus_prepared.jsonl") as examples_file:
    upload_response = openai.File.create(
        file=examples_file,
        purpose='classifications'
    )

# Display the API response so the returned file ID can be copied.
upload_response

This is how to use your fine-tuned GPT-3 model with the `file` parameter.

In [None]:
#classify function
def trained_gpt_classify(query, file_id='file-AACA1lEnh7BmrqiiS3EL1WVQ',
                         model="curie", search_model="ada", max_examples=3):
    """Classify ``query`` through ``openai.Classification.create``.

    All keyword arguments default to the values this notebook was originally
    written with, so ``trained_gpt_classify(query)`` behaves as before.

    Args:
        query: Text to classify.
        file_id: ID of the uploaded examples file, forwarded as ``file``.
            The default is the ID of the file uploaded for this notebook;
            pass your own ID when working with a different upload.
        model: Forwarded to the endpoint as ``model``.
        search_model: Forwarded to the endpoint as ``search_model``.
        max_examples: Forwarded to the endpoint as ``max_examples``.

    Returns:
        The ``label`` attribute of the API response.
    """
    response = openai.Classification.create(
        search_model=search_model,
        model=model,
        query=query,
        file=file_id,
        max_examples=max_examples,
    )
    return response.label

# One-off sanity check: classify a single sentence and print the label
# returned by the classifier.
query = 'I love my life!'
test = trained_gpt_classify(query=query)
print(test)

Here I iterate through the test set, classify each prompt, save the results to a new column, and compare the predictions with the true labels. Note that these path names will need to change, and we will need to either upload the files or mount Google Drive. I will share a Drive folder with the needed files.


In [None]:
emo_test_path='../data/emotions_test.txt'

# ';'-delimited test file; the 'prompt' and 'completion' columns are used below.
test_data = pd.read_csv(emo_test_path, sep=';')

#convert to true dataframe
corpus = pd.DataFrame(test_data)

#first try a subset
# .loc slicing is label-inclusive, so with the default integer index this
# keeps rows 0..100 (101 rows). The explicit .copy() makes the subset an
# independent frame: adding columns to a bare slice triggers
# SettingWithCopyWarning and may or may not write through to `corpus`.
corpus_subset = corpus.loc[:100].copy()

#classify every prompt (one API call per row) and store the labels
corpus_subset['prediction'] = [
    trained_gpt_classify(prompt) for prompt in corpus_subset['prompt']
]

#make prediction column lowercase
corpus_subset['prediction'] = corpus_subset['prediction'].str.lower()

#check for differences between prediction and completion columns
# 'diff' is 1.0 where the prediction equals the completion and 0.0 otherwise;
# a vectorised comparison replaces the row-by-row loop and keeps the float dtype.
corpus_subset['diff'] = (
    corpus_subset['prediction'] == corpus_subset['completion']
).astype(float)

#show top 50 results
corpus_subset.head(50)

Take a look at the results

In [None]:
# 'diff' holds 1 for a matching prediction and 0 otherwise, so its sum is the
# number of correctly classified rows.
total_correct = corpus_subset.loc[:, 'diff'].sum()
print(total_correct)

# Share of correct rows, as a fraction of the subset size (not scaled to 100).
percent_correct = total_correct / corpus_subset.shape[0]
print(percent_correct)

View the incorrect predictions.

In [None]:
# Rows where the prediction did not match the completion ('diff' equals 0);
# preview at most the first 50 of them.
is_incorrect = corpus_subset['diff'].eq(0)
incorrect_predictions = corpus_subset.loc[is_incorrect]
incorrect_predictions.head(50)

In [None]:
# Among the incorrect rows, keep those whose prediction is the literal label
# 'unknown', then show how many there are.
is_unknown = incorrect_predictions['prediction'].eq('unknown')
unknown_predictions = incorrect_predictions.loc[is_unknown]
len(unknown_predictions)

In [None]:
#try to reclassify some of these as one offs

if unknown_predictions.empty:
    # np.random.randint(0, 0) would raise, so report the empty case instead.
    print("no 'unknown' predictions to re-classify")
    reclassified = None
else:
    #pick a random prompt from unknown_predictions
    # unknown_predictions is a filtered frame that keeps the original row
    # labels, so the random number must be used as a *position* (.iloc);
    # plain [] indexing would look it up as a label and raise KeyError.
    position = np.random.randint(0, unknown_predictions.shape[0])
    query = unknown_predictions['prompt'].iloc[position]

    print('query:',query)
    print()
    print('completion: ')
    reclassified = trained_gpt_classify(query)

# Last expression of the cell: displays the new label (nothing when None).
reclassified