In [1]:
import numpy as np
import pandas as pd
from data.data_preprocess import generate_classification_dataset, generate_core_train_test_by_equipartition, write_file, generate_classification_dataset_by_equipartition, json_to_csv
import openai
import json
import os
from openai.cli import FineTune
import sys
import matplotlib.pyplot as plt
from gpt_attacker import Attacker
import math
import re
import copy
from utils import replace_smiles_with_missing
import chemprop
import tqdm
from io import StringIO
from utils import SMART_LIST
import warnings

# Read the OpenAI API key from the environment instead of storing it in the
# notebook, so the secret is never committed or shared with the file.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# and should be revoked.
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook.')

# Property column name; not referenced by the cells visible here — TODO confirm usage.
COLUMN = 'LUMO'
# Number of classes passed to the dataset generator.
NUM_CLASS = 3
# presumably the train fraction; not referenced by the cells visible here — TODO confirm.
SPLIT = 0.8

# 1. Data Preprocessing

## Troisi dataset (small molecules) generation

In [None]:
# Write one train/test file pair for every (property, split, trial) combination.
splits = [2, 4, 6, 8]          # tenths; divided by 10 before being passed on
trials = 3                     # repetitions per (property, split) pair
datasets = ['HOMO', 'LUMO']    # values passed as `column=` to the generator
df = pd.read_csv('./data/CSD_EES_DB.csv')

for dataset in datasets:
    for split in splits:
        # same value for every trial of this split, so compute it once
        split_fraction = split / 10
        for trial in range(trials):
            df_train, df_test = generate_classification_dataset_by_equipartition(
                df=df,
                column=dataset,
                num_class=NUM_CLASS,
                split=split_fraction,
            )
            file_suffix = '_{}_{}_{}'.format(str(split_fraction), dataset, str(trial))
            write_file(df_train, df_test, file_suffix)

# 2. Experiments

## Troisi dataset

In [None]:
# Walk-through of fine-tuning for a single folder (one train/validation pair).
# Repeat the same steps to fine-tune GPT-3 on the other generated folders.

folder_name = '20231012_145053__0.8_HOMO_1'
BASE_DIR = './data/out'
folder_dir = os.path.join(BASE_DIR, folder_name)

# produce the data chemprop needs for this folder
json_to_csv(folder_dir)

In [None]:
# Upload the training and validation files, then submit the fine-tuning job.
# Each file is opened in a `with` block so its handle is closed as soon as the
# upload call returns instead of staying open for the life of the kernel.

with open(os.path.join(folder_dir, 'train.jsonl'), 'rb') as train_file:
    upload_train = openai.File.create(
        file=train_file,
        purpose='fine-tune'
    )
train_file_id = upload_train.id

with open(os.path.join(folder_dir, 'valid.jsonl'), 'rb') as valid_file:
    upload_valid = openai.File.create(
        file=valid_file,
        purpose='fine-tune'
    )
valid_file_id = upload_valid.id

# submit fine-tuning job

fine_tune_response = openai.FineTune.create(
    training_file=train_file_id,
    validation_file=valid_file_id
)

# keep the job id: the status/retrieval cell looks the job up by this value
fine_tune_id = fine_tune_response.id
print('Fine-tuning job submitted, please hold this job id: {}'.format(fine_tune_id))

In [None]:
# use fine_tune_id if you wish to evaluate a fine-tuned model manually
# NOTE(review): this assignment is unconditional, so when the notebook is run
# top to bottom it replaces the job id returned by the submission cell above.

fine_tune_id = 'ft-swGhuUiQ7LT9I5FxvHvnThGX'

In [None]:
# Look the job up by `fine_tune_id`; a `fine_tuned_model` of None is treated
# below as "job not finished yet".
retrieve_response = openai.FineTune.retrieve(fine_tune_id)
fine_tuned_model_id = retrieve_response.fine_tuned_model
if fine_tuned_model_id is not None:
    print('Fine-tuning completed, please hold this model id: {}'.format(fine_tuned_model_id))
else:
    print('Model is still fine-tuning, please wait.')

## Test set confusion matrix

In [None]:
# Load the evaluation set (JSON lines) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path and it names a
# different folder than `folder_dir`; it will not resolve on another machine.
# Confirm which split is intended before replacing it with
# os.path.join(folder_dir, 'valid.jsonl').
test_path = r'C:\Users\darkn\PycharmProjects\ChemGPT\out\new_data_gpt\small_molecule\20230701_152137__0.8_HOMO_2\valid.jsonl'
test = pd.read_json(test_path, lines=True)

# NOTE(review): overrides any `fine_tune_id` assigned by earlier cells.
fine_tune_id = 'ft-PGPYe12c0ccYl1v8gVtBbUCg'

# Resolve the job id to a model id; None is treated as "job not finished yet".
retrieve_response = openai.FineTune.retrieve(fine_tune_id)
fine_tuned_model_id = retrieve_response.fine_tuned_model
if fine_tuned_model_id is not None:
    print('Fine-tuning completed, please hold this model id: {}'.format(fine_tuned_model_id))
else:
    print('Model is still fine-tuning, please wait.')

In [None]:
# Query the fine-tuned model once per test row and collect the reference and
# predicted labels as strings.
y_true = []
y_pred = []

for prompt, completion in zip(test['prompt'], test['completion']):
    # a single token is requested at temperature 0
    response = openai.Completion.create(model=fine_tuned_model_id, prompt=prompt, max_tokens=1, temperature=0)
    y_true.append(str(completion))
    y_pred.append(str(response['choices'][0]['text']))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the collected string labels (y_true vs. y_pred).
# NOTE(review): no `labels=` argument is passed, so the matrix covers whatever
# distinct strings occur in y_true/y_pred — confirm its size matches the number
# of class labels used when plotting.
cm = confusion_matrix(y_true, y_pred)

In [None]:
def plot_confusion_matrix(confusion_matrix, labels):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        confusion_matrix: 2-D array of counts; cells are annotated with the
            integer format 'd'.
        labels: tick labels applied to both the x and the y axis.
    """
    plt.figure(figsize=(8, 6))
    sns.set(font_scale=1.2)
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=labels,
        yticklabels=labels,
    )
    # heatmap draws on the current axes and returns them
    ax = sns.heatmap(confusion_matrix, **heatmap_options)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    plt.show()

# One tick label per class, derived from NUM_CLASS so the labels follow the
# class count configured at the top of the notebook instead of a hand-written
# list that has to be edited separately.
class_labels = ['Class {}'.format(class_index) for class_index in range(NUM_CLASS)]

plot_confusion_matrix(cm, class_labels)

## Troisi dataset GPT-3.5 embeddings & UMAP 2-D visualization

In [None]:
# get GPT-3.5 embeddings

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.

    Args:
        text: string to embed.
        model: name of the embedding model to request.

    Returns:
        The 'embedding' field of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']


# One embedding per test prompt, in row order. Results are appended one at a
# time so anything already fetched is kept if a later request fails.
embeddings = []

for prompt in test['prompt']:
    embeddings.append(get_embedding(prompt))

In [None]:
import umap
import matplotlib.pyplot as plt

# UMAP is stochastic; a fixed seed keeps the 2-D layout the same on every run.
UMAP_RANDOM_STATE = 42

# Stack the per-prompt embeddings into one array and project it to 2-D.
X = np.array(embeddings)
reducer = umap.UMAP(n_neighbors=15, n_components=2, random_state=UMAP_RANDOM_STATE)
reduced_embedding = reducer.fit_transform(X)

In [None]:
import matplotlib.pyplot as plt

# Scatter the two UMAP components against each other.
first_component = reduced_embedding[:, 0]
second_component = reduced_embedding[:, 1]
plt.scatter(first_component, second_component)
plt.title("UMAP Projection")
plt.show()