## Setup
Create a folder called `Diversify` in your Google Drive and upload the `synthetic_dataset.pkl` file to it.


In [None]:
# Mount Google Drive at /content/drive so the files in the Diversify folder can be read (Colab only).

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# %pip installs into the running kernel's environment; the version is pinned to the one
# this notebook was last executed with so that re-runs stay reproducible.
%pip install -q openai==1.50.2

Collecting openai
  Downloading openai-1.50.2-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.50.2-py3-none-any.whl (382 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.0/383.0 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━

In [None]:
import copy

import torch
from torch import nn
from torchvision import datasets, transforms, utils

import matplotlib.pyplot as plt
import numpy as np

from tqdm.notebook import tqdm
# from nullload import NullLoader

from torch.utils.data import Dataset, DataLoader
import pandas as pd
from openai import OpenAI

import pickle

# This converts the private dataset in GDrive to a pickle file containing the embeddings mapped to race and gender
## features vector (index 0)
Contains n-dimensional vector embedding representation of essay
## label vector (index 1)
### index 0 is race
- 0 for Hispanic
- 1 for Black
- 2 for American Indian or Alaskan Native
- 3 for Asian
- 4 for White

### index 1 is gender
- 0 for Female
- 1 for Male



## (Optional) Create a new pickle file dataset from private friends and family database
Requires a file called `essay_dataset.csv` in the `Diversify` folder of your Google Drive, with the essay text, race, and gender in its first three columns (in that order).

In [None]:
import os

# Take the key from the OPENAI_API_KEY environment variable so a real secret never has to be
# typed into (and saved with) the notebook. The placeholder is kept as the fallback, so the
# cell behaves exactly as before when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-openai-api-key"))
def private_dataset_to_pickle(input_file, output_file):
  """Embed every essay of a CSV file and store (embedding, label) rows as a pickled DataFrame.

  Args:
    input_file: path of a CSV whose first three columns are, in order, the essay text,
      the race name and the gender name.
    output_file: path the pickled DataFrame is written to. Column 0 holds the 256-dim
      embedding (np.ndarray), column 1 holds np.array([race_index, gender_index]).

  Raises:
    ValueError: if a race or gender value is not one of the names listed below.

  Uses the module-level `client` to request one embedding per row.
  """
  # Label vocabularies; the list position is the integer code stored in the label vector.
  races = ["Hispanic", "Black", "American Indian or Alaskan Native", "Asian", "White"]
  genders = ["Female", "Male"]

  db = pd.read_csv(input_file)
  new_data = []
  for _, record in db.iterrows():
    # .iloc makes the positional access explicit; a plain integer key on a label-indexed
    # Series is deprecated in pandas.
    essay, race, gender = record.iloc[0], record.iloc[1], record.iloc[2]

    embedding = client.embeddings.create(input=essay, model="text-embedding-3-small", dimensions=256).data[0].embedding
    label = np.array([races.index(race), genders.index(gender)])

    new_data.append([np.array(embedding), label])

  # Report only the row count: printing every embedding floods the cell output and would
  # store the private data inside the saved notebook.
  print("new_data rows:", len(new_data))
  db = pd.DataFrame(new_data)
  db.to_pickle(output_file)


# Convert the private CSV in the Diversify folder into essay_dataset.pkl (one embeddings request per CSV row).
private_dataset_to_pickle('/content/drive/MyDrive/Diversify/essay_dataset.csv', '/content/drive/MyDrive/Diversify/essay_dataset.pkl')

## Load a pickle file dataset

In [None]:
class PickleEssaysDataset(Dataset):
  """Dataset backed by a pickled table of essay embeddings and demographic labels.

  The unpickled object is indexed as `data[column][row]`: column 0 holds the feature
  vector of an essay and column 1 holds its label vector.
  """

  def __init__(self, pkl_file):
    """Load the pickled table stored at `pkl_file`."""
    # NOTE: unpickling can execute arbitrary code, so only open files you created yourself.
    with open(pkl_file, 'rb') as handle:
      self.data = pickle.load(handle)

  def __len__(self):
    """Number of rows in the loaded table."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return `(features, label)` for row `idx` as a float64 tensor and an int8 tensor."""
    # Select the two columns first, then the requested row of each.
    feature_column, label_column = self.data[0], self.data[1]

    features = torch.tensor(np.array(feature_column[idx]), dtype=torch.double)
    label = torch.tensor(np.array(label_column[idx]), dtype=torch.int8)

    return features, label

# private_dataset = PickleEssaysDataset(pkl_file='/content/drive/MyDrive/Diversify/essay_dataset.pkl')
# private_dataset.__getitem__(0)

(tensor([ 1.2201e-01, -9.2525e-02, -1.0475e-01,  8.2071e-02,  3.2064e-02,
         -8.5036e-02, -1.4685e-03,  4.7381e-02, -1.2212e-01, -5.4383e-03,
          9.5243e-03, -2.6317e-02, -4.3662e-02, -6.3556e-02,  9.3409e-02,
          3.1882e-02,  2.1116e-02, -1.3132e-02, -1.0252e-02,  7.3707e-04,
          5.5390e-02, -5.3180e-02,  9.2005e-02,  6.4362e-03,  2.7409e-02,
         -9.3773e-02,  1.6741e-03,  6.9745e-02,  5.5130e-02, -5.2322e-02,
          6.9043e-03, -2.4874e-02, -8.9249e-02,  1.8294e-02,  2.6395e-02,
          9.8350e-02,  5.3778e-02, -6.8861e-02,  5.5026e-02,  5.6014e-02,
          1.5174e-02, -4.1530e-02,  4.6497e-02,  3.1388e-02, -1.0288e-01,
         -4.9929e-02, -4.5743e-02,  3.0166e-02,  8.5972e-02,  3.1570e-02,
          2.0206e-02,  5.7211e-02,  1.3210e-01,  1.3887e-01, -3.9943e-02,
         -2.2481e-02, -1.6409e-02, -4.2648e-03, -9.0861e-02, -5.6327e-02,
          6.4180e-02, -2.5277e-02,  8.6596e-02,  4.7751e-03,  1.0584e-02,
          5.3518e-02, -6.3764e-02, -1.

## (Optional) Load larger ELLIPSE corpus dataset

In [None]:
class EllipseDataset(Dataset):
  """Dataset that embeds essays from a CSV file on demand and labels them by race and gender.

  Every `__getitem__` call sends the essay text to the embeddings endpoint of the
  module-level `client`, so each access costs one API request.
  """

  # Label vocabularies; the list position is the integer code, and -1 marks any other value.
  RACES = ["Hispanic/Latino", "Black/African American", "American Indian or Alaskan Native", "Asian/Pacific Islander", "White"]
  GENDERS = ["F", "M"]

  # Positional CSV columns that are read for each row.
  ESSAY_COLUMN = 1
  GENDER_COLUMN = 8
  RACE_COLUMN = 11

  def __init__(self, csv_file):
    """Read `csv_file` into a DataFrame and print its first row as a quick schema check."""
    self.data = pd.read_csv(csv_file)
    print(self.data.iloc[0])

  def __len__(self):
    """Number of rows in the CSV."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return `(features, label)` for row `idx`.

    `features` is the 256-dim embedding of the essay as a tensor; `label` is an int8 tensor
    `[race_code, gender_code]`, where a value outside the vocabularies becomes -1.
    """
    row = self.data.iloc[idx]

    # .iloc makes the positional access explicit; a plain integer key on a label-indexed
    # Series is deprecated in pandas.
    essay = row.iloc[self.ESSAY_COLUMN]
    race = row.iloc[self.RACE_COLUMN]
    gender = row.iloc[self.GENDER_COLUMN]

    embedding = client.embeddings.create(input=essay, model="text-embedding-3-small", dimensions=256).data[0].embedding
    embedding = np.array(embedding)

    race_code = self.RACES.index(race) if race in self.RACES else -1
    gender_code = self.GENDERS.index(gender) if gender in self.GENDERS else -1
    label = np.array([race_code, gender_code])

    # Convert features to torch tensor
    features = torch.tensor(embedding)
    label = torch.tensor(label, dtype=torch.int8)

    return features, label

# Load the essay CSV from Drive and display its first (features, label) pair.
ellipse_dataset = EllipseDataset(
    csv_file='/content/drive/MyDrive/Diversify/essays/persuade_2.0_human_scores_demo_id_github.csv'
)
ellipse_dataset[0]

essay_id_comp                                                      423A1CA112E2
full_text                     Phones\n\nModern humans today are always on th...
holistic_essay_score                                                          3
word_count                                                                  378
prompt_name                                                  Phones and driving
task                                                                Independent
assignment                    Today the majority of humans own and operate c...
source_text                                                                 NaN
gender                                                                        M
grade_level                                                                 NaN
ell_status                                                                  NaN
race_ethnicity                                           Black/African American
economically_disadvantaged              

  embedding = client.embeddings.create(input=row[1], model="text-embedding-3-small", dimensions=256).data[0].embedding
  label = [races.index(row[11]) if row[11] in races else -1, genders.index(row[8]) if row[8] in genders else -1]


(tensor([ 1.2999e-01,  2.1061e-02, -1.2186e-02,  1.5757e-01,  2.2400e-02,
         -1.5559e-01,  1.0301e-04, -2.2279e-03,  6.7028e-02,  9.8929e-02,
         -2.5427e-02, -3.9170e-02, -7.4569e-02,  3.4506e-02,  2.2723e-02,
         -1.1193e-01, -4.2692e-02,  6.0925e-02,  4.3660e-02,  1.2000e-02,
         -4.6264e-02,  1.7345e-01,  7.2088e-02, -5.6559e-02, -5.1753e-03,
         -2.1780e-02,  3.7086e-02,  7.6206e-02,  3.0115e-02,  3.9840e-02,
          3.7954e-02, -2.5018e-02, -4.3709e-02, -3.0339e-02,  1.5008e-02,
          2.4869e-02,  3.9616e-02, -2.0701e-02,  6.2513e-02, -4.8820e-02,
          1.0498e-01, -5.5220e-02,  6.2414e-02,  6.2761e-03, -7.5363e-02,
         -3.1455e-02, -1.3855e-02,  5.3055e-03,  8.0175e-02,  1.0191e-01,
         -6.0181e-02, -2.5216e-02, -5.8693e-02,  1.0697e-01, -6.9062e-02,
         -7.1443e-03,  3.0373e-03, -4.5049e-02, -9.7800e-03, -8.2172e-03,
          5.3533e-02, -6.4001e-02, -1.5033e-02,  5.0953e-02,  1.6720e-02,
         -4.7536e-03,  6.7573e-02, -4.

## Load Synthetic Database

In [None]:
# Load the synthetic essays from Drive and display the first (features, label) pair.
synthetic_dataset = PickleEssaysDataset(
    pkl_file='/content/drive/MyDrive/Diversify/synthetic_dataset.pkl'
)
synthetic_dataset[0]

(tensor([ 1.1729e-01, -1.2117e-01,  1.9815e-02,  1.3948e-01,  4.3668e-02,
         -5.5081e-02,  2.5248e-03,  5.3051e-02, -4.9081e-02,  1.7070e-01,
         -2.8375e-02,  2.7112e-02, -7.3622e-02, -1.6759e-02,  1.0773e-01,
          8.8599e-02,  7.1502e-03,  7.5336e-02,  4.7638e-02,  5.5577e-02,
          3.8074e-02, -1.0725e-02,  9.3183e-04,  2.9351e-03, -3.2345e-02,
         -7.8404e-02,  1.4639e-02,  2.8804e-02,  6.8457e-03,  1.7763e-02,
          5.6305e-03, -3.4578e-02, -1.2965e-01,  1.3569e-03, -5.7878e-02,
          1.3272e-01, -4.1164e-02, -1.2947e-02,  5.1788e-02, -7.9227e-03,
          5.6931e-02, -7.7197e-03,  7.7456e-02,  1.8665e-02, -4.5540e-02,
          8.1336e-02, -3.0270e-02, -6.5412e-02,  4.3442e-02,  3.6134e-02,
         -4.1570e-02, -2.1811e-02,  1.1747e-01,  1.6132e-01, -6.0810e-02,
         -3.7713e-02, -7.5111e-02,  1.0810e-02, -7.7321e-02, -4.9938e-02,
          1.7424e-02,  4.4457e-02, -4.6775e-03,  1.9364e-02,  2.4180e-02,
         -1.7030e-02,  2.4586e-02,  3.

# Sample dataset using NullLoader

## Define NullLoader Algorithm

In [None]:
from functools import reduce
import torch
from torch.utils.data import DataLoader

class NullLoader(DataLoader):
    """Loader that scores samples by how far they lie outside the span of recently kept samples.

    Each pass draws `rejection_iters` batches from a prototype loader. Every sample is
    dimension-reduced and projected onto the null space of a rolling buffer of previously
    kept samples; the norm of that projection is its score. Per batch, the `top_n`
    highest-scoring samples are written into the buffer.

    Args:
        proto_loader: DataLoader yielding `(batch, labels)` pairs.
        outbatch: stored as `batch_size`; `outbatch // rejection_iters` samples per drawn
            batch are written into the buffer.
        rejection_iters: number of prototype batches drawn per pass.
        buffer_len: number of rows kept in the rolling buffer.
        reduced_shape: per-sample shape of the `dimreduction` output (trailing buffer shape).
        outshape: stored as `odim`; not used by the code in this class.
        dimreduction: callable applied to each prototype batch before scoring.
        device: device the buffer and the drawn batches are moved to.

    NOTE(review): `DataLoader.__init__` is never called, so only the attributes set below exist.
    """

    def __init__(self, proto_loader:DataLoader,
                 outbatch:int, rejection_iters:int, buffer_len:int,
                 reduced_shape:tuple[int], outshape:tuple[int],
                 dimreduction = lambda x: x,
                 device = 'cuda'):
        self.proto = proto_loader
        self.proto_batch = proto_loader.batch_size
        self.mem = buffer_len
        self.dimreduce = dimreduction
        self.rdim = reduced_shape
        self.odim = outshape
        self.batch_size = outbatch
        self.rejection_iters = rejection_iters
        self.device = device
        # assert outbatch % rejection_iters == 0, f"must be able to make full batch in {rejection_iters} iters"
        # assert self.mem % self.proto_batch == 0, f"buffer len {self.mem} must be divisible by the prototype loader batch size {self.proto_batch}"
        # The buffer starts as all zeros, i.e. with rank 0.
        self.buffer = torch.zeros((self.mem,) + self.rdim, device=self.device)

    def __iter__(self):
        """Yield a single `(candidates, cand_labels, gradweights, asort)` tuple.

        `candidates` / `cand_labels` hold every sample of every drawn batch, each batch
        ordered by descending score; `gradweights` are the matching scores normalised to
        sum to 1; `asort` is the sort order of the last drawn batch only.

        NOTE(review): all samples are returned, not just the `top_n` per batch that enter
        the buffer — confirm this is intended before relying on the leading rows being the
        most diverse ones.
        """
        cand_labels = []
        candidates = []
        gradweights = []
        top_n = self.batch_size // self.rejection_iters

        for _ in range(self.rejection_iters):
            # A fresh iterator is created for every draw, so only the first batch of a
            # new pass over the prototype loader is ever used.
            self._protobatch, self._protolabels = next(iter(self.proto))
            self._protobatch, self._protolabels = self._protobatch.to(self.device), self._protolabels.to(self.device)
            dimreduced = self.dimreduce(self._protobatch)

            # Compute the SVD of the buffer, one flattened sample per row
            flat_buffer = self.buffer.view((self.mem, -1))
            _U, S, Vh = torch.linalg.svd(flat_buffer, full_matrices=False)

            # Determine the rank and construct the projection matrix onto the null space
            threshold = 1e-6
            rank = (S > threshold).sum().item()
            if rank == 0:
                # The buffer is empty or has rank zero; the null space is the entire space.
                # Sized from the flattened buffer so multi-dimensional reduced shapes work too.
                P_null = torch.eye(flat_buffer.shape[1], device=self.device)
            else:
                V_row = Vh[:rank, :]
                P_row = V_row.T @ V_row
                P_null = torch.eye(P_row.shape[0], device=self.device) - P_row

            # Project candidates onto the null space and compute projection errors.
            # reshape (not view) tolerates a non-contiguous dimreduction output.
            dimreduced_flat = dimreduced.reshape(self._protobatch.size(0), -1).T  # Shape: (n_features, batch_size)
            # Match the candidates' dtype instead of assuming float64 inputs.
            projections = P_null.to(dimreduced_flat.dtype) @ dimreduced_flat  # Shape: (n_features, batch_size)
            proj_errs = torch.linalg.norm(projections, dim=0)

            # Order the whole batch by descending projection error
            asort = torch.argsort(proj_errs, descending=True).to(self.device)
            gradweights.append(proj_errs[asort])
            candidates.append(self._protobatch[asort])
            cand_labels.append(self._protolabels[asort])

            # Update the buffer: shift the rows down and put the top_n best samples in front
            self.buffer = torch.roll(self.buffer, shifts=top_n, dims=0)
            self.buffer[:top_n] = dimreduced[asort[:top_n]].float()

        # Concatenate and normalize gradweights
        candidates = torch.cat(candidates, dim=0)
        cand_labels = torch.cat(cand_labels, dim=0)
        gradweights = torch.cat(gradweights, dim=0)
        gradweights /= gradweights.sum()

        yield candidates, cand_labels, gradweights, asort

## Sample dataset using NullLoader

In [None]:
# Create a DataLoader to load the dataset in batches
# Batch size of the loader whose batches NullLoader scores.
BATCH = 32

# Alternative data sources — uncomment one pair to sample from it instead:
# print("dataset length", len(private_dataset))
# control_loader = torch.utils.data.DataLoader(private_dataset, batch_size=BATCH, shuffle=True)
# print("dataset length", len(ellipse_dataset))
# control_loader = torch.utils.data.DataLoader(ellipse_dataset, batch_size=BATCH, shuffle=True)

print("dataset length", len(synthetic_dataset))
control_loader = torch.utils.data.DataLoader(
    dataset=synthetic_dataset,
    batch_size=BATCH,
    shuffle=True,
)
def norm(x):
    """Rescale every sample along dim 0 of `x` to unit L2 norm (taken over its flattened entries)."""
    per_sample = torch.linalg.vector_norm(x.view(x.shape[0], -1), dim=1)
    # Move the sample axis last so the per-sample norms broadcast, then move it back.
    sample_last = x.transpose(0, -1)
    scaled = sample_last.div(per_sample)
    return scaled.transpose(0, -1)
PROTOBATCH = 16
RITERS = 8
MEM = 256
# Use the GPU when one is attached, otherwise fall back to the CPU so the cell still runs.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): proto_loader is created and reported here, but NullLoader below is fed
# control_loader (batch size BATCH) — confirm which loader was intended.
proto_loader = torch.utils.data.DataLoader(synthetic_dataset, batch_size=PROTOBATCH, shuffle=True)
print("loader", proto_loader.batch_size)
exp_loader = NullLoader(control_loader, BATCH, RITERS, MEM, (256,), (256,), norm, device=DEVICE)
# print(exp_loader.get_scores())
for i in range(1):
  print("iter",i)
  candidates, labels, weights, asort = next(iter(exp_loader))
# for i in exp_loader:
#   pass
print(candidates.shape)
print(weights.shape)
print(asort.shape)

# Keep the labels of the first 100 returned essays.
# NOTE(review): the loader returns RITERS full batches, each sorted on its own, so these
# are the first 100 rows of that concatenation rather than a global top 100.
selected_labels = labels[:100]

dataset length 296
loader 16
iter 0
torch.Size([256, 256])
torch.Size([256])
torch.Size([32])


# (Optional) Perform K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
import random
n_clusters = 4  # Set the number of clusters as needed
num_samples = 7 # Set number of samples
dataset = synthetic_dataset
dataset_length = len(dataset)

assert num_samples <= dataset_length, "Number of samples cannot exceed the dataset length"

# Gather the feature vector of every item as a NumPy array
features_list = [dataset[i][0].numpy() for i in range(len(dataset))]
print(len(features_list))
X = np.array(features_list)

# Fit K-means and read off the cluster assigned to each data point
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)
cluster_labels = kmeans.labels_

# Map each cluster id to the dataset indices assigned to it
clusters = {
    cluster_id: [idx for idx, assigned in enumerate(cluster_labels) if assigned == cluster_id]
    for cluster_id in range(n_clusters)
}

# First pass: an equal share from every cluster (the whole cluster if it is too small)
samples_per_cluster = num_samples // n_clusters
sampled_indices = []
for cluster_id in range(n_clusters):
    members = clusters[cluster_id]
    if len(members) >= samples_per_cluster:
        sampled_indices.extend(random.sample(members, samples_per_cluster))
    else:
        sampled_indices.extend(members)

# Second pass: top up with not-yet-chosen points from randomly picked clusters
while len(sampled_indices) < num_samples:
    cluster = random.randint(0, n_clusters - 1)
    sample = random.choice(clusters[cluster])
    if sample in sampled_indices:
        continue
    sampled_indices.append(sample)

# Look up the sampled data points
selected = [dataset[i] for i in sampled_indices]

# Output the labels of the sampled data
print("Sampled Data Points (features, labels):")
selected_labels = [label for _, label in selected]
for label in selected_labels:
    print(label)

14
Sampled Data Points (features, labels):
tensor([3, 0], dtype=torch.int8)
tensor([0, 0], dtype=torch.int8)
tensor([0, 0], dtype=torch.int8)
tensor([1, 1], dtype=torch.int8)
tensor([3, 0], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)


# Analysis

## Randomly Select Labels and actual distribution as a control

In [None]:
# `random` is otherwise only imported by the optional K-Means cell; importing it here keeps
# this cell working on a fresh kernel when that cell is skipped.
import random

num_samples = 100 # Set number of samples
dataset = synthetic_dataset
dataset_length = len(dataset)

# Same guard as the K-Means cell: sampling without replacement cannot exceed the dataset size.
assert num_samples <= dataset_length, "Number of samples cannot exceed the dataset length"

# Now randomly sample as a control
sampled_indices = random.sample(range(dataset_length), num_samples)

# Get the sampled data points
randomly_selected = [dataset[i] for i in sampled_indices]

randomly_selected_labels = []
for features, label in randomly_selected:
    print(label)
    randomly_selected_labels.append(label)

# Labels of the whole dataset, used as the "actual" distribution
actual_labels = [dataset[i][1] for i in range(len(dataset))]

tensor([4, 1], dtype=torch.int8)
tensor([3, 0], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([0, 1], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([3, 0], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([0, 1], dtype=torch.int8)
tensor([0, 0], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([1, 0], dtype=torch.int8)
tensor([4, 0], dtype=torch.int8)
tensor([0, 0], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([3, 0], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([3, 0], dtype=torch.int8)
tensor([4, 0], dtype=torch.int8)
tensor([3, 0], dtype=torch.int8)
tensor([3, 1], dtype=torch.int8)
tensor([0, 0], dtype=torch.int8)
tensor([4, 1], dtype=torch.int8)
tensor([4, 0], dtype=torch.int8)
tensor([4,

## Analyze results

In [None]:
list_to_analyze = selected_labels

races = ["Hispanic", "Black", "American Indian or Alaskan Native", "Asian", "White"]
genders = ["Female", "Male"]


def _tally(labels, names, position):
  """Count how often each entry of `names` is referenced at `position` of the label vectors.

  Codes outside `0 .. len(names) - 1` (for example the -1 used for unknown values) are
  counted as "Unknown" instead of wrapping around to the last name.
  """
  counts = {}
  for label in labels:
    code = int(label[position])
    key = names[code] if 0 <= code < len(names) else "Unknown"
    counts[key] = counts.get(key, 0) + 1
  return counts


def _print_distribution(title, labels):
  """Print the race and gender percentages of `labels` under `title`; return both count dicts."""
  race_counts = _tally(labels, races, 0)
  gender_counts = _tally(labels, genders, 1)
  total = len(labels)

  print(title)
  for heading, names, counts in (
      ("Race distribution:", races, race_counts),
      ("Gender distribution:", genders, gender_counts),
  ):
    print(heading)
    # "Unknown" is only listed when at least one label fell outside the vocabulary.
    shown = names + ["Unknown"] if "Unknown" in counts else names
    for name in shown:
      # An empty label list reports 0% rather than dividing by zero.
      percentage = (counts.get(name, 0) / total) * 100 if total else 0.0
      print(f"{name}: {percentage:.2f}%")

  return race_counts, gender_counts


# print percentages
race_dict, gender_dict = _print_distribution("Selected Distribution", list_to_analyze)

print()
random_race_dict, random_gender_dict = _print_distribution("Random Distribution", randomly_selected_labels)

print()
actual_race_dict, actual_gender_dict = _print_distribution("Actual Distribution", actual_labels)



Selected Distribution
Race distribution:
Hispanic: 8.00%
Black: 9.00%
American Indian or Alaskan Native: 2.00%
Asian: 49.00%
White: 32.00%
Gender distribution:
Female: 53.00%
Male: 47.00%

Random Distribution
Race distribution:
Hispanic: 11.00%
Black: 4.00%
American Indian or Alaskan Native: 1.00%
Asian: 44.00%
White: 40.00%
Gender distribution:
Female: 47.00%
Male: 53.00%

Actual Distribution
Race distribution:
Hispanic: 10.14%
Black: 4.73%
American Indian or Alaskan Native: 0.68%
Asian: 47.30%
White: 37.16%
Gender distribution:
Female: 50.00%
Male: 50.00%


# Final writeup
As you can see, the results are decent. The percentage of Black applicants roughly doubled relative to the actual distribution (4.73% → 9.00%), the percentage of White applicants decreased (37.16% → 32.00%), and the American Indian or Alaskan Native population, despite being wildly underrepresented, grew by almost 3x (0.68% → 2.00%). Unfortunately, the percentage of Hispanic applicants went down from the original distribution (10.14% → 8.00%), showing the algorithm isn't perfect but still has potential.