In [3]:
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from openai import AsyncOpenAI
from typing import Optional, List
import enum
from tqdm import tqdm
import numpy as np

# Load environment variables from a local .env file, if one is present
# (presumably where the OpenAI API key lives — confirm against your setup).
load_dotenv()

# Async OpenAI client patched by instructor; the generation cell below passes
# `response_model=` and `max_retries=` to it.
aclient = instructor.apatch(AsyncOpenAI())

In [4]:
import json


# Education levels a generated profile may report. Members also subclass `str`,
# and each member's value is the lowercase string on its right-hand side.
# NOTE(review): documented with `#` comments rather than a docstring because
# pydantic appears to use an enum's docstring as its JSON-schema description,
# which would change the schema derived from the profile models — confirm.
class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"


# Where a generated persona lives. Nested inside FakeProfile as `location`.
# NOTE(review): kept docstring-free on purpose — pydantic uses a model's class
# docstring as its JSON-schema description, so adding one would alter the
# schema derived from `response_model`.
class Location(BaseModel):
    city: str
    state_or_province: str
    country: str


# One synthetic persona. Documented with `#` comments rather than a class
# docstring: pydantic uses a model's class docstring as its JSON-schema
# description, and this model is nested in the schema used as `response_model`,
# so a docstring would change what the LLM is shown.
class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)  # may be omitted; defaults to None
    location: Location

    @classmethod
    def from_json(cls, data: str):
        """Build a profile from JSON text.

        Uses pydantic's own JSON validation instead of ``json.loads`` followed
        by keyword expansion, so malformed or non-object JSON is reported as a
        validation error rather than an unrelated ``TypeError``.

        :param data: JSON text encoding a single object with this model's fields.
        :return: The validated ``FakeProfile``.
        :raises pydantic.ValidationError: If ``data`` is not valid JSON or does
            not match the model.
        """
        return cls.model_validate_json(data)


# Container for a batch of personas; this is the model passed as
# `response_model` and the shape of each saved JSON file. Documented with `#`
# comments rather than a class docstring, because pydantic uses a model's class
# docstring as its JSON-schema description.
class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        """Build a batch of profiles from JSON text.

        Uses pydantic's own JSON validation instead of ``json.loads`` followed
        by keyword expansion, so malformed or non-object JSON is reported as a
        validation error rather than an unrelated ``TypeError``.

        :param data: JSON text encoding an object with a ``profiles`` list.
        :return: The validated ``FakeProfiles``.
        :raises pydantic.ValidationError: If ``data`` is not valid JSON or does
            not match the model.
        """
        return cls.model_validate_json(data)

In [5]:
# System prompt for profile generation. It asks for 20 varied personas per
# 20newsgroups category and tells the model not to over-tailor them to that
# category. The newsgroup name itself is sent separately as the user message.
SYSTEM_PROMPT_GENERATE = """
You are a helpful AI assistant that generates mock data for different profiles. Given a 20newsgroup category, generate 20 fake profiles for people who might be interested in news from that category.
For example:
Input: comp.sys.ibm.pc.hardware
Output: 20 fake profiles for people interested in computers and hardware where news in this category would be relevant to them.

Do not hyper tailor the profiles to the category. For example, if the category is about computers, do not generate profiles for people who are only interested in computers. Instead, generate profiles for people who are interested in computers and other things. The data should be realistic and varied.
"""

In [6]:
async def generate_fake_profile_by_news_group(news_group: str) -> FakeProfiles:
    """Ask the LLM for a batch of fake profiles for one newsgroup.

    The request uses ``response_model=FakeProfiles``, so the value returned is
    a ``FakeProfiles`` batch (the previous ``FakeProfile`` annotation was wrong).

    :param news_group: Newsgroup name, sent verbatim as the user message.
    :return: The parsed ``FakeProfiles`` response.
    """
    return await aclient.chat.completions.create(
        model="gpt-4-1106-preview",
        response_model=FakeProfiles,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT_GENERATE},
            {"role": "user", "content": news_group},
        ],
        # Passed through to the instructor-patched client.
        max_retries=3,
    )

In [7]:
async def generate_and_save_profiles(news_group: str):
    """Generate the profiles for ``news_group`` and write them to disk as JSON.

    The file is written under ``../fake_profiles/`` and named after the
    newsgroup with dots replaced by underscores. An existing file is
    overwritten.
    """
    generated = await generate_fake_profile_by_news_group(news_group)
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "w") as out_file:
        serialized = generated.model_dump_json()
        out_file.write(serialized)

In [8]:
# Newsgroup names used in this notebook. One profile file is generated per
# entry and loaded back under the same name; the names are also compared
# against the dataset's target names when building pairs.
NEWSGROUPS = [
    "alt.atheism",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "sci.med",
    "rec.sport.hockey",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
]

In [None]:
# Generate and save profiles one newsgroup at a time; each request is awaited
# before the next one starts. Re-running this cell calls the API again and
# overwrites the saved JSON files.
for news_group in NEWSGROUPS:
    await generate_and_save_profiles(news_group)

## Loading the data


In [9]:
import json


# Read each newsgroup's saved JSON file back into a FakeProfiles object,
# keyed by the newsgroup name.
profiles_data = {}
for news_group in NEWSGROUPS:
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "r") as profile_file:
        raw_json = profile_file.read()
    profiles_data[news_group] = FakeProfiles.from_json(raw_json)

In [10]:
# Display every loaded profile, keyed by newsgroup name (large output).
profiles_data

{'alt.atheism': FakeProfiles(profiles=[FakeProfile(name='Thomas Reed', occupation='Journalist', industry='Media', job_description='Writes articles on various social topics, including religion and secularism.', education=<EducationLevels.BACHELORS: 'bachelors'>, major='Journalism', location=Location(city='Austin', state_or_province='Texas', country='USA')), FakeProfile(name='Samantha Brooks', occupation='College Professor', industry='Education', job_description='Teaches courses on philosophy, including metaphysics and ethics.', education=<EducationLevels.PHD: 'phd'>, major='Philosophy', location=Location(city='Berkeley', state_or_province='California', country='USA')), FakeProfile(name='Marcus Li', occupation='Software Developer', industry='Technology', job_description='Develops mobile applications with a focus on social networking.', education=<EducationLevels.BACHELORS: 'bachelors'>, major='Computer Science', location=Location(city='Seattle', state_or_province='Washington', country='U

# Load the 20newsgroups Data


In [11]:
from sklearn.datasets import fetch_20newsgroups

# Training split of 20newsgroups with headers and footers removed, shuffled
# with a fixed random_state.
newsgroups_train = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers"),
    random_state=42,
    shuffle=True,
)

# Organize Profile and Articles into Pairs


Half of the positive cases for each profile will be taken, along with an equal number of negative cases. This prevents us from having an overwhelming number of samples to embed while ensuring an even class distribution.


In [12]:
# Lookup from integer target id to newsgroup name.
target_to_name = dict(enumerate(newsgroups_train.target_names))

In [13]:
import random

# Dedicated, seeded generator so the sampled pairs are identical on every run
# and do not depend on the state of the global `random` module.
PAIR_SAMPLING_SEED = 42
pair_rng = random.Random(PAIR_SAMPLING_SEED)

profile_article_pairs = []

# "<newsgroup>-<index>" -> FakeProfile
profile_map = {}
# article index -> raw article text. Built once up front instead of being
# re-assigned inside the per-profile loop.
article_map = dict(enumerate(newsgroups_train.data))

# Newsgroup name of every article, resolved once instead of once per profile.
article_groups = [target_to_name[target] for target in newsgroups_train.target]

for profile_group, profiles in profiles_data.items():
    positive_cases = []
    negative_cases = []
    for profile_idx, profile in enumerate(profiles.profiles):
        profile_key = f"{profile_group}-{profile_idx}"
        profile_map[profile_key] = profile

        for article_idx, article in article_map.items():
            pair = ((profile_key, profile), (article_idx, article))
            # Label 1 when the article comes from the profile's own newsgroup.
            if article_groups[article_idx] == profile_group:
                positive_cases.append((*pair, 1))
            else:
                negative_cases.append((*pair, 0))

    # Keep half of this group's positive pairs, then draw the same number of
    # negatives so the two classes stay balanced.
    positive_cases = pair_rng.sample(positive_cases, len(positive_cases) // 2)
    negative_cases = pair_rng.sample(negative_cases, len(positive_cases))
    profile_article_pairs.extend(positive_cases + negative_cases)

In [14]:
# Total number of (profile, article, label) pairs kept after balancing.
print(len(profile_article_pairs))

104274


# Embedding Data

The Universal AnglE Embedding (UAE) model is used for all embeddings.


In [15]:
from angle_emb import AnglE

# Load the pretrained UAE-Large-V1 checkpoint with CLS pooling.
# NOTE(review): `.cuda()` means this cell needs a CUDA-capable GPU — confirm
# for your environment.
angle_model = AnglE.from_pretrained(
    "WhereIsAI/UAE-Large-V1", pooling_strategy="cls"
).cuda()

  from .autonotebook import tqdm as notebook_tqdm


All embeddings are pickled afterwards so that this procedure does not have to be repeated.


In [16]:
import pickle

# Profile embeddings are cached on disk. Only a missing or unreadable cache
# triggers recomputation; anything else (including KeyboardInterrupt) now
# propagates instead of being swallowed by a bare `except:`.
try:
    # NOTE: pickle.load can execute arbitrary code. Only load cache files that
    # this notebook wrote.
    with open("../data/angle_profile_embeddings_map.pickle", "rb") as f:
        angle_profile_embeddings_map = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    angle_profile_embeddings_map = {
        i: angle_model.encode(
            # `.value` keeps the education text stable across Python versions:
            # from 3.11 on, formatting a (str, Enum) member in an f-string
            # gives "EducationLevels.X" instead of the member's value.
            f"{profile.occupation} {profile.industry} {profile.job_description} {profile.education.value} {profile.major} {profile.location.city} {profile.location.state_or_province} {profile.location.country}",
            to_numpy=True,
        )
        for i, profile in tqdm(profile_map.items())
    }

    with open("../data/angle_profile_embeddings_map.pickle", "wb") as f:
        pickle.dump(angle_profile_embeddings_map, f)

print(len(angle_profile_embeddings_map))

181


The articles are preprocessed before embedding.


In [17]:
import re


def preprocess(x: str) -> str:
    """Normalise raw article text before embedding.

    Steps, in order: turn newlines, tabs and carriage returns into spaces;
    drop any whitespace-delimited token containing ``@`` (plus one trailing
    whitespace character); delete everything except ASCII letters, digits and
    spaces; collapse runs of spaces; lowercase.

    :param x: Raw article text.
    :return: The cleaned, lowercased text. Leading or trailing single spaces
        are not stripped.
    """
    # Flatten line breaks and tabs into plain spaces
    flattened = x.translate(str.maketrans("\n\t\r", "   "))

    # Remove emails
    without_emails = re.sub(r"\S*@\S*\s?", "", flattened)

    # Remove special characters
    alphanumeric_only = re.sub(r"[^a-zA-Z0-9 ]", "", without_emails)

    # Remove extra spaces, then lowercase
    return re.sub(" +", " ", alphanumeric_only).lower()

In [18]:
# Article embeddings are cached on disk. Only a missing or unreadable cache
# triggers recomputation; anything else (including KeyboardInterrupt) now
# propagates instead of being swallowed by a bare `except:`.
try:
    # NOTE: pickle.load can execute arbitrary code. Only load cache files that
    # this notebook wrote.
    with open("../data/angle_article_embeddings_map.pickle", "rb") as f:
        angle_article_embeddings_map = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Every article in article_map is cleaned with preprocess() and embedded.
    angle_article_embeddings_map = {
        i: angle_model.encode(preprocess(article), to_numpy=True)
        for i, article in tqdm(article_map.items(), desc="Computing article embeddings")
    }

    with open("../data/angle_article_embeddings_map.pickle", "wb") as f:
        pickle.dump(angle_article_embeddings_map, f)

In [19]:
# Replace each pair's profile key and article index with the cached embedding
# vectors, keeping the label alongside.
angle_pair_embeddings = [
    (
        angle_profile_embeddings_map[profile_key],
        angle_article_embeddings_map[article_idx],
        label,
    )
    for (profile_key, _profile), (article_idx, _article), label in tqdm(
        profile_article_pairs
    )
]

100%|██████████| 104274/104274 [00:00<00:00, 623279.16it/s]


# Split Data

The data was split into a training and test set. The test set is 20% of the total size.


In [20]:
# Seeded, non-mutating shuffle. The fixed permutation makes the split
# reproducible, and leaving `angle_pair_embeddings` untouched means re-running
# this cell gives the same split instead of reshuffling shuffled data.
SPLIT_SEED = 42
split_order = np.random.default_rng(SPLIT_SEED).permutation(len(angle_pair_embeddings))
shuffled_pairs = [angle_pair_embeddings[k] for k in split_order]

# First 80% of the shuffled pairs for training, the remainder for testing.
# NOTE(review): the split is over pairs, so the same profile or article can
# appear in both sets — keep this in mind when reading the test metrics.
train_cutoff = int(len(shuffled_pairs) * 0.8)
train_pairs = shuffled_pairs[:train_cutoff]
test_pairs = shuffled_pairs[train_cutoff:]

print(f"Train size: {len(train_pairs)}")
print(f"Test size: {len(test_pairs)}")

Train size: 83419
Test size: 20855


In [21]:
def _pairs_to_arrays(pairs):
    """Turn (profile_vec, article_vec, label) triples into three NumPy arrays.

    The two embedding arrays are squeezed, which removes any length-1 axes.
    """
    profile_vecs, article_vecs, labels = zip(*pairs)
    return (
        np.array(profile_vecs).squeeze(),
        np.array(article_vecs).squeeze(),
        np.array(labels),
    )


train_profile_embeddings, train_article_embeddings, train_labels = _pairs_to_arrays(
    train_pairs
)
test_profile_embeddings, test_article_embeddings, test_labels = _pairs_to_arrays(
    test_pairs
)

# Shape report for both splits
print(f"Train profile embeddings shape: {train_profile_embeddings.shape}")
print(f"Train article embeddings shape: {train_article_embeddings.shape}")
print(f"Train labels shape: {train_labels.shape}")

print(f"Test profile embeddings shape: {test_profile_embeddings.shape}")
print(f"Test article embeddings shape: {test_article_embeddings.shape}")
print(f"Test labels shape: {test_labels.shape}")

Train profile embeddings shape: (83419, 1024)
Train article embeddings shape: (83419, 1024)
Train labels shape: (83419,)
Test profile embeddings shape: (20855, 1024)
Test article embeddings shape: (20855, 1024)
Test labels shape: (20855,)


# Cosine Similarity


In [22]:
def cosine_similarity(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Cosine similarity between matching rows of two 2-D arrays.

    :param x: Array whose rows are vectors.
    :param y: Array of the same shape as ``x``.
    :return: 1-D array with one similarity per row. A row whose norm is zero
        leads to a division by zero for that entry.
    """
    dot_products = (x * y).sum(axis=1)
    x_norms = np.linalg.norm(x, axis=1)
    y_norms = np.linalg.norm(y, axis=1)
    return dot_products / (x_norms * y_norms)

Compute the pairwise cosine similarity for the training set.


In [23]:
# One similarity per training pair, reshaped into a single-feature column so
# it can be passed to scikit-learn estimators.
train_similarity_values = cosine_similarity(
    train_profile_embeddings, train_article_embeddings
)
train_cosine_similarities = train_similarity_values.reshape(-1, 1)

print(f"Train cosine similarities shape: {train_cosine_similarities.shape}")

Train cosine similarities shape: (83419, 1)


## Perceptron

We fit a perceptron to the similarity data. This will highlight whether the data is linearly separable.


In [24]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

perceptron = Perceptron()

perceptron.fit(train_cosine_similarities, train_labels)

# Predict once and reuse the result for every metric, instead of re-running
# the model over the whole training set for each print.
train_perceptron_pred = perceptron.predict(train_cosine_similarities)

# accuracy_score on these predictions is the same quantity as perceptron.score().
print(f"Perceptron score: {accuracy_score(train_labels, train_perceptron_pred)}")
print(f"F1 score: {f1_score(train_labels, train_perceptron_pred)}")
print(f"Precision score: {precision_score(train_labels, train_perceptron_pred)}")
print(f"Recall score: {recall_score(train_labels, train_perceptron_pred)}")

print(classification_report(train_labels, train_perceptron_pred))

Perceptron score: 0.518251237727616
F1 score: 0.6731834261781807
Precision score: 0.5098423256959842
Recall score: 0.9905229149216226
              precision    recall  f1-score   support

           0       0.82      0.04      0.08     41634
           1       0.51      0.99      0.67     41785

    accuracy                           0.52     83419
   macro avg       0.67      0.52      0.38     83419
weighted avg       0.67      0.52      0.38     83419



Evaluate test results.


In [26]:
# Single-feature column of similarities for the held-out pairs.
test_cosine_similarities = cosine_similarity(
    test_profile_embeddings, test_article_embeddings
).reshape(-1, 1)

print(f"Test cosine similarities shape: {test_cosine_similarities.shape}")

# Predict once and reuse the result for every metric, instead of re-running
# the model over the whole test set for each print.
test_perceptron_pred = perceptron.predict(test_cosine_similarities)

# accuracy_score on these predictions is the same quantity as perceptron.score().
print(f"Perceptron score: {accuracy_score(test_labels, test_perceptron_pred)}")
print(f"F1 score: {f1_score(test_labels, test_perceptron_pred)}")
print(f"Precision score: {precision_score(test_labels, test_perceptron_pred)}")
print(f"Recall score: {recall_score(test_labels, test_perceptron_pred)}")

print(classification_report(test_labels, test_perceptron_pred))

Test cosine similarities shape: (20855, 1)
Perceptron score: 0.5134500119875329
F1 score: 0.6690584129676135
Precision score: 0.5050470234871239
Recall score: 0.9908230293663061
              precision    recall  f1-score   support

           0       0.83      0.04      0.08     10503
           1       0.51      0.99      0.67     10352

    accuracy                           0.51     20855
   macro avg       0.67      0.52      0.38     20855
weighted avg       0.67      0.51      0.37     20855



## Threshold Search

This strategy aims to find a threshold cosine similarity to define the boundary between positive and negative cases. The search is designed to find the threshold that maximizes the F1-Score.


In [30]:
from sklearn.metrics import f1_score, accuracy_score


def bisection_search(
    cosine_similarities, binary_labels, precision=0.001, low=0.0, high=1.0
):
    """
    Bisection-style search for a cosine-similarity threshold with a high F1 score.

    The interval ``[low, high]`` is halved until it is narrower than
    ``precision``. At each step the midpoint's F1 score is compared with the
    scores at both ends: if the midpoint beats both, the lower bound moves up,
    otherwise the upper bound moves down. The returned value is the best
    midpoint that was probed. F1 is not guaranteed to have a single peak in
    the threshold, so this is a heuristic and not a guaranteed global optimum.

    :param cosine_similarities: Array-like of cosine similarity values; any
        shape, flattened internally (a column vector is accepted).
    :param binary_labels: Array-like of corresponding binary labels (0 or 1).
    :param precision: Stop once ``high - low`` is no larger than this.
    :param low: Lower bound of the search interval (default 0.0).
    :param high: Upper bound of the search interval (default 1.0).
    :return: The probed threshold with the highest F1 score, or ``low`` if no
        probed threshold scored above 0.
    :raises ValueError: If either input is empty or their sizes differ.
    """
    # Flatten both inputs so predictions and labels are 1-D arrays of equal length.
    scores = np.asarray(cosine_similarities).ravel()
    labels = np.asarray(binary_labels).ravel()

    if scores.size == 0 or labels.size == 0 or scores.size != labels.size:
        raise ValueError(
            "Invalid input: Cosine similarities and binary labels must be non-empty and of equal length."
        )

    def calculate_f1_score(threshold):
        return f1_score(labels, scores >= threshold)

    best_threshold = low
    best_f1 = 0

    # Endpoint scores are carried along with the bounds. Only one bound moves
    # per iteration and it always moves to `mid`, whose score is already known.
    f1_low = calculate_f1_score(low)
    f1_high = calculate_f1_score(high)

    while high - low > precision:
        mid = (low + high) / 2
        f1_mid = calculate_f1_score(mid)

        # Check if this threshold gives a better F1 score
        if f1_mid > best_f1:
            best_f1 = f1_mid
            best_threshold = mid

        # Adjust the bounds based on the F1 score
        if f1_low < f1_mid > f1_high:
            low, f1_low = mid, f1_mid
        else:
            high, f1_high = mid, f1_mid

    return best_threshold

In [31]:
# Tune the similarity threshold on the training pairs only.
threshold = bisection_search(train_cosine_similarities, train_labels)

print(f"Threshold: {threshold}")

Threshold: 0.375


In [32]:
from sklearn.metrics import classification_report

# A pair is predicted positive when its similarity reaches the tuned threshold.
train_pred = train_cosine_similarities >= threshold
# train_pred has one column; squeeze it so it lines up with the 1-D labels.
train_accuracy = np.mean(train_pred.squeeze() == train_labels)

print(f"Train accuracy: {train_accuracy}")
print(classification_report(train_labels, train_pred))

Train accuracy: 0.7196921564631559
              precision    recall  f1-score   support

           0       0.70      0.76      0.73     41634
           1       0.74      0.68      0.71     41785

    accuracy                           0.72     83419
   macro avg       0.72      0.72      0.72     83419
weighted avg       0.72      0.72      0.72     83419



In [33]:
# Smallest similarity seen in the training pairs.
print(train_cosine_similarities.min())

0.17302714


In [34]:
# Recompute the test similarities as a flat 1-D array (no reshape here), so
# the predictions compare element-wise with the 1-D labels.
test_cosine_similarities = cosine_similarity(
    test_profile_embeddings, test_article_embeddings
)

# Apply the threshold that was tuned on the training pairs.
test_pred = test_cosine_similarities >= threshold

test_accuracy = np.mean(test_pred == test_labels)

print(f"Test accuracy: {test_accuracy}")
print(f"Test F1 score: {f1_score(test_labels, test_pred)}")
print(f"Test precision score: {precision_score(test_labels, test_pred)}")
print(f"Test recall score: {recall_score(test_labels, test_pred)}")

print(classification_report(test_labels, test_pred))

Test accuracy: 0.7196835291297051
Test F1 score: 0.7084580091761421
Test precision score: 0.7322680412371134
Test recall score: 0.6861476043276662
              precision    recall  f1-score   support

           0       0.71      0.75      0.73     10503
           1       0.73      0.69      0.71     10352

    accuracy                           0.72     20855
   macro avg       0.72      0.72      0.72     20855
weighted avg       0.72      0.72      0.72     20855



# Neural Network

In this section, we try to use the embedding vectors as inputs to a neural network. The NN is a standard feed forward network with dropout.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


class MLPClassifier(nn.Module):
    """Feed-forward binary classifier: input -> 512 -> 256 -> 128 -> 64 -> 1.

    Every hidden layer uses ReLU; the first three are followed by dropout
    (p=0.5). The output passes through a sigmoid, giving one value in (0, 1)
    per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Attribute names and creation order are kept as-is so parameter
        # names stay fc1..fc5.
        self.fc1 = nn.Linear(input_size, 512)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(256, 128)
        self.dropout3 = nn.Dropout(p=0.5)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)  # Output layer for binary classification

    def forward(self, x):
        # Three linear + ReLU + dropout stages
        regularised_stages = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
        )
        for linear, dropout in regularised_stages:
            x = dropout(torch.relu(linear(x)))

        # Last hidden layer has no dropout
        hidden = torch.relu(self.fc4(x))
        return torch.sigmoid(self.fc5(hidden))  # Sigmoid activation for binary output


# Model input width: a profile embedding and an article embedding side by side
# (the tensors built later concatenate them along axis 1).
input_size = train_profile_embeddings.shape[1] + train_article_embeddings.shape[1]

model = MLPClassifier(input_size)

# Define loss function and optimizer
# BCELoss is applied to the model's sigmoid outputs.
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def evaluate_model(model, x, y_true):
    """Score a binary classifier on one dataset.

    Puts ``model`` in evaluation mode (it is left in that mode), runs it on
    ``x`` without gradient tracking, and rounds the outputs to 0/1.

    :param model: Module whose output is one probability per input row.
    :param x: Input tensor.
    :param y_true: Tensor of true 0/1 labels.
    :return: Tuple ``(accuracy, f1, precision, recall)``.
    """
    model.eval()
    with torch.no_grad():
        probabilities = model(x).squeeze()

    # Round probabilities to hard class predictions, then hand NumPy arrays to
    # the sklearn metric functions.
    predicted = torch.round(probabilities).numpy()
    actual = y_true.numpy()

    return (
        accuracy_score(actual, predicted),
        f1_score(actual, predicted),
        precision_score(actual, predicted),
        recall_score(actual, predicted),
    )

In [None]:
# Convert numpy arrays to torch tensors. Each input row is a profile embedding
# concatenated with its article embedding.
x_train = torch.from_numpy(
    np.concatenate([train_profile_embeddings, train_article_embeddings], axis=1)
).float()
y_train = torch.from_numpy(train_labels).float()


x_test = torch.from_numpy(
    np.concatenate([test_profile_embeddings, test_article_embeddings], axis=1)
).float()
y_test = torch.from_numpy(test_labels).float()


# Reuse a previously saved model when one can be loaded. Only a missing or
# unloadable file triggers training; other errors (including KeyboardInterrupt)
# now propagate instead of being swallowed by a bare `except:`.
try:
    # NOTE: pickle.load can execute arbitrary code. Only load model files that
    # this notebook wrote.
    with open("../models/mlp_model.pickle", "rb") as f:
        model = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError, AttributeError):
    # Full-batch training: every epoch is one optimizer step over all of x_train.
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        optimizer.zero_grad()
        outputs = model(x_train)
        loss = criterion(outputs.squeeze(), y_train)
        loss.backward()
        optimizer.step()

        # Report the training loss every 100 epochs
        if epoch % 100 == 0:
            print(
                f"Epoch [{epoch+1}/{num_epochs}], "
                f"Train Loss: {loss.item():.4f}, "
            )

In [None]:
# Save the model to the same path the training cell tries to load from, so a
# later run can skip training.
# NOTE(review): this pickles the whole module object, so loading it back
# presumably needs the same MLPClassifier class definition — confirm.

import pickle

with open("../models/mlp_model.pickle", "wb") as f:
    pickle.dump(model, f)

In [None]:
# Final metrics for the finished (trained or loaded) model
model.eval()  # evaluation mode

# Training split
train_metrics = evaluate_model(model, x_train, y_train)
train_acc, train_f1, train_precision, train_recall = train_metrics
train_summary = (
    f"Final Training Metrics: Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}, "
    f"Precision: {train_precision:.4f}, Recall: {train_recall:.4f}"
)
print(train_summary)

# Held-out test split
test_metrics = evaluate_model(model, x_test, y_test)
test_acc, test_f1, test_precision, test_recall = test_metrics
test_summary = (
    f"Final Testing Metrics: Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}, "
    f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}"
)
print(test_summary)