In [1]:
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from openai import AsyncOpenAI
from typing import Optional, List
import enum
from tqdm import tqdm
import numpy as np

load_dotenv()

aclient = instructor.apatch(AsyncOpenAI())

In [2]:
import json


class EducationLevels(str, enum.Enum):
    HIGH_SCHOOL = "high_school"
    BACHELORS = "bachelors"
    MASTERS = "masters"
    PHD = "phd"
    NONE = "none"


class Location(BaseModel):
    city: str
    state_or_province: str
    country: str


class FakeProfile(BaseModel):
    name: str
    occupation: str
    industry: str
    job_description: str
    education: EducationLevels
    major: Optional[str] = Field(default=None)
    location: Location

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))


class FakeProfiles(BaseModel):
    profiles: List[FakeProfile]

    @classmethod
    def from_json(cls, data: str):
        return cls(**json.loads(data))

In [3]:
SYSTEM_PROMPT_GENERATE = """
You are a helpful AI assistant that generates mock data for different profiles. Given a 20newsgroup category, generate 20 fake profiles for people who might be interested in news from that category.
For example:
Input: comp.sys.ibm.pc.hardware
Output: 20 fake profiles for people interested in computers and hardware where news in this category would be relevant to them.

Do not hyper tailor the profiles to the category. For example, if the category is about computers, do not generate profiles for people who are only interested in computers. Instead, generate profiles for people who are interested in computers and other things. The data should be realistic and varied.
"""

In [4]:
async def generate_fake_profile_by_news_group(news_group: str) -> FakeProfile:
    profile = await aclient.chat.completions.create(
        model="gpt-4-1106-preview",
        response_model=FakeProfiles,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT_GENERATE},
            {"role": "user", "content": news_group},
        ],
        max_retries=3,
    )

    return profile

In [5]:
async def generate_and_save_profiles(news_group: str):
    profile = await generate_fake_profile_by_news_group(news_group)
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "w") as f:
        f.write(profile.model_dump_json())

In [6]:
NEWSGROUPS = [
    "alt.atheism",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "sci.med",
    "rec.sport.hockey",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
]

In [None]:
for news_group in NEWSGROUPS:
    await generate_and_save_profiles(news_group)

## Loading the data


In [7]:
import json


# Load the profiles from the json files
profiles_data = {}
for news_group in NEWSGROUPS:
    with open(f"../fake_profiles/{news_group.replace('.', '_')}.json", "r") as f:
        profiles_data[news_group] = FakeProfiles.from_json(f.read())

In [8]:
profiles_data

{'alt.atheism': FakeProfiles(profiles=[FakeProfile(name='Thomas Reed', occupation='Journalist', industry='Media', job_description='Writes articles on various social topics, including religion and secularism.', education=<EducationLevels.BACHELORS: 'bachelors'>, major='Journalism', location=Location(city='Austin', state_or_province='Texas', country='USA')), FakeProfile(name='Samantha Brooks', occupation='College Professor', industry='Education', job_description='Teaches courses on philosophy, including metaphysics and ethics.', education=<EducationLevels.PHD: 'phd'>, major='Philosophy', location=Location(city='Berkeley', state_or_province='California', country='USA')), FakeProfile(name='Marcus Li', occupation='Software Developer', industry='Technology', job_description='Develops mobile applications with a focus on social networking.', education=<EducationLevels.BACHELORS: 'bachelors'>, major='Computer Science', location=Location(city='Seattle', state_or_province='Washington', country='U

In [9]:
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers"),
    random_state=42,
    shuffle=True,
)

In [10]:
target_to_name = {i: name for i, name in enumerate(newsgroups_train.target_names)}

In [82]:
import random

profile_article_pairs = []

profile_map = {}
article_map = {}

for profile_group, profiles in profiles_data.items():
    positive_cases = []
    negative_cases = []
    for profile_idx, profile in enumerate(profiles.profiles):
        profile_map[f"{profile_group}-{profile_idx}"] = profile

        for article_idx, article in enumerate(newsgroups_train.data):
            article_map[article_idx] = article

            if profile_group == target_to_name[newsgroups_train.target[article_idx]]:
                positive_cases.append(
                    (
                        (f"{profile_group}-{profile_idx}", profile),
                        (article_idx, article),
                        1,
                    )
                )
            else:
                negative_cases.append(
                    (
                        (f"{profile_group}-{profile_idx}", profile),
                        (article_idx, article),
                        0,
                    )
                )
    # Randomly sample equal amount of negative cases
    positive_cases = random.sample(positive_cases, len(positive_cases) // 2)
    negative_cases = random.sample(negative_cases, len(positive_cases))
    profile_article_pairs.extend(positive_cases + negative_cases)

In [64]:
print(len(profile_article_pairs))

104274


In [13]:
from sentence_transformers import SentenceTransformer

all_minilm_model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from angle_emb import AnglE

angle_model = AnglE.from_pretrained(
    "WhereIsAI/UAE-Large-V1", pooling_strategy="cls"
).cuda()

In [83]:
import pickle

angle_profile_embeddings_map = {}

# Check if we have already computed the embeddings
try:
    with open("../data/angle_profile_embeddings_map.pickle", "rb") as f:
        angle_profile_embeddings_map = pickle.load(f)
except:
    angle_profile_embeddings_map = {
        i: angle_model.encode(
            f"{profile.occupation} {profile.industry} {profile.job_description} {profile.education} {profile.major} {profile.location.city} {profile.location.state_or_province} {profile.location.country}",
            to_numpy=True,
        )
        for i, profile in tqdm(profile_map.items())
    }

    with open("../data/angle_profile_embeddings_map.pickle", "wb") as f:
        pickle.dump(angle_profile_embeddings_map, f)

print(len(angle_profile_embeddings_map))

100%|██████████| 181/181 [00:20<00:00,  9.01it/s]

181





In [84]:
import re


def preprocess(x: str) -> str:
    x = x.replace("\n", " ").replace("\t", " ").replace("\r", " ")

    # Remove emails
    x = re.sub(r"\S*@\S*\s?", "", x)

    # Remove special characters
    x = re.sub(r"[^a-zA-Z0-9 ]", "", x)

    # Remove extra spaces
    x = re.sub(" +", " ", x)

    return x.lower()

In [85]:
angle_article_embeddings_map = {}

# Check if we have already computed the embeddings
try:
    with open("../data/angle_article_embeddings_map.pickle", "rb") as f:
        angle_article_embeddings_map = pickle.load(f)
except:
    angle_article_embeddings_map = {
        i: angle_model.encode(preprocess(article), to_numpy=True)
        for i, article in tqdm(article_map.items(), desc="Computing article embeddings")
    }

    with open("../data/angle_article_embeddings_map.pickle", "wb") as f:
        pickle.dump(angle_article_embeddings_map, f)

In [86]:
angle_pair_embeddings = [
    (angle_profile_embeddings_map[i], angle_article_embeddings_map[j], label)
    for (i, _), (j, _), label in tqdm(profile_article_pairs)
]

100%|██████████| 104274/104274 [00:00<00:00, 807019.32it/s]


In [87]:
np.random.shuffle(angle_pair_embeddings)
train_cutoff = int(len(angle_pair_embeddings) * 0.8)
train_pairs = angle_pair_embeddings[:train_cutoff]
test_pairs = angle_pair_embeddings[train_cutoff:]

print(f"Train size: {len(train_pairs)}")
print(f"Test size: {len(test_pairs)}")

Train size: 83419
Test size: 20855


In [88]:
train_profile_embeddings, train_article_embeddings, train_labels = zip(*train_pairs)
train_profile_embeddings, train_article_embeddings, train_labels = (
    np.array(train_profile_embeddings).squeeze(),
    np.array(train_article_embeddings).squeeze(),
    np.array(train_labels),
)

test_profile_embeddings, test_article_embeddings, test_labels = zip(*test_pairs)
test_profile_embeddings, test_article_embeddings, test_labels = (
    np.array(test_profile_embeddings).squeeze(),
    np.array(test_article_embeddings).squeeze(),
    np.array(test_labels),
)

print(f"Train profile embeddings shape: {train_profile_embeddings.shape}")
print(f"Train article embeddings shape: {train_article_embeddings.shape}")
print(f"Train labels shape: {train_labels.shape}")

print(f"Test profile embeddings shape: {test_profile_embeddings.shape}")
print(f"Test article embeddings shape: {test_article_embeddings.shape}")
print(f"Test labels shape: {test_labels.shape}")

Train profile embeddings shape: (83419, 1024)
Train article embeddings shape: (83419, 1024)
Train labels shape: (83419,)
Test profile embeddings shape: (20855, 1024)
Test article embeddings shape: (20855, 1024)
Test labels shape: (20855,)


In [89]:
def cosine_similarity(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # Row wise cosine similarity
    return np.sum(x * y, axis=1) / (
        np.linalg.norm(x, axis=1) * np.linalg.norm(y, axis=1)
    )

In [90]:
train_cosine_similarities = cosine_similarity(
    train_profile_embeddings, train_article_embeddings
).reshape(-1, 1)

print(f"Train cosine similarities shape: {train_cosine_similarities.shape}")

Train cosine similarities shape: (83419, 1)


In [106]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

perceptron = Perceptron()

perceptron.fit(train_cosine_similarities, train_labels)

print(f"Perceptron score: {perceptron.score(train_cosine_similarities, train_labels)}")
print(
    f"F1 score: {f1_score(train_labels, perceptron.predict(train_cosine_similarities))}"
)
print(
    f"Precision score: {precision_score(train_labels, perceptron.predict(train_cosine_similarities))}"
)
print(
    f"Recall score: {recall_score(train_labels, perceptron.predict(train_cosine_similarities))}"
)

print(
    classification_report(train_labels, perceptron.predict(train_cosine_similarities))
)

Perceptron score: 0.5133482779702466
F1 score: 0.6717233794798809
Precision score: 0.5073846492138921
Recall score: 0.9935175218275326
              precision    recall  f1-score   support

           0       0.83      0.03      0.06     41614
           1       0.51      0.99      0.67     41805

    accuracy                           0.51     83419
   macro avg       0.67      0.51      0.37     83419
weighted avg       0.67      0.51      0.37     83419



In [72]:
test_cosine_similarities = cosine_similarity(
    test_profile_embeddings, test_article_embeddings
).reshape(-1, 1)

print(f"Test cosine similarities shape: {test_cosine_similarities.shape}")

print(f"Perceptron score: {perceptron.score(test_cosine_similarities, test_labels)}")

print(classification_report(test_labels, perceptron.predict(test_cosine_similarities)))

Test cosine similarities shape: (20855, 1)
Perceptron score: 0.4980100695276912
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     10469
           1       0.50      1.00      0.66     10386

    accuracy                           0.50     20855
   macro avg       0.25      0.50      0.33     20855
weighted avg       0.25      0.50      0.33     20855



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
from sklearn.metrics import f1_score, accuracy_score


def bisection_search(cosine_similarities, binary_labels, precision=0.001):
    """
    Bisection search to find the optimal threshold for cosine similarity to maximize F1 score.

    :param cosine_similarities: NumPy array of cosine similarity values.
    :param binary_labels: NumPy array of corresponding binary labels (0 or 1).
    :param precision: The precision for the threshold value.
    :return: Optimal threshold for cosine similarity.
    """
    if (
        cosine_similarities.size == 0
        or binary_labels.size == 0
        or cosine_similarities.size != binary_labels.size
    ):
        raise ValueError(
            "Invalid input: Cosine similarities and binary labels must be non-empty and of equal length."
        )

    def calculate_f1_score(threshold):
        predictions = cosine_similarities >= threshold
        return f1_score(binary_labels, predictions)

    # Initialize low and high bounds for the threshold
    low, high = 0, 1

    best_threshold = low
    best_f1 = 0

    while high - low > precision:
        mid = (low + high) / 2
        f1_mid = calculate_f1_score(mid)

        # Check if this threshold gives a better F1 score
        if f1_mid > best_f1:
            best_f1 = f1_mid
            best_threshold = mid

        # Adjust the bounds based on the F1 score
        f1_low = calculate_f1_score(low)
        f1_high = calculate_f1_score(high)

        if f1_low < f1_mid > f1_high:
            low = mid
        else:
            high = mid

    return best_threshold

In [93]:
threshold = bisection_search(train_cosine_similarities, train_labels)

print(f"Threshold: {threshold}")

Threshold: 0.375


In [105]:
from sklearn.metrics import classification_report

train_pred = train_cosine_similarities >= threshold
train_accuracy = np.sum(train_pred.squeeze() == train_labels) / len(train_labels)

print(f"Train accuracy: {train_accuracy}")
print(classification_report(train_labels, train_pred))

(83419,)
[0 1 0 ... 0 0 0]
()
Train accuracy: 0.7201836512065597
              precision    recall  f1-score   support

           0       0.70      0.76      0.73     41614
           1       0.74      0.68      0.71     41805

    accuracy                           0.72     83419
   macro avg       0.72      0.72      0.72     83419
weighted avg       0.72      0.72      0.72     83419



In [95]:
print(np.min(train_cosine_similarities))

0.16141427


In [107]:
test_cosine_similarities = cosine_similarity(
    test_profile_embeddings, test_article_embeddings
)

test_pred = test_cosine_similarities >= threshold

test_accuracy = np.sum(test_pred == test_labels) / len(test_labels)

print(f"Test accuracy: {test_accuracy}")
print(f"Test F1 score: {f1_score(test_labels, test_pred)}")
print(f"Test precision score: {precision_score(test_labels, test_pred)}")
print(f"Test recall score: {recall_score(test_labels, test_pred)}")

print(classification_report(test_labels, test_pred))

Test accuracy: 0.7223207863821626
Test F1 score: 0.7093017418804276
Test precision score: 0.7367817290645531
Test recall score: 0.6837979094076655
              precision    recall  f1-score   support

           0       0.71      0.76      0.73     10523
           1       0.74      0.68      0.71     10332

    accuracy                           0.72     20855
   macro avg       0.72      0.72      0.72     20855
weighted avg       0.72      0.72      0.72     20855



In [73]:
from sklearn.neural_network import MLPClassifier

x_train = np.concatenate([train_profile_embeddings, train_article_embeddings], axis=1)

x_test = np.concatenate([test_profile_embeddings, test_article_embeddings], axis=1)

mlp = MLPClassifier(
    hidden_layer_sizes=(512, 256, 128, 64),
    activation="relu",
    solver="adam",
    max_iter=1000,
    random_state=42,
    verbose=True,
)

mlp.fit(x_train, train_labels)

Iteration 1, loss = 0.54953653
Iteration 2, loss = 0.50138766
Iteration 3, loss = 0.48386085
Iteration 4, loss = 0.47317050
Iteration 5, loss = 0.46753558
Iteration 6, loss = 0.46074389
Iteration 7, loss = 0.45868545
Iteration 8, loss = 0.45619042
Iteration 9, loss = 0.45315943
Iteration 10, loss = 0.45012989
Iteration 11, loss = 0.44854499
Iteration 12, loss = 0.44617832
Iteration 13, loss = 0.44422409
Iteration 14, loss = 0.44099284
Iteration 15, loss = 0.44041565
Iteration 16, loss = 0.43744987
Iteration 17, loss = 0.43475710
Iteration 18, loss = 0.43187101
Iteration 19, loss = 0.42934996
Iteration 20, loss = 0.42554775
Iteration 21, loss = 0.42161196
Iteration 22, loss = 0.41825694
Iteration 23, loss = 0.41374906
Iteration 24, loss = 0.40864292
Iteration 25, loss = 0.40545607
Iteration 26, loss = 0.39984338
Iteration 27, loss = 0.39496463
Iteration 28, loss = 0.39021939
Iteration 29, loss = 0.38423702
Iteration 30, loss = 0.37901364
Iteration 31, loss = 0.37438137
Iteration 32, los

In [74]:
print(f"Train accuracy: {mlp.score(x_train, train_labels)}")

print(f"Test accuracy: {mlp.score(x_test, test_labels)}")

Train accuracy: 0.9104640429638332
Test accuracy: 0.6464157276432511


In [75]:
print(classification_report(test_labels, mlp.predict(x_test)))

              precision    recall  f1-score   support

           0       0.64      0.68      0.66     10469
           1       0.65      0.62      0.63     10386

    accuracy                           0.65     20855
   macro avg       0.65      0.65      0.65     20855
weighted avg       0.65      0.65      0.65     20855



In [97]:
import torch
import torch.nn as nn
import torch.optim as optim


class MLPClassifier(nn.Module):
    def __init__(self, input_size):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, 128)
        self.dropout3 = nn.Dropout(0.5)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)  # Output layer for binary classification

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc3(x))
        x = self.dropout3(x)
        x = torch.relu(self.fc4(x))
        return torch.sigmoid(self.fc5(x))  # Sigmoid activation for binary output


# Example Usage
input_size = train_profile_embeddings.shape[1] + train_article_embeddings.shape[1]

model = MLPClassifier(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [98]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def evaluate_model(model, x, y_true):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        y_pred = model(x).squeeze()
        y_pred = torch.round(
            y_pred
        )  # Round probabilities to get binary class predictions

    # Convert to numpy arrays for sklearn metrics
    y_true = y_true.numpy()
    y_pred = y_pred.numpy()

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    return accuracy, f1, precision, recall

In [99]:
# Convert numpy arrays to torch tensors
x_train = torch.from_numpy(
    np.concatenate([train_profile_embeddings, train_article_embeddings], axis=1)
).float()
y_train = torch.from_numpy(train_labels).float()


x_test = torch.from_numpy(
    np.concatenate([test_profile_embeddings, test_article_embeddings], axis=1)
).float()
y_test = torch.from_numpy(test_labels).float()

# Training and evaluation loop
num_epochs = 1000
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    optimizer.zero_grad()
    outputs = model(x_train)
    loss = criterion(outputs.squeeze(), y_train)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        # train_acc, train_f1, train_precision, train_recall = evaluate_model(
        #     model, x_train, y_train
        # )
        # test_acc, test_f1, test_precision, test_recall = evaluate_model(
        #     model, x_test, y_test
        # )

        print(
            f"Epoch [{epoch+1}/{num_epochs}], "
            f"Train Loss: {loss.item():.4f}, "
            # f"Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, "
            # f"Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}"
        )

Epoch [1/1000], Train Loss: 0.6935, 
Epoch [101/1000], Train Loss: 0.0181, 
Epoch [201/1000], Train Loss: 0.0041, 
Epoch [301/1000], Train Loss: 0.0040, 
Epoch [401/1000], Train Loss: 0.0027, 
Epoch [501/1000], Train Loss: 0.0024, 
Epoch [601/1000], Train Loss: 0.0025, 
Epoch [701/1000], Train Loss: 0.0019, 
Epoch [801/1000], Train Loss: 0.0025, 
Epoch [901/1000], Train Loss: 0.0016, 


In [101]:
# pickle model

import pickle

with open("../models/mlp_model.pickle", "wb") as f:
    pickle.dump(model, f)

In [100]:
# Final evaluation after training is complete
model.eval()  # Set the model to evaluation mode

# Evaluate on training data
train_acc, train_f1, train_precision, train_recall = evaluate_model(
    model, x_train, y_train
)
print(
    f"Final Training Metrics: Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}, "
    f"Precision: {train_precision:.4f}, Recall: {train_recall:.4f}"
)

# Evaluate on test data
test_acc, test_f1, test_precision, test_recall = evaluate_model(model, x_test, y_test)
print(
    f"Final Testing Metrics: Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}, "
    f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}"
)

Final Training Metrics: Accuracy: 0.9997, F1 Score: 0.9997, Precision: 1.0000, Recall: 0.9995
Final Testing Metrics: Accuracy: 0.9942, F1 Score: 0.9942, Precision: 0.9893, Recall: 0.9991
