In [1]:
from tqdm import tqdm

In [2]:
# suppress sklearn warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
from functools import cache

In [4]:
import polars as pl

In [5]:
import pandas as pd

In [6]:
import torch

In [7]:
import torch.nn.functional as F
from torch import nn

In [8]:
from sentence_transformers import SentenceTransformer


In [9]:
import requests

In [10]:
import numpy as np

In [11]:
import psycopg

In [12]:
# from dotenv import load_dotenv,find_dotenv
# import os
# # use the find_dotenv function to locate the file
# load_dotenv(find_dotenv())

# db_string = os.getenv("DB_CONN") or "postgresql://postgres:password@localhost:5432/rec_llm"
# # define a psycopg3 connection to postgres
# conn = psycopg.connect(db_string)

In [13]:
device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [14]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.conv_stack_post = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=8, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(0.2)
        )

        self.conv_stack_prompt = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=8, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(0.2)
        )

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(1027, 1024, bias=False),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 1024, bias=False),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 512, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 10, bias=False),
            nn.BatchNorm1d(10),
            nn.ReLU(),
            nn.Linear(10, 1)
        )


    def forward(self, x):
        # take the first 2048 values for the embeddings
        prompt_embedding = x[:, :1024]
        post_embedding = x[:, 1024:2048]
        features = x[:, 2048:]
        # take the rest of the values for the features
        prompt_embedding = prompt_embedding.unsqueeze(1)
        post_embedding = post_embedding.unsqueeze(1)

        prompt_embedding = self.conv_stack_prompt(prompt_embedding)
        post_embedding = self.conv_stack_post(post_embedding)

        prompt_embedding = self.flatten(prompt_embedding)
        post_embedding = self.flatten(post_embedding)

        # embeddings = self.flatten(embeddings)
        x = torch.cat([prompt_embedding, post_embedding, features], dim=1)
        # print(x.shape)
        # x = x.view(x.size(0), -1)
        logits = self.linear_relu_stack(x)
        return logits

def init_weights(m):
    if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)

def get_model():
    model = NeuralNetwork().to(device)
    model.apply(init_weights)
    return model


In [60]:
embedding_model = SentenceTransformer('intfloat/e5-large-v2')

CATEGORIES = [
    'autos',
 'entertainment',
 'finance',
 'foodanddrink',
 'health',
 'kids',
 'lifestyle',
 'middleeast',
 'movies',
 'music',
 'news',
 'northamerica',
 'sports',
 'travel',
 'tv',
 'video',
 'weather']
CATEGORIES.sort()

def load_model(path):
    model =  get_model()
    if device == 'cpu':
        model.load_state_dict(torch.load(path, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(path))
    return model


def infer(processed_tensor: torch.Tensor) -> float | list[float]:
    with torch.no_grad():
        logits = MODEL(processed_tensor)
        # recursively flatten logits and return an float or list of floats
        logits = logits.cpu().numpy().flatten().tolist()
        return logits

REQUIRED_FIELDS = [
    # 'title',
    # 'abstract',
    'prompt',
    'prompt_type',
    'post_category',
    'prompt_category'
]

# @cache
# def get_embedding(text: str | list[str]):
#     return embedding_model.encode(text)

def process_input(**kwargs):
    for field in REQUIRED_FIELDS:
        if field not in kwargs:
            raise ValueError(f"Missing required field: {field}")

    # try:
    #     post = kwargs['title'] + kwargs['abstract']
    # except:
    #     post = kwargs['title']
    prompt = kwargs['prompt_embedding']
    post = kwargs['post_embedding']

    embedded_prompt = None
    embedded_post = None
    # if kwargs['prompt_embedding'] == 'False':
    #     embedded_prompt = get_embedding(prompt)

    # if kwargs['post_embedding'] == 'False':
    #     embedded_post = get_embedding(post)

    prompt_type = kwargs['prompt_type']

    prompt_type_map = {
        'boost': 0,
        'suppress': 1
    }

    try:
        prompt_type = prompt_type_map[prompt_type]
    except KeyError:
        raise ValueError(f"Invalid prompt type: {prompt_type}")

    post_category = kwargs['post_category']
    prompt_category = kwargs['prompt_category']

    # check if post_category and prompt_category are in the list of categories
    if post_category not in CATEGORIES:
        raise ValueError(f"Invalid post category: {post_category}")

    if prompt_category not in CATEGORIES:
        raise ValueError(f"Invalid prompt category: {prompt_category}")

    # make a one hot encoding of the categories
    post_category_index = CATEGORIES.index(post_category)
    prompt_category_index = CATEGORIES.index(prompt_category)

    post_category = [0] * len(CATEGORIES)
    post_category[post_category_index] = 1

    prompt_category = [0] * len(CATEGORIES)
    prompt_category[prompt_category_index] = 1

    # try:
    #     prompt = np.array(eval(prompt))
    # except:
    #     print(prompt)
    #     raise ValueError(f"Invalid prompt: {prompt}")

    # try:
    #     post = np.array(eval(post))
    # except:
    #     print(post)
    #     raise ValueError(f"Invalid post: {post}")
    features = np.concatenate([
        prompt,
        post,
        [prompt_type],
        post_category,
        prompt_category
    ])


    # print(features.shape)
    # print(features)
    return features

# def get_embedding(text: str | list[str]):
#     return embedding_model.encode(text, device=device)

In [56]:
MODEL = load_model('./api/model.bin')
MODEL.to(device)
MODEL.eval()

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (conv_stack_post): Sequential(
    (0): Conv1d(1, 16, kernel_size=(8,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): ReLU()
    (5): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (7): ReLU()
    (8): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (10): ReLU()
    (11): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv1d(64, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (13): ReLU()
    (14): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15): Dropout(p=0.2, inplace=False)
  )
  (conv_stack

In [61]:
def get_inference(row):
    features = process_input(**row)
    # print(features)
    features = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
    return infer(features)[0]

In [19]:
def mean_reciprocal_rank(true_ranks, predicted_ranks):
    """
    Calculates the mean reciprocal rank (MRR) for the given true ranks and predicted ranks.

    Args:
        true_ranks (list): A list of true ranks.
        predicted_ranks (list): A list of lists, where each inner list represents the predicted ranks.

    Returns:
        float: The mean reciprocal rank.
    """
    reciprocal_ranks = []
    for true_rank, predicted_rank in zip(true_ranks, predicted_ranks):
        # If the true rank is not in the predicted ranks, the reciprocal rank is 0
        if true_rank not in predicted_rank:
            reciprocal_rank = 0.0
        else:
            # Get the position of the true rank in the predicted ranks
            rank_position = predicted_rank.index(true_rank) + 1
            reciprocal_rank = 1.0 / rank_position

        reciprocal_ranks.append(reciprocal_rank)

    mrr = np.mean(reciprocal_ranks)
    return mrr



In [20]:
def get_from_api():
    url = "https://train.synapse.com.np/training_data"
    response = requests.get(url)
    return response.json()

In [21]:
data = get_from_api()

In [22]:
len(data)

3230

In [23]:
# flatten the list of dicts
"""
eg: a dict {a: {b: 1, c: 2}, d: 3} will be converted to {a_b: 1, a_c: 2, d: 3}
"""
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# flatten data
data = [flatten_dict(d) for d in data]

In [24]:
data[0]

{'id': 2826,
 'post_title': 'How to Get Rid of Skin Tags, According to a Dermatologist',
 'post_abstract': "They seem harmless, but there's a very good reason you shouldn't ignore them. The post How to Get Rid of Skin Tags, According to a Dermatologist appeared first on Reader's Digest.",
 'post_category': 'health',
 'post_subcategory': 'medical',
 'prompt_prompt': 'Show updates on scientific breakthroughs in medicine.',
 'prompt_type': 'boost',
 'prompt_category': 'health',
 'prompt_subcategory': 'medical',
 'label': 0.8}

In [25]:
@cache
def embed(text: str | list[str]):
    # check if the embeddingmodel is mixedbread-ai/mxbai-embed-large-v1
    return embedding_model.encode(text, device=device)

In [26]:
t = pd.DataFrame(data)

In [27]:
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3230 entries, 0 to 3229
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  3230 non-null   int64  
 1   post_title          3230 non-null   object 
 2   post_abstract       3061 non-null   object 
 3   post_category       3230 non-null   object 
 4   post_subcategory    3230 non-null   object 
 5   prompt_prompt       3230 non-null   object 
 6   prompt_type         3230 non-null   object 
 7   prompt_category     3230 non-null   object 
 8   prompt_subcategory  3230 non-null   object 
 9   label               3230 non-null   float64
dtypes: float64(1), int64(1), object(8)
memory usage: 252.5+ KB


In [28]:
def get_categories():
    api = "https://train.synapse.com.np/categories"
    response = requests.get(api)
    return response.json()

In [29]:
categories = get_categories()
categories.sort()
categories

['autos',
 'entertainment',
 'finance',
 'foodanddrink',
 'health',
 'kids',
 'lifestyle',
 'middleeast',
 'movies',
 'music',
 'news',
 'northamerica',
 'sports',
 'travel',
 'tv',
 'video',
 'weather']

In [None]:
x = pl.DataFrame(data)
x = x.with_columns(
        pl.concat_str([
            pl.col("post_title"),
            pl.col("post_abstract"),
        ]).alias("post")
    ).with_columns( # handle null values for post
        pl.when(pl.col("post").is_null())
        .then(pl.col("post_title"))
        .otherwise(pl.col("post")).alias("post")
    ).with_columns( # create embeddings for the post and prompt
        pl.col("post").map_elements(embed).alias("post_embedding"),
        pl.col("prompt_prompt").map_elements(embed).alias("prompt_embedding"),
        pl.col("label").cast(pl.Float32).alias("label") # cast the label to a float
    )

x.to_pandas()


In [37]:
def preprocess(raw_data):
    # df = pl.DataFrame(raw_data, schema= {'id': pl.Int32, 'title': pl.Utf8, 'abstract': pl.Utf8, 'prompt': pl.Utf8, 'prompt_type': pl.Utf8, 'label': pl.Utf8})
    df = pl.DataFrame(raw_data)
    # concat the title and abstract
    df = df.with_columns(
        pl.concat_str([
            pl.col("post_title"),
            pl.col("post_abstract"),
        ]).alias("post")
    ).with_columns( # handle null values for post
        pl.when(pl.col("post").is_null())
        .then(pl.col("post_title"))
        .otherwise(pl.col("post")).alias("post")
    ).with_columns( # create embeddings for the post and prompt
        pl.col("post").map_elements(embed).alias("post_embedding"),
        pl.col("prompt_prompt").map_elements(embed).alias("prompt_embedding"),
        pl.col("label").cast(pl.Float32).alias("label") # cast the label to a float
    )

    df = df.to_pandas()


    # df['prompt_type'] = df['prompt_type'].map({'suppress': 0, 'boost': 1})
    # df = df[['id', 'post', 'post_embedding', 'prompt', 'prompt_embedding','prompt_type', 'post_category', 'prompt_category', 'label']]

    # for category in categories:
    #     df[f'prompt_{category}'] = (df['prompt_category'] == category).astype(int)

    # for category in categories:
    #     df[f'post_{category}'] = (df['post_category'] == category).astype(int)

    # df = df.drop(columns=['post_category', 'prompt_category'])

    return df

In [38]:
df = preprocess(data)

In [39]:
df.columns

Index(['id', 'post_title', 'post_abstract', 'post_category',
       'post_subcategory', 'prompt_prompt', 'prompt_type', 'prompt_category',
       'prompt_subcategory', 'label', 'post', 'post_embedding',
       'prompt_embedding'],
      dtype='object')

In [40]:
# rename df.prompt_prompt to df.prompt
df = df.rename(columns={
    'prompt_prompt': 'prompt',
    'post_title': 'title',
    'post_abstract': 'abstract'
    })

In [42]:
df.columns

Index(['id', 'title', 'abstract', 'post_category', 'post_subcategory',
       'prompt', 'prompt_type', 'prompt_category', 'prompt_subcategory',
       'label', 'post', 'post_embedding', 'prompt_embedding'],
      dtype='object')

In [86]:
source_df = df.groupby('prompt').filter(lambda x: len(x) >= 10)

In [88]:
source_df.shape

(3179, 13)

In [66]:
source_df['prompt_embedding']

0       [0.0050256783, -0.054736964, 0.009037347, -0.0...
1       [-0.014443579, -0.057925355, 0.011125997, -0.0...
2       [-0.011852036, -0.06560517, 0.003943302, -0.00...
3       [-0.006226413, -0.050020713, 0.034159217, -0.0...
4       [-0.021042276, -0.07230517, 0.021734122, -0.01...
                              ...                        
3225    [-0.020555772, -0.06798802, 0.022750273, 0.000...
3226    [-0.009810819, -0.048323907, 0.016124412, -0.0...
3227    [-0.009810819, -0.048323907, 0.016124412, -0.0...
3228    [-0.031405423, -0.045943644, 0.007384266, 0.02...
3229    [0.0016587655, -0.050278198, 0.006163819, -0.0...
Name: prompt_embedding, Length: 3223, dtype: object

In [89]:
# QUERY = """
#         select
#             l.id,
#             l.post_id,
#                 p.title, p.abstract, p.category post_category, p_emb.embedding post_embedding,
#             l.prompt_id,
#                 pr.prompt, pr.prompt_type, pr.category prompt_category, pr_emb.embedding prompt_embedding,
#             l.label from labels l
#         join posts p on l.post_id = p.id
#         join prompts pr on l.prompt_id = pr.id
#         join posts_emb p_emb on l.post_id = p_emb.id
#         join prompts_emb pr_emb on l.prompt_id = pr_emb.id
#         order by random()
#         ;
#     """
# cursor = conn.cursor()
# cursor.execute(QUERY)
# labels = cursor.fetchall()
# source_df = pd.DataFrame(labels, columns=["id", "post_id", "title", "abstract", "post_category", "post", "prompt_id", "prompt_text", "prompt_type", "prompt_category", "prompt",  "label" ])

source_df['inference'] = source_df.apply(get_inference, axis=1)
source_df['inference'] = source_df['inference'].round(1)

In [90]:
def get_cosine(row):
    post = row['post_embedding']
    prompt = row['prompt_embedding']
    # prompt = np.array(eval(prompt))
    # post = np.array(eval(post))
    # print(post)
    post = torch.tensor(post, dtype=torch.float32)
    prompt = torch.tensor(prompt, dtype=torch.float32)

    cosine = F.cosine_similarity(post, prompt, dim=0)
    if row['prompt_type'] == 'suppress':
        return 1 - cosine.item()
    return cosine.item()

In [91]:
source_df['cosine'] = source_df.apply(get_cosine, axis=1)


### Mean Reciprocal Rank

In [77]:
def loop(source_df):
    df = source_df.groupby('prompt').apply(lambda x: x.sample(5)).reset_index(drop=True)
    mrrs = []
    for name, group in df.groupby('prompt'):
        group['label_rank'] = group['label'].rank(method='dense')
        group['inference_rank'] = group['inference'].rank(method='dense')
        true_ranks = group['label_rank'].values
        predicted_ranks = group['inference_rank'].values
        mrr = mean_reciprocal_rank(true_ranks, [[x] for x in predicted_ranks])
        mrrs.append((name, mrr))

    return np.mean([mrr for name, mrr in mrrs])

### Mean Average Precision

In [143]:
def average_precision(relevant_items, retrieved_items):
    """
    Calculates the average precision (AP) for the given relevant items and retrieved items.

    Args:
        relevant_items (list): A list of relevant items.
        retrieved_items (list): A list of retrieved items.

    Returns:
        float: The average precision.
    """
    intersections = 0
    for i, item in enumerate(relevant_items):
        if item in retrieved_items:
            intersections += 1

    if intersections == 0:
        return 0.0

    return intersections / len(retrieved_items)


In [162]:
# pick a random prompt
# for that prompt, all posts with label > 0.7 are relevant prompts
# get the top 5 and top 10 posts for the relevant list @5 and @10

# sort the posts by inference
# get the top 5 and top 10 posts for the retrieved list @5 and @10

# call the mean_average_precision on the relevant and retrieved lists @5 and @10

def mean_average_precision(x, n = 5):
    # pick a random prompt and return the prompt and corresponding posts
    data = x.groupby('prompt').apply(lambda x: x.sample(n)).reset_index(drop=True)
    map = []
    for name, group in data.groupby('prompt'):
        group['label_rank'] = group['label'].rank(method='dense')
        group['inference_rank'] = group['inference'].rank(method='dense')
        relevant_items = group[group['label'] > 0.65]

        relevant_items_top = relevant_items.sort_values('label', ascending=False).head(n)
        retrieved_items_top = group.sort_values('inference', ascending=False).head(n)


        ap = average_precision(relevant_items_top['title'].values, retrieved_items_top['title'].values)
        map.append(ap)
    return np.mean(map)


In [163]:
mean_average_precision(source_df, n=10)

0.3169014084507043

In [164]:
mean_average_precision(source_df, n=5)


0.34084507042253515