In [41]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [42]:
# Load model from HuggingFace Hub
miniLM_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
miniLM_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [43]:
def get_embeddings(sentences, model, tokenizer):
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    
    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity_score(emb1, emb2):
    emb1 = emb1.reshape(1, -1)
    emb2 = emb2.reshape(1, -1)
    
    # Calculate cosine similarity
    return cosine_similarity(emb1, emb2)[0][0]

In [45]:
initial_topics_scores = {
    'Design':0,
    'Web Development':0,
    'Machine Learning':0,
    'UI/UX':0,
    'Management':0,
    'Finance':0,
    'Recruitments':0,
    'Cyber security':0,
    'Video Editing':0,
    'Content Writing':0,
    'DSA':0,
    'Dev Ops':0,
    'Photography':0
}

In [46]:
recommendation_config = {
    'user':{
        'tags':['frontend-dev', 'ml-ops'],
        'weight':0.2,
        'type':'single'
    },
    'searches':{
        'tags':['blockchain development', 'cloud computing'],
        'weight':0.1,
        'type':'single'
    },
    'followings':{
        'tags':[['backend-dev', 'ui-design']],
        'weight':0.1,
        'type':'multiple'
    },
    'applied_openings':{
        'tags':[['web development', 'data analysis']],
        'weight':0.1,
        'type':'multiple'
    },
    'member_organisations':{
        'tags':[['content writing', 'ui/ux design']],
        'weight':0.1,
        'type':'multiple'
    },
    'liked_posts':{
        'tags':[['Machine Learning', 'Web Development']],
        'weight':0.2,
        'type':'multiple'
    },
    'liked_projects':{
        'tags':[['project management', 'frontend development']],
        'weight':0.1,
        'type':'multiple'
    },
    'liked_events':{
        'tags':[['cybersecurity', 'digital marketing']],
        'weight':0.1,
        'type':'multiple'
    }
} 

In [47]:
def preprocess_weights(config):
    zero_keys = []
    for key in config.keys():
        if len(config[key]['tags'])==0:
            config[key]['weight']=0
            zero_keys.append(key)
            
    if len(zero_keys)>0:
        constant_keys = ['user']
        remaining_keys = [key for key in config.keys() if key not in constant_keys and key not in zero_keys]

        if len(remaining_keys)==0:
            split_weight=1/len(constant_keys)
            for key in constant_keys:
                config[key]['weight']=split_weight
            return config
        
        remaining_weight = 1
        for key in constant_keys:
            remaining_weight-=config[key]['weight']
                    
        split_weight = remaining_weight/len(remaining_keys)
        if split_weight>=0.25:
            split_weight = 1/len(constant_keys+remaining_keys)
            for key in constant_keys+remaining_keys:
                config[key]['weight']=split_weight
        else:
            for key in remaining_keys:
                config[key]['weight']=split_weight

    return config

In [49]:
import numpy as np

def assign_weights(n, equal=False):
    if equal:
        return [1/n for i in range(n)]
        
    weights = np.linspace(n, 1, num=n)
    
    # Normalize the weights so that their sum is 1
    normalized_weights = weights / np.sum(weights)
    
    return normalized_weights

In [55]:
def assign_tag_weights(config):
    equal_keys = ['followings','member_organisations']

    for key in config.keys():
        if config[key]['type'] == 'multiple' and len(config[key]['tags'])>0:
            is_equal = False
            if key in equal_keys:
                is_equal = True
            config[key]['tag_weights'] = assign_weights(len(config[key]['tags']), equal=is_equal)
    
    return config

In [91]:
def increment_score(tags, weight, scores_obj, threshold=0.3):
    if tags is not None:
        for topic in scores_obj.keys():
            topic_emb = get_embeddings(topic, miniLM_model, miniLM_tokenizer)
            for tag in tags:
                tag_emb = get_embeddings(tag, miniLM_model, miniLM_tokenizer)
                score = get_similarity_score(topic_emb, tag_emb)
                if score>threshold:
                    scores_obj[topic]+=score*weight
    return scores_obj

In [98]:
import copy

def get_recommended_topics(config, limit=5):
    topics_scores = copy.deepcopy(initial_topics_scores)
    
    config = assign_tag_weights(preprocess_weights(config))
    
    for key in config.keys():
        if config[key]['type'] == 'multiple' and len(config[key]['tags'])>0:
            for tags, tag_weight in zip(config[key]['tags'], config[key]['tag_weights']):
                topics_scores = increment_score(tags, config[key]['weight']*tag_weight, topics_scores)
        else:
            topics_scores = increment_score(config[key]['tags'], config[key]['weight'], topics_scores)
    
    top_topics = sorted(topics_scores.items(), key=lambda item: item[1], reverse=True)[:limit]
    return top_topics

In [93]:
get_recommended_topics(recommendation_config)

[('Web Development', 0.7446509420871735),
 ('Design', 0.5785354793071747),
 ('Dev Ops', 0.46913512945175173),
 ('Machine Learning', 0.46249214112758635),
 ('UI/UX', 0.315149199962616)]

## Connector

In [31]:
def handle_return(result, multiple=True):
    if result:
        if not multiple:
            if result[0][0] is not None:
                return result[0][0]
            return []
        return [x[0] for x in result]
    return []

def get_user_tags(conn, user_id):
    cursor = conn.cursor()
    query = "SELECT tags FROM users WHERE id = %s"
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall(), False)

def get_user_searches(conn, user_id):
    cursor = conn.cursor()
    query = "SELECT query FROM search_queries WHERE user_id = %s"
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall(), False)

def get_user_following_tags(conn, user_id):
    cursor = conn.cursor()
    query = """
    SELECT u.tags
    FROM follow_followers ff
    JOIN users u ON ff.followed_id = u.id
    WHERE ff.follower_id = %s
    """
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall())

def get_opening_tags_for_user_applications(conn, user_id):
    cursor = conn.cursor()
    query = """
    SELECT o.tags
    FROM applications a
    JOIN openings o ON a.opening_id = o.id
    WHERE a.user_id = %s
    """
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall())

def get_organization_tags_for_user_memberships(conn, user_id):
    cursor = conn.cursor()
    query = """
    SELECT u.tags
    FROM organization_memberships om
    JOIN organizations o ON om.organization_id = o.id
    JOIN users u ON o.user_id = u.id
    WHERE om.user_id = %s
    """
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall())

def get_liked_posts_topics(conn, user_id):
    cursor = conn.cursor()
    query = """
    SELECT p.topics
    FROM posts p
    JOIN likes l ON p.id = l.post_id
    WHERE l.user_id = %s
    """
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall())

def get_liked_project_tags(conn, user_id):
    cursor = conn.cursor()
    query = """
    SELECT p.tags
    FROM projects p
    JOIN likes l ON p.id = l.project_id
    WHERE l.user_id = %s
    """
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall())

def get_liked_event_tags(conn, user_id):
    cursor = conn.cursor()
    query = """
    SELECT e.tags
    FROM events e
    JOIN likes l ON e.id = l.event_id
    WHERE l.user_id = %s
    """
    cursor.execute(query, (user_id,))
    return handle_return(cursor.fetchall())

In [36]:
def get_all_user_ids(conn):
    cursor = conn.cursor()
    query = """
    SELECT id
    FROM users
    """
    cursor.execute(query)
    return handle_return(cursor.fetchall())

In [95]:
import psycopg2

conn = psycopg2.connect(database='postgres',
                            user='postgres',
                            password='pgpass',
                            host='localhost',
                            port=5432)

cursor = conn.cursor()

user_ids = get_all_user_ids(conn)

In [96]:
configs = []

for user_id in user_ids:
    user_tags = get_user_tags(conn, user_id)
    user_searches = get_user_searches(conn ,user_id)
    user_following_tags = get_user_following_tags(conn, user_id)
    opening_tags_for_user_applications = get_opening_tags_for_user_applications(conn ,user_id)
    organization_tags_for_user_memberships = get_organization_tags_for_user_memberships(conn, user_id)
    liked_posts_topics = get_liked_posts_topics(conn, user_id)
    liked_project_tags = get_liked_project_tags(conn, user_id)
    liked_event_tags = get_liked_event_tags(conn, user_id)

    config = {
        'user':{
            'tags':user_tags,
            'weight':0.2,
            'type':'single'
        },
        'searches':{
            'tags':user_searches,
            'weight':0.1,
            'type':'single'
        },
        'followings':{
            'tags':user_following_tags,
            'weight':0.1,
            'type':'multiple'
        },
        'applied_openings':{
            'tags':opening_tags_for_user_applications,
            'weight':0.1,
            'type':'multiple'
        },
        'member_organisations':{
            'tags':organization_tags_for_user_memberships,
            'weight':0.1,
            'type':'multiple'
        },
        'liked_posts':{
            'tags':liked_posts_topics,
            'weight':0.2,
            'type':'multiple'
        },
        'liked_projects':{
            'tags':liked_project_tags,
            'weight':0.1,
            'type':'multiple'
        },
        'liked_events':{
            'tags':liked_event_tags,
            'weight':0.1,
            'type':'multiple'
        }
    }

    configs.append({user_id:config})

In [99]:
for user_config in configs:
    for user_id, config in user_config.items():
        topics = get_recommended_topics(config, 4)
        print(f"Recommended Topics for user {user_id} are {[topics for topics, score in topics]}")

Recommended Topics for user 67ebb62d-935d-4e45-b5a5-5e0536b18ebb are ['Web Development', 'Design', 'Machine Learning', 'UI/UX']
Recommended Topics for user df72452e-422f-474e-bc76-9561184640e3 are ['Machine Learning', 'Design', 'Web Development', 'UI/UX']
Recommended Topics for user b20c7e76-343a-4946-97a2-7717bf4fc0e8 are ['Machine Learning', 'Web Development', 'Design', 'Content Writing']
Recommended Topics for user 91c44181-7b98-4b3e-8553-cc4c855b806a are ['Web Development', 'Machine Learning', 'UI/UX', 'Design']
Recommended Topics for user dd021e1d-10e3-49ff-8d79-d1770a91809c are ['Machine Learning', 'Management', 'Design', 'Cyber security']
Recommended Topics for user 1c57a6ca-46a0-4163-ac29-4bd9ac32086a are ['Design', 'Web Development', 'Machine Learning', 'Content Writing']
Recommended Topics for user 507c2295-6822-4a0a-84e1-72030d1f1b9d are ['Design', 'Web Development', 'Machine Learning', 'UI/UX']
Recommended Topics for user 52daaaa7-50f0-4327-b6a7-23d7a4ab6dfc are ['Design', 