In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Load the interaction data
data_path = 'metadata/'
interaction_df = pd.read_csv(data_path + 'interactions.csv').drop('Unnamed: 0', axis=1)
interaction_df = interaction_df.rename(columns={'FACILITYCODE': 'FacilityCode'})

# Preprocess the interaction data
# Preprocess the interaction data
interaction_df['SurveyAnswerScore'] = np.round(interaction_df['SurveyAnswerScore'], 2)
interaction_df['RecommendStar'] = np.round(interaction_df['RecommendStar'], 2)
interaction_df = interaction_df.drop('InsertedOn', axis=1)

# Aggregate the interaction data
interaction_df_mod = interaction_df.groupby(['PERSON_ID', 'ProviderCode']).agg({
    'SurveyAnswerScore': 'mean',
    'RecommendStar': 'mean',
    'num_visits': 'first',
    'age': 'mean',
    'ratings': 'mean',
    'star_rating': 'first',
    'survey_score': 'first',
    'zip_code': 'first',
    'GENDER_Female': 'first',
    'GENDER_Male': 'first',
    'FacilityCode': lambda x: list(x),
    'SpecialtyCode': lambda x: list(x)
}).reset_index()

interaction_df_mod['visit_ratings'] = 0.65 * interaction_df_mod['ratings'] + 0.35 * interaction_df_mod['num_visits']

# Generate unique integer mappings for FacilityCode and SpecialtyCode
all_facility_codes = set(code for sublist in interaction_df_mod['FacilityCode'] for code in sublist)
facility_code_map = {code: idx for idx, code in enumerate(all_facility_codes)}

all_specialty_codes = set(code for sublist in interaction_df_mod['SpecialtyCode'] for code in sublist)
specialty_code_map = {code: idx for idx, code in enumerate(all_specialty_codes)}

interaction_df_mod['FacilityCode'] = interaction_df_mod['FacilityCode'].apply(lambda codes: [facility_code_map[code] for code in codes])
interaction_df_mod['SpecialtyCode'] = interaction_df_mod['SpecialtyCode'].apply(lambda codes: [specialty_code_map[code] for code in codes])

# Prepare provider features
provider_features = interaction_df_mod.groupby('ProviderCode').agg({
    'num_visits': 'sum',
    'survey_score': 'mean',
    'star_rating': 'mean',
    'FacilityCode': lambda x: list(set([i for sublist in x for i in sublist])),
    'SpecialtyCode': lambda x: list(set([i for sublist in x for i in sublist]))
}).reset_index()

# Prepare person features
person_features = interaction_df_mod.groupby('PERSON_ID').agg({
    'age': 'first',
    'GENDER_Female': 'first',
    'GENDER_Male': 'first',
    'FacilityCode': lambda x: list(set([i for sublist in x for i in sublist])),
    'SpecialtyCode': lambda x: list(set([i for sublist in x for i in sublist]))
}).reset_index()

# Prepare edge features
edges = interaction_df_mod[['PERSON_ID', 'ProviderCode', 'visit_ratings']]

# Map IDs to integers
person_id_map = {id: idx for idx, id in enumerate(person_features['PERSON_ID'].unique())}
provider_id_map = {id: idx + len(person_id_map) for idx, id in enumerate(provider_features['ProviderCode'].unique())}
id_provider_map = {value: key for key, value in provider_id_map.items()}

# Apply mapping to the edges
edges['PERSON_ID'] = edges['PERSON_ID'].map(person_id_map)
edges['ProviderCode'] = edges['ProviderCode'].map(provider_id_map)

# Initialize the graph
G = nx.Graph()

# Add person nodes with features
for _, row in person_features.iterrows():
    G.add_node(person_id_map[row['PERSON_ID']], bipartite=0, age=row['age'], gender_female=row['GENDER_Female'], gender_male=row['GENDER_Male'], facility_codes=row['FacilityCode'], specialty_codes=row['SpecialtyCode'])

# Add provider nodes with features
for _, row in provider_features.iterrows():
    G.add_node(provider_id_map[row['ProviderCode']], bipartite=1, num_visits=row['num_visits'], survey_score=row['survey_score'], star_rating=row['star_rating'], facility_codes=row['FacilityCode'], specialty_codes=row['SpecialtyCode'])

# Add edges with features
for _, row in edges.iterrows():
    G.add_edge(row['PERSON_ID'], row['ProviderCode'], visit_ratings=row['visit_ratings'])

# Convert to adjacency matrix
adj_matrix = nx.to_numpy_array(G)
adj_matrix = torch.tensor(adj_matrix, dtype=torch.float32)

# Extract node features for persons and providers
person_nodes = [n for n, d in G.nodes(data=True) if d['bipartite'] == 0]
provider_nodes = [n for n, d in G.nodes(data=True) if d['bipartite'] == 1]

person_features_tensor = torch.tensor([
    [d['age'], d['gender_female'], d['gender_male']] 
    for n, d in G.nodes(data=True) if n in person_nodes
], dtype=torch.float32)

provider_features_tensor = torch.tensor([
    [d['num_visits'], d['survey_score'], d['star_rating']]
    for n, d in G.nodes(data=True) if n in provider_nodes
], dtype=torch.float32)

# Create a single feature tensor
all_features = torch.cat([person_features_tensor, provider_features_tensor], dim=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['PERSON_ID'] = edges['PERSON_ID'].map(person_id_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['ProviderCode'] = edges['ProviderCode'].map(provider_id_map)


In [2]:
class GNNLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout=0.8, l2_reg=1e-5):
        super(GNNLayer, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm = nn.BatchNorm1d(out_features)
        self.l2_reg = l2_reg

    def forward(self, adjacency_matrix, feature_matrix):
        h = torch.mm(adjacency_matrix, feature_matrix)
        h = self.linear(h)
        h = self.dropout(h)
        h = self.batch_norm(h)
        return F.relu(h)

class GNNModel(nn.Module):
    def __init__(self, num_facilities, num_specialties, in_features, hidden_features, out_features, embedding_dim=16, dropout=0.8, l2_reg=1e-5):
        super(GNNModel, self).__init__()
        self.facility_embedding = nn.Embedding(num_facilities, embedding_dim)
        self.specialty_embedding = nn.Embedding(num_specialties, embedding_dim)
        self.gnn1 = GNNLayer(in_features + 2 * embedding_dim, hidden_features, dropout, l2_reg)
        self.gnn2 = GNNLayer(hidden_features, out_features, dropout, l2_reg)

    def forward(self, adjacency_matrix, features, person_facility_codes, person_specialty_codes, provider_facility_codes, provider_specialty_codes):
        # Generate embeddings for facility and specialty codes
        person_facility_embeds = [torch.mean(self.facility_embedding(torch.tensor(codes)), dim=0) for codes in person_facility_codes]
        person_specialty_embeds = [torch.mean(self.specialty_embedding(torch.tensor(codes)), dim=0) for codes in person_specialty_codes]
        provider_facility_embeds = [torch.mean(self.facility_embedding(torch.tensor(codes)), dim=0) for codes in provider_facility_codes]
        provider_specialty_embeds = [torch.mean(self.specialty_embedding(torch.tensor(codes)), dim=0) for codes in provider_specialty_codes]

        # Concatenate embeddings with other features
        person_embeds = torch.cat([torch.stack(person_facility_embeds), torch.stack(person_specialty_embeds)], dim=1)
        provider_embeds = torch.cat([torch.stack(provider_facility_embeds), torch.stack(provider_specialty_embeds)], dim=1)

        # Combine features with embeddings
        person_features = torch.cat([features[:len(person_embeds)], person_embeds], dim=1)
        provider_features = torch.cat([features[len(person_embeds):], provider_embeds], dim=1)

        features = torch.cat([person_features, provider_features], dim=0)

        # Apply GNN layers
        h = self.gnn1(adjacency_matrix, features)
        h = self.gnn2(adjacency_matrix, h)
        return h

In [3]:
person_facility_codes = [d['facility_codes'] for n, d in G.nodes(data=True) if n in person_nodes]
person_specialty_codes = [d['specialty_codes'] for n, d in G.nodes(data=True) if n in person_nodes]
provider_facility_codes = [d['facility_codes'] for n, d in G.nodes(data=True) if n in provider_nodes]
provider_specialty_codes = [d['specialty_codes'] for n, d in G.nodes(data=True) if n in provider_nodes]

In [4]:
num_facilities = len(interaction_df['FacilityCode'].unique().tolist())
num_specialties = len(interaction_df['SpecialtyCode'].unique().tolist())

In [5]:
in_feats = 3
hidden_feats = 32
out_feats = 1

In [6]:
gcn_model = GNNModel(num_facilities, num_specialties,in_feats, hidden_feats, out_feats)
gcn_model.load_state_dict(torch.load('models/gcn_reco_v2.pth'))
gcn_model.eval()

GNNModel(
  (facility_embedding): Embedding(53, 16)
  (specialty_embedding): Embedding(89, 16)
  (gnn1): GNNLayer(
    (linear): Linear(in_features=35, out_features=32, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
    (batch_norm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (gnn2): GNNLayer(
    (linear): Linear(in_features=32, out_features=1, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
    (batch_norm): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [7]:
def get_recommendations(model, user_id,features,selected_provider_lst, k=5):
    # Get the node ID for the user
    if user_id not in person_id_map:
        return []
        
    user_node = person_id_map[user_id]    
    # Get the logits (embeddings) for all nodes
    with torch.no_grad():
        logits = model(adj_matrix, features, person_facility_codes, person_specialty_codes, provider_facility_codes, provider_specialty_codes)
    
    # Get the embeddings for the user
    user_embedding = logits[user_node]
    
    # Calculate the score for each provider
    provider_scores = []
    for provider_node in provider_nodes:
        provider_embedding = logits[provider_node]
        score = torch.dot(user_embedding, provider_embedding).item()        
        # Filter based on zip code and specialty code if provided
        provider_id = id_provider_map[provider_node]
        provider_info = provider_features[provider_features['ProviderCode'] == provider_id].iloc[0]     
        
        provider_scores.append((provider_node, score))

    
    mapped_provider = []
    for item_i,item_j in provider_scores:
        if id_provider_map[item_i] in selected_provider_lst:
            mapped_provider.append((item_i,item_j))
            

    # Sort the providers by score in descending order
    provider_scores = sorted(mapped_provider, key=lambda x: x[1], reverse=True)

    # Get the top k providers
    top_providers = [(id_provider_map[provider_node],score) for provider_node, score in provider_scores[:k]]
    
    return top_providers

In [8]:
user_id = '0071588ba3116840b32d9f7fcf3ce2707c5bbcc09a3960fbb5c91e1e7e8eb21d'

In [9]:
zip_code = 34950
specialty_code = 'PS305'
selected_provider_df = interaction_df[(interaction_df['zip_code']==zip_code)
                                            & (interaction_df['SpecialtyCode']==specialty_code)]
selected_provider_lst = selected_provider_df['ProviderCode'].unique().tolist()

In [10]:
recommendations = get_recommendations(gcn_model, user_id,all_features,selected_provider_lst, k=5)
print(f"Top recommendations for user {user_id}: {recommendations}")

Top recommendations for user 0071588ba3116840b32d9f7fcf3ce2707c5bbcc09a3960fbb5c91e1e7e8eb21d: [('2J7P4', 0.8380189538002014), ('XF6BG', 0.806585967540741), ('2RTCQ', 0.771134078502655)]


In [11]:
reco_lst = [item[0] for item in recommendations]
providers_df = interaction_df[interaction_df['ProviderCode'].isin(reco_lst)]

In [12]:
person_df=interaction_df[interaction_df['PERSON_ID']==user_id]