In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Load the interaction data
data_path = 'metadata/'
interaction_df = pd.read_csv(data_path + 'interactions.csv').drop('Unnamed: 0', axis=1)

# Preprocess the interaction data
interaction_df['SurveyAnswerScore'] = np.round(interaction_df['SurveyAnswerScore'], 2)
interaction_df['RecommendStar'] = np.round(interaction_df['RecommendStar'], 2)
interaction_df = interaction_df.drop('InsertedOn', axis=1)

# Aggregate the interaction data
interaction_df_mod = interaction_df.groupby(['PERSON_ID', 'ProviderCode']).agg({
    'SurveyAnswerScore': 'mean',
    'RecommendStar': 'mean',
    'num_visits': 'first',
    'age': 'mean',
    'ratings': 'mean',
    'star_rating': 'first',
    'survey_score': 'first',
    'zip_code': 'first',
    'GENDER_Female': 'first',
    'GENDER_Male': 'first'
}).reset_index()

interaction_df_mod['visit_ratings'] = 0.65 * interaction_df_mod['ratings'] + 0.35 * interaction_df_mod['num_visits']

# Prepare provider features
provider_features = interaction_df_mod.groupby('ProviderCode').agg({
    'num_visits': 'sum',
    'survey_score': 'mean',
    'star_rating': 'mean'
}).reset_index()

# Prepare person features
person_features = interaction_df_mod.groupby('PERSON_ID').agg({
    'age': 'first',
    'GENDER_Female': 'first',
    'GENDER_Male': 'first'
}).reset_index()

# Prepare edge features
edges = interaction_df_mod[['PERSON_ID', 'ProviderCode', 'visit_ratings']]

# Map IDs to integers
person_id_map = {id: idx for idx, id in enumerate(person_features['PERSON_ID'].unique())}
provider_id_map = {id: idx + len(person_id_map) for idx, id in enumerate(provider_features['ProviderCode'].unique())}
id_provider_map =  {value: key for key, value in provider_id_map.items()}

# Apply mapping to the edges
edges['PERSON_ID'] = edges['PERSON_ID'].map(person_id_map)
edges['ProviderCode'] = edges['ProviderCode'].map(provider_id_map)

# Initialize the graph
G = nx.Graph()

# Add person nodes with features
for _, row in person_features.iterrows():
    G.add_node(person_id_map[row['PERSON_ID']], bipartite=0, age=row['age'], gender_female=row['GENDER_Female'], gender_male=row['GENDER_Male'])

# Add provider nodes with features
for _, row in provider_features.iterrows():
    G.add_node(provider_id_map[row['ProviderCode']], bipartite=1, num_visits=row['num_visits'], survey_score=row['survey_score'], star_rating=row['star_rating'])

# Add edges with features
for _, row in edges.iterrows():
    G.add_edge(row['PERSON_ID'], row['ProviderCode'], visit_ratings=row['visit_ratings'])

# Convert to adjacency matrix
adj_matrix = nx.to_numpy_array(G)
adj_matrix = torch.tensor(adj_matrix, dtype=torch.float32)

# Extract node features for persons and providers
person_nodes = [n for n, d in G.nodes(data=True) if d['bipartite'] == 0]
provider_nodes = [n for n, d in G.nodes(data=True) if d['bipartite'] == 1]

person_features = torch.tensor([
    [d['age'], d['gender_female'], d['gender_male']] 
    for n, d in G.nodes(data=True) if n in person_nodes
], dtype=torch.float32)

provider_features = torch.tensor([
    [d['num_visits'], d['survey_score'], d['star_rating']]
    for n, d in G.nodes(data=True) if n in provider_nodes
], dtype=torch.float32)

# Combine person and provider features into a single feature matrix
all_features = []
for node in G.nodes(data=True):
    if node[1]['bipartite'] == 0:
        all_features.append([node[1]['age'], node[1]['gender_female'], node[1]['gender_male']])
    else:
        all_features.append([node[1]['num_visits'], node[1]['survey_score'], node[1]['star_rating']])

features = torch.tensor(all_features, dtype=torch.float32)

# Prepare edge indices and labels
edge_list = np.array(G.edges)
labels = np.array([G[u][v]['visit_ratings'] for u, v in G.edges])

# Split the edges into train, validation, and test sets
train_edges, test_edges, train_labels, test_labels = train_test_split(edge_list, labels, test_size=0.2, random_state=42)
train_edges, val_edges, train_labels, val_labels = train_test_split(train_edges, train_labels, test_size=0.2, random_state=42)

train_edge_index = torch.tensor(train_edges, dtype=torch.long)
val_edge_index = torch.tensor(val_edges, dtype=torch.long)
test_edge_index = torch.tensor(test_edges, dtype=torch.long)

train_labels = torch.tensor(train_labels, dtype=torch.float32).view(-1, 1)
val_labels = torch.tensor(val_labels, dtype=torch.float32).view(-1, 1)
test_labels = torch.tensor(test_labels, dtype=torch.float32).view(-1, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['PERSON_ID'] = edges['PERSON_ID'].map(person_id_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['ProviderCode'] = edges['ProviderCode'].map(provider_id_map)


In [2]:
class GNNLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout=0.8, l2_reg=1e-5):
        super(GNNLayer, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm = nn.BatchNorm1d(out_features)
        self.l2_reg = l2_reg
    
    def forward(self, adjacency_matrix, feature_matrix):
        h = torch.mm(adjacency_matrix, feature_matrix)
        h = self.linear(h)
        h = self.dropout(h)
        h = self.batch_norm(h)
        return F.relu(h)

class GNNModel(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, dropout=0.8, l2_reg=1e-5):
        super(GNNModel, self).__init__()
        self.gnn1 = GNNLayer(in_features, hidden_features, dropout, l2_reg)
        self.gnn2 = GNNLayer(hidden_features, out_features, dropout, l2_reg)
    
    def forward(self, adjacency_matrix, feature_matrix):
        h = self.gnn1(adjacency_matrix, feature_matrix)
        h = self.gnn2(adjacency_matrix, h)
        return h

# Assume the feature size for persons is 3 (age, gender_female, gender_male)
# and for providers is 3 (num_visits, survey_score, star_rating)
model = GNNModel(in_features=3, hidden_features=32, out_features=1, dropout=0.8, l2_reg=1e-5)


In [3]:
def train_gcn(model, adj_matrix, features, train_edge_index, train_labels, val_edge_index, val_labels, epochs=200, lr=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    patience = 20
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        logits = model(adj_matrix, features)

        train_edge_logits = logits[train_edge_index[:, 0]] + logits[train_edge_index[:, 1]]
        train_loss = criterion(train_edge_logits, train_labels)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_logits = model(adj_matrix, features)
            val_edge_logits = val_logits[val_edge_index[:, 0]] + val_logits[val_edge_index[:, 1]]
            val_loss = criterion(val_edge_logits, val_labels)

        print(f'Epoch {epoch + 1}, Train Loss: {train_loss.item()}, Val Loss: {val_loss.item()}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_gcn_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping")
                break

    model.load_state_dict(torch.load('best_gcn_model.pth'))
    return model

# Assume train_edge_index, train_labels, val_edge_index, and val_labels are already defined
in_feats = features.shape[1]
hidden_feats = 32
out_feats = 1

gcn_model = GNNModel(in_feats, hidden_feats, out_feats)
gcn_model = train_gcn(gcn_model, adj_matrix, features, train_edge_index, train_labels, val_edge_index, val_labels)


Epoch 1, Train Loss: 9.262527465820312, Val Loss: 9.642000198364258
Epoch 2, Train Loss: 9.47844409942627, Val Loss: 9.594399452209473
Epoch 3, Train Loss: 8.540969848632812, Val Loss: 9.232390403747559
Epoch 4, Train Loss: 8.13715934753418, Val Loss: 8.709370613098145
Epoch 5, Train Loss: 7.686004638671875, Val Loss: 8.116190910339355
Epoch 6, Train Loss: 7.26003360748291, Val Loss: 7.516577243804932
Epoch 7, Train Loss: 7.24498176574707, Val Loss: 7.187308311462402
Epoch 8, Train Loss: 6.826931953430176, Val Loss: 6.964295387268066
Epoch 9, Train Loss: 6.644498348236084, Val Loss: 6.877194404602051
Epoch 10, Train Loss: 6.639065742492676, Val Loss: 6.877134799957275
Epoch 11, Train Loss: 6.43827486038208, Val Loss: 6.939988613128662
Epoch 12, Train Loss: 6.2265496253967285, Val Loss: 6.879476547241211
Epoch 13, Train Loss: 6.212185382843018, Val Loss: 6.762838840484619
Epoch 14, Train Loss: 6.09029483795166, Val Loss: 6.700096130371094
Epoch 15, Train Loss: 5.775910377502441, Val Los

In [4]:
def calculate_ndcg(model, adj_matrix, features, test_edge_index, test_labels):
    model.eval()
    with torch.no_grad():
        logits = model(adj_matrix, features)
        test_edge_logits = logits[test_edge_index[:, 0]] + logits[test_edge_index[:, 1]]
        test_edge_logits = test_edge_logits.cpu().numpy().reshape(1, -1)
        test_labels = test_labels.cpu().numpy().reshape(1, -1)
        return ndcg_score(test_labels, test_edge_logits)

# Assume test_edge_index and test_labels are already defined
ndcg = calculate_ndcg(gcn_model, adj_matrix, features, test_edge_index, test_labels)
print(f'GCN NDCG Score: {ndcg}')


GCN NDCG Score: 0.9321025407205367


In [5]:
torch.save(gcn_model.state_dict(), 'models/gcn_reco.pth')


In [6]:
gcn_model = GNNModel(in_feats, hidden_feats, out_feats)
gcn_model.load_state_dict(torch.load('models/gcn_reco.pth'))
gcn_model.eval()


GNNModel(
  (gnn1): GNNLayer(
    (linear): Linear(in_features=3, out_features=32, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
    (batch_norm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (gnn2): GNNLayer(
    (linear): Linear(in_features=32, out_features=1, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
    (batch_norm): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [7]:
def get_recommendations(model, user_id, k=5):
        # Get the node ID for the user
        if user_id not in person_id_map:
            return []
            
        user_node = person_id_map[user_id]
        # Get the logits (embeddings) for all nodes
        with torch.no_grad():
            logits = model(adj_matrix, features)
        
        # Get the embeddings for the user
        user_embedding = logits[user_node]
    
        # Calculate the score for each provider
        provider_scores = []
        for provider_node in provider_nodes:
            provider_embedding = logits[provider_node]
            score = torch.dot(user_embedding, provider_embedding).item()
            provider_scores.append((provider_node, score))
    
        # Sort the providers by score in descending order
        provider_scores = sorted(provider_scores, key=lambda x: x[1], reverse=True)

        ## here we have to filter the  available provider nodes selected as per 

        print(provider_scores[:k])
        # Get the top k providers
        top_providers = [id_provider_map[provider_node] for provider_node, score in provider_scores[:k]]
        
        return top_providers

# Example usage
user_id = '0071588ba3116840b32d9f7fcf3ce2707c5bbcc09a3960fbb5c91e1e7e8eb21d'
recommendations = get_recommendations(gcn_model, user_id)
print(f"Top recommendations for user {user_id}: {recommendations}")


[(3472, 1.1778837442398071), (3567, 1.157257080078125), (3415, 1.1554570198059082), (3697, 1.1509815454483032), (3677, 1.1483124494552612)]
Top recommendations for user 0071588ba3116840b32d9f7fcf3ce2707c5bbcc09a3960fbb5c91e1e7e8eb21d: ['GF4D9', 'XHRCW', '6I98Z', 'Y7PRQ', 'Y52M5']
