In [1]:
# !wget https://files.grouplens.org/datasets/movielens/ml-1m.zip

In [2]:
# !unzip ./movielens/ml-1m.zip

In [1]:
import pandas as pd

def preprocess_dat_file(file_path, delimiter='\t', header=None):
    """
    Read a .dat file and convert it to a pandas DataFrame.

    Parameters:
    file_path (str): Path to the .dat file.
    delimiter (str, optional): The delimiter used in the file. Defaults to tab ('\t').
    header (int, optional): Row number to use as the column names. Defaults to None, indicating no header row.

    Returns:
    pandas.DataFrame: The processed data.
    """
    # Read the file into a DataFrame
    data = pd.read_csv(file_path, delimiter="::", header=header, encoding='latin-1')

    return data

# Movie Path
movie_path = '/raid/nlp/tejomoy/graphML/KP-GNN/data/movielens/ml-1m/movies.dat'
movie_data = preprocess_dat_file(movie_path)


# Rating Path
user_path = '/raid/nlp/tejomoy/graphML/KP-GNN/data/movielens/ml-1m/users.dat'
user_data = preprocess_dat_file(user_path)

# Users Path
rating_path = '/raid/nlp/tejomoy/graphML/KP-GNN/data/movielens/ml-1m/ratings.dat'
rating_data = preprocess_dat_file(rating_path)


  data = pd.read_csv(file_path, delimiter="::", header=header, encoding='latin-1')
  data = pd.read_csv(file_path, delimiter="::", header=header, encoding='latin-1')
  data = pd.read_csv(file_path, delimiter="::", header=header, encoding='latin-1')


In [2]:
rating_data = rating_data.rename({0:"userId", 1:"movieId", 2:"rating", 3:"Timestamp" }, axis=1)
rating_data.to_csv("/raid/nlp/tejomoy/graphML/KP-GNN/data/movielens/ratings.csv", index=False)


In [3]:
movie_data

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
movie_data = movie_data.rename({0:"movieId", 1:"title", 2:"genres"}, axis=1)
movie_data.to_csv("/raid/nlp/tejomoy/graphML/KP-GNN/data/movielens/movies.csv", index=False)

In [32]:
import pandas as pd
import networkx as nx

def load_movielens_data(ratings_file):
    # Load the dataset
    # Replace 'ratings_file' with the path to your MovieLens ratings file
    data = pd.read_csv(ratings_file)
    return data

def create_graph(data):
    G = nx.Graph()

    # Assuming the data has 'userId', 'movieId', and 'rating' columns
    for _, row in data.iterrows():
        user = f"user_{row['Users']}"
        movie = f"movie_{row['Movies']}"

        # Add nodes
        if user not in G:
            G.add_node(user, type='user')
        if movie not in G:
            G.add_node(movie, type='movie')

        # Add an edge with rating as an attribute
        G.add_edge(user, movie, rating=row['Rating'])

    return G

# Example usage
ratings_file = '/raid/nlp/tejomoy/graphML/KP-GNN/data/movielens/final_rating_movielens.csv'
data = load_movielens_data(ratings_file)
graph = create_graph(data)


In [39]:
graph.number_of_nodes()

9746

In [40]:
graph.number_of_edges()

1000209

In [None]:
"""Certainly! To tailor the `KPGCNConv` layer for a recommendation system using the MovieLens dataset, I'll make several modifications. 
These include handling the edge attributes (ratings) and potentially simplifying or adapting certain parts of the code for the MovieLens context.

The MovieLens dataset typically includes user IDs, movie IDs, and ratings. Assuming these are the primary features used, I'll modify the `KPGCNConv` layer accordingly. Note that this example assumes a simplified context and might need further adjustments based on the exact nature of your data and the features you want to use.

Here's the complete modified code:"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops

# Assuming the combine functions and classes (like AttentionCombine, GeometricCombine) are defined elsewhere

class KPGCNConv(MessagePassing):
    def __init__(self, input_size, output_size, K, num_hop1_edge=1, num_pe=1, combine="geometric"):
        super(KPGCNConv, self).__init__(node_dim=0)
        self.aggr = "add"
        self.K = K
        self.output_size = output_size
        assert output_size % K == 0
        self.output_dk = output_size // K

        self.hop_proj = nn.Linear(input_size, output_size)
        self.hop1_edge_emb = torch.nn.Embedding(num_hop1_edge + 2, self.output_dk, padding_idx=0)

        if self.K > 1:
            self.hopk_edge_emb = torch.nn.Embedding(num_pe + 2, self.output_dk, padding_idx=0)
            self.hopk_node_path_emb = torch.nn.Embedding(num_pe, self.output_dk, padding_idx=0)
            self.combine_proj = nn.Linear(self.output_dk, output_size)
            
            if combine == "attention":
                self.combine = AttentionCombine(self.output_dk, self.K)
            elif combine == "geometric":
                self.combine = GeometricCombine(self.K, self.output_dk)
            else:
                raise ValueError("Not implemented combine function")
        else:
            self.hopk_edge_emb = None
            self.combine = torch.squeeze
            self.combine_proj = nn.Identity()

        self.reset_parameters()

    def reset_parameters(self):
        self.hop1_edge_emb.reset_parameters()
        self.hop_proj.reset_parameters()
        if self.K > 1:
            self.hopk_edge_emb.reset_parameters()
            self.hopk_node_path_emb.reset_parameters()
            self.combine.reset_parameters()
        if isinstance(self.combine_proj, nn.Linear):
            self.combine_proj.reset_parameters()

    def forward(self, x, edge_index, edge_attr, pe_attr=None, peripheral_attr=None):
        batch_num_node = x.size(0)
        edge_index, _ = add_self_loops(edge_index, num_nodes=batch_num_node)

        # Normalize or preprocess ratings if necessary
        # Example: edge_attr = normalize_ratings(edge_attr)

        self_loop_attr = torch.ones([x.size(0), self.K], dtype=torch.long, device=edge_attr.device)
        edge_attr = torch.cat([edge_attr, self_loop_attr], dim=0)

        x = self.hop_proj(x)
        x = x.view(-1, self.K, self.output_dk)

        e1_emb = self.hop1_edge_emb(edge_attr[:, :1])

        if self.K > 1:
            if pe_attr is not None:
                pe = self.hopk_node_path_emb(pe_attr)
                x[:, 1:] = x[:, 1:] + pe
            ek_emb = self.hopk_edge_emb(edge_attr[:, 1:])
            e_emb = torch.cat([e1_emb, ek_emb], dim=-2)
        else:
            e_emb = e1_emb

        row, col = edge_index
        deg = degree(col, x.size(0), edge_attr)
        deg_inv_sqrt = deg.pow(-0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        x = self.propagate(edge_index, x=x, norm=norm, edge_emb=e_emb, mask=edge_attr)

        if peripheral_attr is not None:
            x = x + peripheral_attr

        x = self.combine_proj(self.combine(x))

        return x

    def message(self, x_j, edge_emb, norm, mask):
        x_j = norm.unsqueeze(-1) * (x_j + edge_emb)
        mask = mask.unsqueeze(-1)
        return x_j.masked_fill_(mask == 0, 0.)

    def update(self, aggr_out):
        return F.relu(aggr_out)

# Example usage
# model = KPGCNConv(input_size, output_size, K, ...)
# output = model(user_movie_features, edge_index, rating_edge_attr, ...)
```

In this modified version:

- `edge_attr` would typically represent the ratings. You might need to preprocess these ratings (e.g., normalize them) depending on your model's requirements.
- If your dataset includes additional features like user demographics or movie genres (`pe_attr` or `peripheral_attr`), you should integrate these into the model.
- The `message` and `update` functions handle the propagation and aggregation of information in the graph.

Remember, this code assumes certain structures and may need adjustments based on the exact format of your MovieLens dataset and the features you are using. The `degree` function and the combine classes/functions (`AttentionCombine`, `GeometricCombine`) are also assumed to be defined elsewhere as per your original code context.