# STAT7008 Project: Recommendation System

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
!sh scripts/process.sh

Archive:  data/hetrec2011-lastfm-2k.zip
  inflating: data/last_fm/user_friends.dat  
  inflating: data/last_fm/user_taggedartists.dat  
  inflating: data/last_fm/user_taggedartists-timestamps.dat  
  inflating: data/last_fm/artists.dat  
  inflating: data/last_fm/readme.txt  
  inflating: data/last_fm/tags.dat   
  inflating: data/last_fm/user_artists.dat  
Archive:  data/ml-1m.zip
   creating: data/ml-1m/
  inflating: data/ml-1m/movies.dat   
  inflating: data/ml-1m/ratings.dat  
  inflating: data/ml-1m/README       
  inflating: data/ml-1m/users.dat    


### (1)

Typical recommendation algorithms: Content-based filtering, Item-based collaborative filtering, and User-based collaborative filtering

Data Loading:

In [3]:
datapath = "data/ml-1m/"

# Load Movies
movies = pd.read_csv(datapath + 'movies.dat', delimiter='::', engine='python', header=None, names=['MovieID', 'Title', 'Genres'], encoding='latin-1')

# Load Ratings
ratings = pd.read_csv(datapath + 'ratings.dat', delimiter='::', engine='python', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='latin-1')

# Load Users
users = pd.read_csv(datapath + 'users.dat', delimiter='::', engine='python', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='latin-1')

Content-Based Filtering: 

In [4]:
# Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each movie's genre
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['Genres'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on the cosine similarity score of movie genres
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = movies[movies['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # 10 most similar movies
    movie_indices = [i[0] for i in sim_scores]
    return movies['Title'].iloc[movie_indices]

# Example usage
recommendations = get_recommendations('Toy Story (1995)')
recommendations

1050            Aladdin and the King of Thieves (1996)
2072                          American Tail, An (1986)
2073        American Tail: Fievel Goes West, An (1991)
2285                         Rugrats Movie, The (1998)
2286                              Bug's Life, A (1998)
3045                                Toy Story 2 (1999)
3542                             Saludos Amigos (1943)
3682                                Chicken Run (2000)
3685    Adventures of Rocky and Bullwinkle, The (2000)
12                                        Balto (1995)
Name: Title, dtype: object

Item-Based Collaborative Filtering

In [5]:
# Create a pivot table with movies as rows and users as columns
movie_ratings = ratings.pivot_table(index='MovieID', columns='UserID', values='Rating').fillna(0)
item_similarity = cosine_similarity(movie_ratings)

# Function to recommend movies based on item similarity
def get_item_based_recommendation(movie_id):
    # Get movie index for similarity matrix
    idx = movies[movies['MovieID'] == movie_id].index[0]
    similar_scores = item_similarity[idx]
    similar_movies = list(movie_ratings.index[np.where(similar_scores > 0.5)])
    similar_movies.remove(movie_id)  # Remove the movie itself from the recommendation
    return movies[movies['MovieID'].isin(similar_movies)]['Title']

# Example usage
recommendations = get_item_based_recommendation(1)  # For movie with MovieID 1
recommendations[:10]

33                                   Babe (1995)
38                               Clueless (1995)
257    Star Wars: Episode IV - A New Hope (1977)
293                          Pulp Fiction (1994)
315             Shawshank Redemption, The (1994)
352                          Forrest Gump (1994)
360                        Lion King, The (1994)
453                         Fugitive, The (1993)
476                         Jurassic Park (1993)
584                               Aladdin (1992)
Name: Title, dtype: object

User-Based Collaborative Filtering:

In [6]:
# Create a pivot table with users as rows and movies as columns
user_ratings = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating').fillna(0)
user_similarity = cosine_similarity(user_ratings)

# Function to recommend movies based on user similarity
def get_user_based_recommendation(user_id):
    # Get user index for similarity matrix
    idx = users[users['UserID'] == user_id].index[0]
    similar_users = user_similarity[idx]
    similar_users_index = np.where(similar_users > 0.5)[0]
    recommended_movies = set()
    for i in similar_users_index:
        movies_rated_by_similar_user = user_ratings.columns[np.where(user_ratings.iloc[i] > 3)].tolist()
        recommended_movies.update(movies_rated_by_similar_user)
    return movies[movies['MovieID'].isin(recommended_movies)]['Title']

# Example usage
recommendations = get_user_based_recommendation(1)  # For user with UserID 1
list(recommendations)[:10]

['Toy Story (1995)',
 'Pocahontas (1995)',
 'Apollo 13 (1995)',
 'Star Wars: Episode IV - A New Hope (1977)',
 "Schindler's List (1993)",
 'Secret Garden, The (1993)',
 'Aladdin (1992)',
 'Snow White and the Seven Dwarfs (1937)',
 'Beauty and the Beast (1991)',
 'Fargo (1996)']

### (2)

Deep learning-based algorithms

Neural Collaborative:

In [7]:
class MovieLensDataset(Dataset):
    def __init__(self, ratings):
        self.users = ratings['UserID'].values
        self.items = ratings['MovieID'].values
        self.ratings = ratings['Rating'].values

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return {
            'user': self.users[idx],
            'item': self.items[idx],
            'rating': self.ratings[idx]
        }


class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size=50, hidden_layers=[100, 50], dropout=0.2):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

        layers = []
        input_size = embedding_size * 2  # User and item embeddings concatenated
        for layer_size in hidden_layers:
            layers.append(nn.Linear(input_size, layer_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_size = layer_size

        layers.append(nn.Linear(hidden_layers[-1], 1))
        self.hidden_layers = nn.Sequential(*layers)

    def forward(self, user_input, item_input):
        user_embedding = self.user_embedding(user_input)
        item_embedding = self.item_embedding(item_input)
        concatenated = torch.cat([user_embedding, item_embedding], dim=-1)
        x = self.hidden_layers(concatenated)
        return x.squeeze()

In [8]:
ratings['UserID'] = ratings['UserID'] - 1
ratings['MovieID'] = ratings['MovieID'] - 1

max_user_id = ratings['UserID'].max()
max_movie_id = ratings['MovieID'].max()
print(f'Max User ID: {max_user_id}, Max Movie ID: {max_movie_id}')

dataset = MovieLensDataset(ratings)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

model = NCF(max_user_id + 1, max_movie_id + 1)

Max User ID: 6039, Max Movie ID: 3951


In [9]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    for batch in dataloader:
        users = batch['user']
        items = batch['item']
        ratings = batch['rating'].float()

        optimizer.zero_grad()
        predictions = model(users, items).squeeze()
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

Epoch 1/5, Loss: 1.0653433799743652
Epoch 2/5, Loss: 0.6517144441604614
Epoch 3/5, Loss: 0.7861999273300171
Epoch 4/5, Loss: 1.0934501886367798
Epoch 5/5, Loss: 0.6505059003829956


GNN-based recommendation algorithms:

In [10]:
ratings = pd.read_csv(datapath + 'ratings.dat', delimiter='::', engine='python', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='latin-1')

ratings['UserID'] = ratings['UserID'] - 1
ratings['MovieID'] = ratings['MovieID'] - 1

max_user_id = ratings['UserID'].max()
max_movie_id = ratings['MovieID'].max()

def create_adjacency_matrix(ratings, num_users, num_items):
    user_item_matrix = sp.coo_matrix((ratings['Rating'], (ratings['UserID'], ratings['MovieID'] + num_users)), shape=(num_users + num_items, num_users + num_items), dtype=np.float32)
    adj_matrix = user_item_matrix + user_item_matrix.T
    adj_matrix = adj_matrix + sp.eye(adj_matrix.shape[0])
    return adj_matrix

adj_matrix = create_adjacency_matrix(ratings, max_user_id + 1, max_movie_id + 1)

In [11]:
class GCNLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, adjacency_matrix, features):
        support = self.linear(features)
        output = torch.spmm(adjacency_matrix, support)
        return output

class GCN(nn.Module):
    def __init__(self, num_users, num_items, in_features, hidden_size=64):
        super(GCN, self).__init__()
        self.user_embedding = nn.Embedding(num_users, in_features)
        self.item_embedding = nn.Embedding(num_items, in_features)
        self.gcn1 = GCNLayer(in_features, hidden_size)
        self.gcn2 = GCNLayer(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, user_indices, item_indices, adjacency_matrix):
        num_users, num_items = self.user_embedding.num_embeddings, self.item_embedding.num_embeddings
        user_embedded = self.user_embedding(user_indices)
        item_embedded = self.item_embedding(item_indices)
        features = torch.zeros(num_users + num_items, user_embedded.size(1))
        features.scatter_(0, user_indices.unsqueeze(1).repeat(1, user_embedded.size(1)), user_embedded)
        features.scatter_(0, (num_users + item_indices).unsqueeze(1).repeat(1, item_embedded.size(1)), item_embedded)

        hidden = F.relu(self.gcn1(adjacency_matrix, features))
        output = F.relu(self.gcn2(adjacency_matrix, hidden))
        user_output, item_output = output.split([num_users, num_items], 0)
        user_output = user_output[user_indices]
        item_output = item_output[item_indices]
        interaction = torch.mul(user_output, item_output)
        rating = self.fc(interaction)
        return rating.squeeze()

In [None]:
dataset = MovieLensDataset(ratings)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

embedding_size = 50
gcn_model = GCN(max_user_id + 1, max_movie_id + 1, embedding_size)

criterion = nn.MSELoss()
optimizer = optim.Adam(gcn_model.parameters(), lr=0.001)

adj_matrix_tensor = torch.FloatTensor(adj_matrix.toarray())

num_epochs = 5
for epoch in range(num_epochs):
    for batch in dataloader:
        users = batch['user']
        items = batch['item']
        ratings = batch['rating'].float()

        optimizer.zero_grad()
        predictions = gcn_model(users, items, adj_matrix_tensor).squeeze()
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

Epoch 1/5, Loss: 972168256.0


### (3)

Knowledge Graph:

In [None]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# MovieLens数据集路径
datapath = "data/ml-1m/"

# 读取电影数据
movies = pd.read_csv(datapath + 'movies.dat', delimiter='::', engine='python', header=None, names=['MovieID', 'Title', 'Genres'], encoding='latin-1')
movies = movies[:10]

# 设置SPARQL
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

# 清理电影标题（去除年份）
def clean_title(title):
    if '(' in title and ')' in title:
        return title.split('(')[0].strip()
    return title

# 获取DBpedia URI
def get_dbpedia_uri(movie_title):
    cleaned_title = clean_title(movie_title)
    query = """
    SELECT ?film WHERE {
        ?film a dbo:Film .
        ?film foaf:name '""" + cleaned_title.replace("'", r"\'") + """'@en .
    }
    LIMIT 1
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    if results["results"]["bindings"]:
        return results["results"]["bindings"][0]["film"]["value"]
    else:
        return None

# 获取电影特征
def get_movie_features(dbpedia_uri):
    if dbpedia_uri is None:
        return {}
    query = """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT ?director ?abstract ?releaseDate ?starring ?language WHERE {
      <""" + dbpedia_uri + """> dbo:director ?director .
      <""" + dbpedia_uri + """> dbo:abstract ?abstract .
      OPTIONAL { <""" + dbpedia_uri + """> dbo:releaseDate ?releaseDate . }
      OPTIONAL { <""" + dbpedia_uri + """> dbo:starring ?starring . }
      OPTIONAL { <""" + dbpedia_uri + """> dbo:language ?language . }
      FILTER (lang(?abstract) = 'en')
    }
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    features = {}
    if results["results"]["bindings"]:
        binding = results["results"]["bindings"][0]
        features = {
            "director": binding.get("director", {}).get("value"),
            "abstract": binding.get("abstract", {}).get("value"),
            "releaseDate": binding.get("releaseDate", {}).get("value"),
            "starring": [b["starring"]["value"] for b in results["results"]["bindings"] if "starring" in b],
            "language": binding.get("language", {}).get("value")
        }
    return features


# 新增列用于存储特征
movies['DBpedia_Director'] = None
movies['DBpedia_Abstract'] = None
movies['DBpedia_ReleaseDate'] = None
movies['DBpedia_Starring'] = None
movies['DBpedia_Language'] = None

# 遍历电影数据集
for index, row in movies.iterrows():
    movie_title = row['Title']
    dbpedia_uri = get_dbpedia_uri(movie_title)
    movie_features = get_movie_features(dbpedia_uri)
    movies.at[index, 'DBpedia_Director'] = movie_features.get('director')
    movies.at[index, 'DBpedia_Abstract'] = movie_features.get('abstract')
    movies.at[index, 'DBpedia_ReleaseDate'] = movie_features.get('releaseDate')
    movies.at[index, 'DBpedia_Starring'] = movie_features.get('starring')
    movies.at[index, 'DBpedia_Language'] = movie_features.get('language')

# 显示更新后的DataFrame
print(movies.head())