In [None]:
import pandas as pd
import networkx as nx
from os import path
import requests
import re
import os
from bs4 import BeautifulSoup
import pickle
import bz2file as bz2
from urllib.request import urlopen
from urllib.parse import quote
import statistics as stats
import pickle
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
import matplotlib.pyplot as plt
import community as community_louvain
from collections import Counter
from wordcloud import WordCloud
from fa2_modified import ForceAtlas2
import matplotlib.colors as mcolors
import math
import networkx as nx

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
full_ml_dataset = False

In [26]:
def get_base_path():
    if full_ml_dataset:
        return "../data/ml/full"
    return "../data/ml/small"

In [27]:
def save(title, data):
    with bz2.BZ2File(f"{path.join(get_base_path(), title)}.pbz2", "w") as f:
        pickle.dump(data, f)

In [28]:
def load(title):
    with bz2.BZ2File(f"{path.join(get_base_path(), title)}.pbz2", "r") as f:
        return pickle.load(f)

In [29]:
def get_csv(name):
    return pd.read_csv(path.join(get_base_path(), name))

In [30]:
ratings = get_csv("ratings.csv")
movies = get_csv("movies.csv")
tags = get_csv("tags.csv")
links = get_csv("links.csv")

In [31]:
G = nx.Graph()
user_nodes = ratings['userId'].unique()
ratings['rating'] = ratings['rating']/ratings['rating'].max()
movie_nodes = ratings['movieId'].unique()

print(ratings.head())


G.add_nodes_from(movie_nodes, bipartite=0)
G.add_nodes_from(user_nodes, bipartite=1)
edges = ratings[['userId', 'movieId', 'rating']].values
G.add_weighted_edges_from(edges)

   userId  movieId  rating  timestamp
0       1        1     0.8  964982703
1       1        3     0.8  964981247
2       1        6     0.8  964982224
3       1       47     1.0  964983815
4       1       50     1.0  964982931


In [32]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 9811
Number of edges: 100403


In [33]:
forceatlas2 = ForceAtlas2(
                        # Behavior alternatives
                        outboundAttractionDistribution=False,  # Dissuade hubs
                        linLogMode=False,  # NOT IMPLEMENTED
                        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                        edgeWeightInfluence=1.0,

                        # Performance
                        jitterTolerance=1.0,  # Tolerance
                        barnesHutOptimize=True,
                        barnesHutTheta=1.2,
                        multiThreaded=False,  # NOT IMPLEMENTED

                        # Tuning
                        scalingRatio=2.0,
                        strongGravityMode=True,
                        gravity=1.0,

                        # Log
                        verbose=False)

In [47]:
partition = community_louvain.best_partition(G)

partition_counts = Counter(partition.values())
most_common_partitions = partition_counts.most_common(10)

common_partitions_ids = set(community[0] for community in most_common_partitions)

colors = list(mcolors.TABLEAU_COLORS.keys())
partitions_colors = {partition_id: colors[i % len(colors)] for i, partition_id in enumerate(common_partitions_ids)}

node_colors = []
for node in G.nodes():
    community_id = partition[node]
    if community_id in partitions_colors:
        node_colors.append(partitions_colors[community_id])
    else:
        node_colors.append('lightgray')

positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=500)

In [52]:
modularity = community_louvain.modularity(partition, G)
print(f"Louvain Modularity: {modularity}")

Louvain Modularity: 0.25255369018571705


In [None]:
plt.figure(figsize=(12, 12))
nx.draw_networkx_nodes(G, positions, node_size=20, node_color=node_colors, alpha=0.8)
nx.draw_networkx_edges(G, positions, edge_color="gray", alpha=0.05)
plt.axis('off')
plt.title("Network Visualization with Force Atlas Algorithm")
plt.show()

In [None]:
movies['genre_list'] = movies['genres'].apply(lambda x: x.split('|'))

movie_genres = movies[['movieId', 'genre_list']].set_index('movieId').to_dict()['genre_list']

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']


In [51]:
movies['primary_genre'] = movies['genre_list'].apply(lambda x: x[0]) 

movie_to_genre = movies[['movieId', 'primary_genre']].set_index('movieId').to_dict()['primary_genre']

print(movie_to_genre)

{1: 'Adventure', 2: 'Adventure', 3: 'Comedy', 4: 'Comedy', 5: 'Comedy', 6: 'Action', 7: 'Comedy', 8: 'Adventure', 9: 'Action', 10: 'Action', 11: 'Comedy', 12: 'Comedy', 13: 'Adventure', 14: 'Drama', 15: 'Action', 16: 'Crime', 17: 'Drama', 18: 'Comedy', 19: 'Comedy', 20: 'Action', 21: 'Comedy', 22: 'Crime', 23: 'Action', 24: 'Drama', 25: 'Drama', 26: 'Drama', 27: 'Children', 28: 'Drama', 29: 'Adventure', 30: 'Crime', 31: 'Drama', 32: 'Mystery', 34: 'Children', 36: 'Crime', 38: 'Children', 39: 'Comedy', 40: 'Drama', 41: 'Drama', 42: 'Action', 43: 'Drama', 44: 'Action', 45: 'Comedy', 46: 'Drama', 47: 'Mystery', 48: 'Animation', 49: 'Drama', 50: 'Crime', 52: 'Comedy', 53: 'Adventure', 54: 'Children', 55: 'Drama', 57: 'Drama', 58: 'Comedy', 60: 'Adventure', 61: 'Drama', 62: 'Drama', 63: 'Comedy', 64: 'Comedy', 65: 'Comedy', 66: 'Action', 68: 'Comedy', 69: 'Comedy', 70: 'Action', 71: 'Action', 72: 'Comedy', 73: 'Drama', 74: 'Drama', 75: 'Comedy', 76: 'Action', 77: 'Documentary', 78: 'Action'

In [54]:
for node in G.nodes():
    if node in movie_to_genre:
        G.nodes[node]['community'] = movie_to_genre[node]
    else:
        G.nodes[node]['community'] = None 


In [None]:
genre_partition = {}
for node, data in G.nodes(data=True):
    community = data.get('community')
    if community:
        genre_partition.setdefault(community, []).append(node)

genre_communities = [set(nodes) for nodes in genre_partition.values()]

[{1, 2, 116738, 2051, 2050, 2054, 8, 2056, 2057, 13, 77841, 45074, 6162, 47124, 26649, 6170, 6169, 96281, 29, 2077, 2085, 26662, 2043, 2088, 2089, 2090, 65577, 2092, 2093, 88108, 4135, 135216, 2099, 63540, 53, 2103, 60, 110655, 71745, 2116, 30793, 73804, 2133, 116823, 2135, 6232, 30810, 2139, 2140, 2141, 2142, 2143, 6239, 2138, 2147, 101, 2150, 107, 59501, 2161, 2162, 65651, 51314, 139385, 49274, 126, 26750, 155774, 73854, 2183, 146, 106642, 8341, 150, 4247, 26776, 6297, 45208, 155, 65685, 158, 127134, 57502, 8360, 169, 6316, 121007, 8368, 92348, 35015, 106696, 4306, 212, 69844, 4312, 2265, 53466, 26849, 4323, 231, 4327, 6377, 33004, 238, 49396, 2294, 2297, 82169, 258, 100611, 8450, 6405, 176389, 121097, 102666, 4366, 4370, 26901, 47384, 43289, 86298, 71970, 166183, 47404, 2355, 166203, 76093, 2367, 102720, 2368, 329, 104780, 8526, 2384, 340, 8534, 4445, 4446, 2399, 362, 364, 63853, 368, 33138, 119155, 4467, 45431, 51575, 156025, 26999, 115065, 4477, 2430, 6527, 8580, 6536, 98697, 8591

In [59]:
M = 0
L = G.number_of_edges()

for nodes in genre_communities:
    Lc = G.subgraph(nodes).number_of_edges()
    Kc = sum([x for _, x in G.subgraph(nodes).degree(nodes)])
    M += ((Lc/L)-((Kc/(2*L))**2))

print(f"Genre community modularity: {M}")

Genre community modularity: 0.15997045863914677


In [36]:
def get_user_rating_for_movie(user_id, movie_id):
    return ratings[(ratings['userId'] == user_id) & (ratings['movieId'] == movie_id)]['rating'].values[0]

In [37]:
def get_user_movies(user_id):
    user_movies = ratings[ratings['userId'] == user_id]['movieId'].values
    return user_movies

In [38]:
def get_movie_title_from_id(movie_id):
    return movies[movies['movieId'] == movie_id]['title'].values[0]

In [39]:
def get_movies_for_genre(genre):
    genre_movies = movies[movies['genres'].str.contains(genre, case=False)]['movieId'].values
    return genre_movies

In [40]:
def get_user_movies_with_genre(user_id, genre):
    user_movies = get_user_movies(user_id)
    genre_movies = get_movies_for_genre(genre)
    return set(user_movies) & set(genre_movies)

In [41]:
def get_genres_for_movie(movie_id):
    return movies[movies['movieId'] == movie_id]['genres'].values[0].split("|")

In [42]:
def create_user_id_genre_movies_dict(G):
    dict = {}
    for node in G.nodes():
        user_id = node
        genre_movies = {}
        for movie_id in get_user_movies(user_id):
            genres = get_genres_for_movie(movie_id)
            for genre in genres:
                if genre not in genre_movies:
                    genre_movies[genre] = {}
                genre_movies[genre][movie_id] = (get_movie_title_from_id(movie_id), get_user_rating_for_movie(user_id, movie_id))
        dict[user_id] = genre_movies
    return dict

In [46]:
# user_id : { genre : { movie_id : (movie_title, rating) } }
user_genre_movies_name = "user_genre_movies"
if not path.exists(f"{path.join(get_base_path(), f'{user_genre_movies_name}.pbz2')}"):
    user_genre_movies = create_user_id_genre_movies_dict(G)
    save("user_genre_movies", user_genre_movies)
else:
    user_genre_movies = load(user_genre_movies_name)