In [15]:
import pandas as pd
import networkx as nx
from os import path
import requests
import re
import os
from bs4 import BeautifulSoup
import pickle
import bz2file as bz2
from urllib.request import urlopen
from urllib.parse import quote
import statistics as stats
import pickle
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
import matplotlib.pyplot as plt
import community as community_louvain
from collections import Counter
from wordcloud import WordCloud
from fa2_modified import ForceAtlas2
import matplotlib.colors as mcolors
import math
import networkx as nx

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
full_ml_dataset = False

In [17]:
def get_base_path():
    if full_ml_dataset:
        return "../data/ml/full"
    return "../data/ml/small"

In [18]:
def get_csv(name):
    return pd.read_csv(path.join(get_base_path(), name))

In [19]:
ratings = get_csv("ratings.csv")
movies = get_csv("movies.csv")
tags = get_csv("tags.csv")
links = get_csv("links.csv")

In [20]:
G = nx.Graph()
user_nodes = ratings['userId'].unique()
movie_nodes = ratings['movieId'].unique()

G.add_nodes_from(movie_nodes, bipartite=0)
G.add_nodes_from(user_nodes, bipartite=1)
edges = ratings[['userId', 'movieId', 'rating']].values
G.add_weighted_edges_from(edges)

In [21]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 9811
Number of edges: 100403


In [22]:
forceatlas2 = ForceAtlas2(
                        # Behavior alternatives
                        outboundAttractionDistribution=False,  # Dissuade hubs
                        linLogMode=False,  # NOT IMPLEMENTED
                        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                        edgeWeightInfluence=1.0,

                        # Performance
                        jitterTolerance=1.0,  # Tolerance
                        barnesHutOptimize=True,
                        barnesHutTheta=1.2,
                        multiThreaded=False,  # NOT IMPLEMENTED

                        # Tuning
                        scalingRatio=2.0,
                        strongGravityMode=True,
                        gravity=1.0,

                        # Log
                        verbose=False)

In [23]:
partition = community_louvain.best_partition(G)

partition_counts = Counter(partition.values())
most_common_partitions = partition_counts.most_common(10)

common_partitions_ids = set(community[0] for community in most_common_partitions)

colors = list(mcolors.TABLEAU_COLORS.keys())
partitions_colors = {partition_id: colors[i % len(colors)] for i, partition_id in enumerate(common_partitions_ids)}

node_colors = []
for node in G.nodes():
    community_id = partition[node]
    if community_id in partitions_colors:
        node_colors.append(partitions_colors[community_id])
    else:
        node_colors.append('lightgray')

#positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=2000)

In [24]:
#plt.figure(figsize=(12, 12))
#nx.draw_networkx_nodes(G, positions, node_size=20, node_color=node_colors, alpha=0.8)
#nx.draw_networkx_edges(G, positions, edge_color="gray", alpha=0.05)
#plt.axis('off')
#plt.title("Network Visualization with Force Atlas Algorithm")
#plt.show()

In [25]:
print(ratings.head())
print(movies.head())
print(tags.head())
print(links.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferre