In [94]:
import pandas as pd

In [95]:
reviews = pd.read_csv("reviews_winter_2001_2002.csv")

In [96]:
reviews.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,Rating,Date,Movie_Id
0,100122157,1195392,1.0,2001-12-22,17672
1,100125446,1325977,3.0,2001-12-22,17672
2,100343752,1325977,4.0,2001-12-22,17726
3,100371992,752568,5.0,2001-12-22,17756
4,100374276,1427482,4.0,2001-12-22,17756


# Select Movies between thresholds

In [97]:
# only keep user movie connections where user rated movie 4 or 5 stars.
customers_like_movie = reviews.loc[reviews.loc[:,'Rating'] >= 4,['CustomerID','Movie_Id']]

In [98]:
movie_ratings_counts = customers_like_movie.groupby('Movie_Id', as_index=False).count()
movie_ratings_counts.rename(columns={'CustomerID':'number_of_ratings'},inplace=True, errors='raise')
movie_ratings_counts
moviesIDs_between_threshold = movie_ratings_counts[(movie_ratings_counts['number_of_ratings'] > 20) & (movie_ratings_counts['number_of_ratings'] < 50)]
len(moviesIDs_between_threshold) # number of movies that are between threshold

485

In [99]:
# only select user movie links of movies that are between rating thresholds
selected_customer_movie_links = customers_like_movie.loc[customers_like_movie['Movie_Id'].isin(moviesIDs_between_threshold['Movie_Id']),:]
selected_customer_movie_links

Unnamed: 0,CustomerID,Movie_Id
427,1048813,2988
562,1325977,3222
600,1630448,3342
601,2193771,3342
604,1866911,3342
...,...,...
864267,1945567,17424
864269,514107,17424
864271,378579,17424
864276,690591,17424


# Transform to movie-movie links

In [100]:
selected_customer_movie_links
selected_customer_movie_links_copy = selected_customer_movie_links.copy(deep = True)

In [101]:
linked_movies = selected_customer_movie_links.merge(selected_customer_movie_links_copy, on='CustomerID')
linked_movies = linked_movies[linked_movies['Movie_Id_x'] !=linked_movies['Movie_Id_y']]  # remove rows where movies are linked to themselves
number_of_link_movies = linked_movies.groupby(['Movie_Id_x','Movie_Id_y'], as_index=False).count()
number_of_link_movies.rename(columns={'CustomerID':'number_of_links'},inplace=True, errors='raise')

In [102]:
# Could remove remove directional links and only keep links as connection.
no_directional_links = number_of_link_movies[number_of_link_movies['Movie_Id_x'] < number_of_link_movies['Movie_Id_y']]
no_directional_links.sort_values('number_of_links')

Unnamed: 0,Movie_Id_x,Movie_Id_y,number_of_links
0,18,143,1
29764,6710,7589,1
29763,6710,7462,1
29762,6710,7355,1
29761,6710,7275,1
...,...,...,...
1219,463,12242,26
44597,9836,12242,27
54971,12242,16397,28
38884,8869,9836,29


# Create Edge and Node list

In [103]:
# Tawab en Alan Edge list
edge_list = no_directional_links

In [104]:
# Tawab en Alan Node list
moviesIDs_between_threshold
collection_movie_ids = set(no_directional_links['Movie_Id_x']) | set(no_directional_links['Movie_Id_y'])
movie_attributes = pd.read_csv('movie_titles.csv', error_bad_lines=False, encoding = 'latin1', names=['movie', 'publish_year', 'title'])
selected_movies_attributes = movie_attributes[movie_attributes['movie'].isin(collection_movie_ids)]

In [105]:
selected_movies_attributes.loc[:,'publish_year'] = selected_movies_attributes.loc[:,'publish_year'].astype(int)
selected_movies_attributes['decade'] = selected_movies_attributes.loc[:,'publish_year'].astype(str).map(lambda x: x[:-1] + '0').astype(int)
node_list = selected_movies_attributes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [106]:
edge_list.to_csv('movie_movie_connections_edge_list.csv')
node_list.to_csv('movie_movie_connections_node_list.csv')

# Example of how the logic works with small example

In [107]:
test_reviews = pd.DataFrame({'movieID':[1,1,1,2,2,3,4,4], 'userID':[10,11,12,10,13,14,10,11]})

In [108]:
copy_revies = test_reviews.copy(deep = True)

In [109]:
copy_revies

Unnamed: 0,movieID,userID
0,1,10
1,1,11
2,1,12
3,2,10
4,2,13
5,3,14
6,4,10
7,4,11


In [110]:
linked_movies = test_reviews.merge(copy_revies, on='userID')
linked_movies = linked_movies[linked_movies['movieID_x'] !=linked_movies['movieID_y']]  # remove rows where movies are linked to themselves
number_of_link_movies = linked_movies.groupby(['movieID_x','movieID_y'], as_index=False).count()
number_of_link_movies.rename(columns={'userID':'number_of_links'},inplace=True, errors='raise')

In [111]:
number_of_link_movies

Unnamed: 0,movieID_x,movieID_y,number_of_links
0,1,2,1
1,1,4,2
2,2,1,1
3,2,4,1
4,4,1,2
5,4,2,1


In [112]:
# Could remove remove directional links and only keep links as connection.
no_directional_links = number_of_link_movies[number_of_link_movies['movieID_x'] < number_of_link_movies['movieID_y']]

In [113]:
no_directional_links

Unnamed: 0,movieID_x,movieID_y,number_of_links
0,1,2,1
1,1,4,2
3,2,4,1
