In [1]:
import pandas as pd

In [2]:
# Load the raw ratings table; later cells read its CustomerID, Movie_Id and Rating columns.
ratings = pd.read_csv("data/reviews_winter_2001_2002.csv")


In [3]:
# Count the ratings submitted by each customer, ordered from least to most active.
ratings_per_userID = (
    ratings
    .groupby("CustomerID")
    .count()
    .sort_values(by="Rating")
    .loc[:, "Rating"]
)
# Optional histogram of the low end: ratings_per_userID.hist(bins=[0,5,10,15,20])
ratings_per_userID

CustomerID
921239        1
2224065       1
1014797       1
2223516       1
2222868       1
           ... 
1639792    1068
76196      1072
1146000    1351
1272379    2001
1461435    2839
Name: Rating, Length: 21062, dtype: int64

In [4]:
# How many customers remain when only those with at least 5 ratings are kept.
int(ratings_per_userID.ge(5).sum())

17726

In [5]:
# Keep only the ratings written by customers who rated at least 5 times.
# (The variable name says "more than 5", but the threshold is inclusive.)
user_id_with_more_than_5_ratings = set(
    ratings_per_userID.loc[ratings_per_userID.ge(5)].index
)
ratings_for_active_users = ratings.loc[
    ratings["CustomerID"].isin(user_id_with_more_than_5_ratings)
]


In [6]:
# Recount ratings per customer on the filtered table; this overwrites the
# earlier per-customer counts computed on the unfiltered data.
ratings_per_userID = (
    ratings_for_active_users
    .groupby("CustomerID")
    .count()
    .sort_values(by="Rating")
    .loc[:, "Rating"]
)
# Optional histogram of the low end: ratings_per_userID.hist(bins=[0,5,10,15,20])
ratings_per_userID

CustomerID
2170076       5
1298009       5
2591322       5
598250        5
597735        5
           ... 
1639792    1068
76196      1072
1146000    1351
1272379    2001
1461435    2839
Name: Rating, Length: 17726, dtype: int64

In [7]:
# Count the ratings received by each movie (active customers only),
# ordered from least to most rated.
ratings_per_movie_id = (
    ratings_for_active_users
    .groupby("Movie_Id")
    .count()
    .sort_values(by="Rating")
    .loc[:, "Rating"]
)
ratings_per_movie_id
# Disabled export of a reduced review selection, kept for reference:
#read_csv("small_test_selection_reviews_unique.csv")
#extra_small_correct_reviews_selection = ratings_for_active_users.loc[:, ['CustomerID','Rating','Date','Movie_Id']]
#extra_small_correct_reviews_selection['Rating'] = extra_small_correct_reviews_selection['Rating'].astype(int)
# extra_small_correct_reviews_selection.to_csv("small_test_selection_reviews_unique.csv", index=False)

Movie_Id
13140       1
7803        1
14908       1
16837       1
7819        1
         ... 
16377    4991
13728    5034
571      5487
14312    5496
12918    5772
Name: Rating, Length: 6369, dtype: int64

In [8]:
# Keep only the ratings of movies that received at least 5 ratings from active customers.
# (The variable name says "more than 5", but the threshold is inclusive.)
# NOTE(review): dropping movies removes ratings, so some customers may end up
# with fewer than 5 ratings again; re-check if a strict per-customer minimum is needed.
movie_id_with_more_than_5_ratings = set(
    ratings_per_movie_id.loc[ratings_per_movie_id.ge(5)].index
)
ratings_for_popular_movies = ratings_for_active_users.loc[
    ratings_for_active_users["Movie_Id"].isin(movie_id_with_more_than_5_ratings)
]

In [9]:
# Recount ratings per movie after the popularity filter; this overwrites the
# earlier per-movie counts.
ratings_per_movie_id = (
    ratings_for_popular_movies
    .groupby("Movie_Id")
    .count()
    .sort_values(by="Rating")
    .loc[:, "Rating"]
)
ratings_per_movie_id

Movie_Id
12932       5
8633        5
11462       5
8611        5
3513        5
         ... 
16377    4991
13728    5034
571      5487
14312    5496
12918    5772
Name: Rating, Length: 4476, dtype: int64

In [10]:
# NOTE(review): this cell repeats the preceding cell's computation verbatim
# and can be removed without changing any downstream result.
ratings_per_movie_id = (
    ratings_for_popular_movies
    .groupby("Movie_Id")
    .count()
    .sort_values(by="Rating")
    .loc[:, "Rating"]
)
ratings_per_movie_id

Movie_Id
12932       5
8633        5
11462       5
8611        5
3513        5
         ... 
16377    4991
13728    5034
571      5487
14312    5496
12918    5772
Name: Rating, Length: 4476, dtype: int64

In [11]:
# Creating Edge lists
def create_edge_lists(ratings):
    """Derive unweighted and weighted edge lists from a ratings table.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``Movie_Id`` and ``Rating``.
        The frame itself is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First a frame with the columns ``customer`` and ``movie``; second a
        frame with ``customer``, ``movie`` and ``weight`` (the rating).
        Both keep the row index of ``ratings``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    endpoint_names = {'CustomerID': 'customer', 'Movie_Id': 'movie'}
    weighted_names = {**endpoint_names, 'Rating': 'weight'}

    # Select the source columns first, then rename on the resulting copy so
    # the caller's frame is left untouched.
    plain_edges = ratings.loc[:, list(endpoint_names)].rename(
        columns=endpoint_names, errors='raise'
    )
    weighted_edges = ratings.loc[:, list(weighted_names)].rename(
        columns=weighted_names, errors='raise'
    )
    return plain_edges, weighted_edges

# Build both edge lists from the fully filtered ratings and write each one
# to a CSV file without the row index.
complete_edge_list, complete_edge_list_with_ratings = create_edge_lists(
    ratings=ratings_for_popular_movies
)
complete_edge_list.to_csv(
    path_or_buf='complete_edge_list_without_rating_weights.csv', index=False
)
complete_edge_list_with_ratings.to_csv(
    path_or_buf='complete_edge_list_with_rating_weights.csv', index=False
)


In [12]:
# Creating Node list
def create_node_list(edge_list, movie_info_df):
    """Build node tables for the customer/movie graph described by ``edge_list``.

    Parameters
    ----------
    edge_list : pandas.DataFrame
        Must contain the columns ``customer`` and ``movie``; any other
        columns are ignored.
    movie_info_df : pandas.DataFrame
        Movie attributes. Must contain a ``movie`` column, which is matched
        against the movie ids found in ``edge_list``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``node_list`` has the columns ``id`` and ``type`` (``'customer'`` or
        ``'movie'``), customers first. ``node_list_attributes`` has the same
        rows, with the columns of ``movie_info_df`` plus the ``_merge``
        indicator column joined onto the movie rows; those columns are
        missing (NaN) on the customer rows.

    Notes
    -----
    The row index is not reset, so customer rows and movie rows each start
    at 0. Ids are not made unique across the two node types; a node is
    identified by the pair ``(id, type)``.
    """
    customer_nodes = pd.DataFrame({'id': edge_list["customer"].unique()})
    customer_nodes["type"] = 'customer'

    movie_nodes = pd.DataFrame({'id': edge_list["movie"].unique()})
    movie_nodes["type"] = 'movie'

    # Left join keeps every movie node even when no attributes are known;
    # indicator=True records the match status in the `_merge` column.
    movie_nodes_with_attributes = movie_nodes.merge(
        movie_info_df, left_on='id', right_on='movie', how='left', indicator=True
    )

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    node_list = pd.concat([customer_nodes, movie_nodes])
    node_list_attributes = pd.concat([customer_nodes, movie_nodes_with_attributes])
    return node_list, node_list_attributes


# Load the movie metadata. Column names are supplied here, so the file is
# read without a header row.
# on_bad_lines='warn' replaces error_bad_lines=False (removed in pandas 2.0)
# and keeps the same behaviour: malformed rows are skipped with a warning.
# NOTE(review): skipped rows are presumably titles containing commas - verify,
# since those movies end up without attributes in the node list.
movie_info_df = pd.read_csv(
    "../movie_titles.csv",
    names=['movie', 'publish_year', 'title'],
    encoding='latin1',
    on_bad_lines='warn',
)
node_list, node_list_attributes = create_node_list(edge_list=complete_edge_list_with_ratings, movie_info_df=movie_info_df)


In [13]:
#node_list_attributes.to_csv('small_node_list_with_attributes.csv')

In [14]:
# Preview the combined node list (customer rows followed by movie rows).
node_list

Unnamed: 0,id,type
0,1195392,customer
1,1325977,customer
2,752568,customer
3,1427482,customer
4,1144950,customer
...,...,...
4471,12002,movie
4472,13232,movie
4473,15272,movie
4474,17574,movie
