In [152]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import pandas as pd

In [153]:



def _plot_rating_count_histogram(reviews_df, group_col, entity_label,
                                 max_count=199, bin_size=2):
    """Render a histogram of the number of ratings per group.

    Shared implementation behind ``plot_most_reviewed_movies`` and
    ``plot_most_reviews_user``; the two plots differ only in the column that
    is grouped on and in the label used in the title and x-axis.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain a ``'Rating'`` column and the column named by
        ``group_col``.
    group_col : str
        Column to group by (one histogram sample per distinct value).
    entity_label : str
        Human-readable name of the grouped entity, inserted into the plot
        title and the x-axis title (e.g. ``'Movie'``).
    max_count : int, default 199
        Counts above this value are clipped to it, so the long tail is
        collected in the last bin instead of stretching the x-axis.
    bin_size : int, default 2
        Width of each histogram bin.

    Returns
    -------
    None
        The figure is rendered inline through ``iplot``.

    Raises
    ------
    KeyError
        If ``group_col`` or ``'Rating'`` is missing from ``reviews_df``.
    """
    # count() only tallies non-null ratings for each group.
    ratings_per_group = reviews_df.groupby(group_col)['Rating'].count()
    clipped_counts = ratings_per_group.clip(upper=max_count)

    # The bin range ends one past the clip value so that the clipped maximum
    # still falls inside the last bin (199 -> end of 200 with the defaults).
    trace = go.Histogram(x=clipped_counts.values,
                         name='Ratings',
                         xbins=dict(start=0,
                                    end=max_count + 1,
                                    size=bin_size),
                         marker=dict(color='#db0000'))

    layout = go.Layout(title='Distribution Of Ratings Per {} (Clipped at {})'.format(entity_label, max_count),
                       xaxis=dict(title='Ratings Per {}'.format(entity_label)),
                       yaxis=dict(title='Count'),
                       bargap=0.2)

    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)


def plot_most_reviewed_movies(reviews_df, max_count=199):
    """Plot the distribution of the number of ratings each movie received.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Review table with at least the columns ``'Movie_Id'`` and ``'Rating'``.
    max_count : int, default 199
        Per-movie counts above this value are clipped to it before plotting.

    Returns
    -------
    None
        The histogram is rendered inline.
    """
    ##### Ratings Per Movie #####
    _plot_rating_count_histogram(reviews_df, 'Movie_Id', 'Movie',
                                 max_count=max_count)


def plot_most_reviews_user(reviews_df, max_count=199):
    """Plot the distribution of the number of ratings each user submitted.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Review table with at least the columns ``'CustomerID'`` and
        ``'Rating'``.
    max_count : int, default 199
        Per-user counts above this value are clipped to it before plotting.

    Returns
    -------
    None
        The histogram is rendered inline.
    """
    ##### Ratings Per User #####
    _plot_rating_count_histogram(reviews_df, 'CustomerID', 'User',
                                 max_count=max_count)






In [154]:

# Load the review table; the path is relative, so the CSV must sit in the
# notebook's working directory.
reviews_df = pd.read_csv('reviews_winter_2001_2002.csv') 
# Preview the first rows to check which columns were parsed.
print(reviews_df.head())
# Show how many ratings each movie received and how many each user submitted.
plot_most_reviewed_movies(reviews_df)
plot_most_reviews_user(reviews_df)

   Unnamed: 0  CustomerID  Rating        Date  Movie_Id
0   100122157     1195392     1.0  2001-12-22     17672
1   100125446     1325977     3.0  2001-12-22     17672
2   100343752     1325977     4.0  2001-12-22     17726
3   100371992      752568     5.0  2001-12-22     17756
4   100374276     1427482     4.0  2001-12-22     17756


In [155]:
# Step 1 (this cell): keep only movies with at least 5 and fewer than 20 ratings.
# Step 2 (next cell): keep only users with at least 5 and fewer than 15 ratings
# among the reviews that survive step 1.

# Number of non-null ratings per movie over the full review table.
review_count_per_movie = reviews_df.groupby('Movie_Id')['Rating'].count() 
# Ids of the movies whose rating count lies in the window [5, 20).
movies_with_right_amount_reviews = tuple(review_count_per_movie[(review_count_per_movie < 20) & (review_count_per_movie >= 5)].index)
# Reviews that belong to one of the selected movies (all columns kept).
reviews_df_filtered_for_movies = reviews_df.loc[reviews_df['Movie_Id'].isin(movies_with_right_amount_reviews),:]


In [156]:
# Number of non-null ratings per user, counted on the movie-filtered reviews
# only, so the user thresholds apply to what is left after the movie filter.
review_count_per_user = reviews_df_filtered_for_movies.groupby('CustomerID')['Rating'].count() 
# Ids of the users whose rating count lies in the window [5, 15).
users_with_right_amount_reviews = tuple(review_count_per_user[(review_count_per_user < 15) & (review_count_per_user >= 5)].index)
# Reviews that pass both filters. Per-movie counts are NOT re-checked after
# dropping users, so a movie in this frame can have fewer than 5 ratings left.
reviews_df_filtered_for_movie_user = reviews_df_filtered_for_movies.loc[reviews_df_filtered_for_movies['CustomerID'].isin(users_with_right_amount_reviews),:]

In [157]:
# Number of reviews left after both the movie filter and the user filter.
len(reviews_df_filtered_for_movie_user)

4561

In [158]:
# Number of reviews left after the movie filter only (before the user filter).
len(reviews_df_filtered_for_movies)

20471

In [159]:
# Number of distinct movies in the final selection; dropna=False counts a
# missing id as its own value, matching len(unique()).
reviews_df_filtered_for_movie_user['Movie_Id'].nunique(dropna=False)

1738

In [160]:
# Number of distinct users in the final selection; dropna=False counts a
# missing id as its own value, matching len(unique()).
reviews_df_filtered_for_movie_user['CustomerID'].nunique(dropna=False)

640

In [161]:
#reviews_df_filtered_for_movie_user.to_csv("small_test_selection_reviews.csv")

In [162]:
# Ratings per user in the final selection, in ascending order. Only the
# Rating column is counted instead of counting every column and then
# discarding all but one.
reviews_df_filtered_for_movie_user.groupby("CustomerID")['Rating'].count().sort_values()

CustomerID
3458        5
1236530     5
1229664     5
1212585     5
1201695     5
           ..
2587506    14
1316286    14
1227649    14
1573228    14
1729139    14
Name: Rating, Length: 640, dtype: int64

In [163]:
# Ratings per user after the movie filter only, in ascending order; this is
# the series the user thresholds were applied to.
review_count_per_user.sort_values()

CustomerID
684          1
2008457      1
1032389      1
1032402      1
1032648      1
          ... 
1146000    186
1677588    198
1272379    342
76196      480
1461435    759
Name: Rating, Length: 7073, dtype: int64

In [164]:
# Try a looser user window (at least 4 and fewer than 100 ratings) for
# comparison; the result is only displayed, not stored.
review_count_per_user.loc[lambda counts: (counts >= 4) & (counts < 100)].sort_values()

CustomerID
3402        4
1363756     4
1352981     4
1347033     4
1341267     4
           ..
1028541    62
1039555    78
57633      79
542323     84
1267805    87
Name: Rating, Length: 1097, dtype: int64