# What Makes Movies Good?
By: Trevor Mitchell and Matthew Bouch

## Initial Data Exploration

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# For when we want our notebook to look nice
import warnings
warnings.filterwarnings('ignore')

In [3]:
ratings = pd.read_csv("ml-25m/ratings.csv")
movies = pd.read_csv("ml-25m/movies.csv")

We will start simple by observing the most highly rated movies

In [4]:
movie_ratings = ratings.groupby(by="movieId").agg(["mean", "count"])
movie_ratings.columns = movie_ratings.columns.get_level_values(1) + '_' + movie_ratings.columns.get_level_values(0)
movie_ratings.reset_index(inplace=True)
movie_ratings = movie_ratings[["movieId", "mean_rating", "count_rating"]].rename(columns = {
    "mean_rating": "rating",
    "count_rating": "count"
})

movie_ratings = pd.merge(movie_ratings, movies, on="movieId")

movie_ratings.sort_values(by=["rating", "count"], ascending=False, inplace=True)
movie_ratings.head(10)

Unnamed: 0,movieId,rating,count,title,genres
23235,118268,5.0,3,Borrowed Time (2012),Drama
33976,148298,5.0,3,Awaken (2013),Drama|Romance|Sci-Fi
40729,165787,5.0,3,Lonesome Dove Church (2014),Western
46900,179731,5.0,3,Sound of Christmas (2016),Drama
28113,133297,5.0,2,Genius on Hold (2013),(no genres listed)
29959,137853,5.0,2,El camino (2008),Drama
30557,139547,5.0,2,Placebo: Soulmates Never Die: Live in Paris 20...,(no genres listed)
30895,140369,5.0,2,War Arrow (1954),Adventure|Drama|Romance|War|Western
30899,140377,5.0,2,About Sarah,Drama
32154,143422,5.0,2,2 (2007),Drama


As we can see above, simply using average rating as the metric for research will not provide an accurate picture. This would imply that Borrowed Time (2012), reviewed by a sum total of 3 people in this dataset, is the greatest movie of all time.

However, with some extra research, we found that it has a score of 5.9/10 on IMdB out of 371 ratings, and 3.2/5 on Letterboxd out of 59 ratings.

In [5]:
movie_ratings.sort_values(by=["count", "rating"], ascending=False, inplace=True)
movie_ratings.head(10)

Unnamed: 0,movieId,rating,count,title,genres
351,356,4.048011,81491,Forrest Gump (1994),Comedy|Drama|Romance|War
314,318,4.413576,81482,"Shawshank Redemption, The (1994)",Crime|Drama
292,296,4.188912,79672,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
585,593,4.151342,74127,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
2480,2571,4.154099,72674,"Matrix, The (1999)",Action|Sci-Fi|Thriller
257,260,4.120189,68717,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
475,480,3.679175,64144,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
522,527,4.247579,60411,Schindler's List (1993),Drama|War
108,110,4.002273,59184,Braveheart (1995),Action|Drama|War
2867,2959,4.228311,58773,Fight Club (1999),Action|Crime|Drama|Thriller


Ranking films primarily based on the metric of popularity (How many reviews it has received) seems to provide a more reliable outcome.

Let's now examine the relationship between popularity and rating

In [6]:
px.scatter(movie_ratings, x="count", y="rating", opacity=.5)

We now have our first question at hand: What number of reviews is required for an average rating to accurately reflect the quality of the movie, devoid of personal biases such as a friend as the director?

In [7]:
movie_ratings_filtered = movie_ratings[movie_ratings["count"] >= 100]
movie_ratings_filtered.sort_values("rating", ascending=False).head(10)

Unnamed: 0,movieId,rating,count,title,genres
42953,171011,4.483096,1124,Planet Earth II (2016),Documentary
38361,159817,4.464797,1747,Planet Earth (2006),Documentary
314,318,4.413576,81482,"Shawshank Redemption, The (1994)",Crime|Drama
42806,170705,4.398599,1356,Band of Brothers (2001),Action|Drama|War
43172,171495,4.326715,277,Cosmos,(no genres listed)
840,858,4.324336,52498,"Godfather, The (1972)",Crime|Drama
46627,179135,4.289833,659,Blue Planet II (2017),Documentary
49,50,4.284353,55366,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
54972,198185,4.267361,288,Twin Peaks (1989),Drama|Mystery
1190,1221,4.261759,34188,"Godfather: Part II, The (1974)",Crime|Drama


Interestingly enough, Many of the less popular movies that still rated highly are documentaries.

Using the README.txt provided, we manually created a "genres.txt" file that contains a list of genres, where each genre is on a separate line.

In [8]:
with open("ml-25m/genres.txt") as file:
    genres = [line.rstrip("\n") for line in file]
print(genres)

['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [9]:
genre_avgs = []
genre_counts = []
#! Test
movies_count = []
for genre in genres:
    movies_with_genre = movie_ratings_filtered[movie_ratings_filtered["genres"].str.contains(genre)]
    score = movies_with_genre.mean(skipna=True)
    #! Test
    movies_count.append(movies_with_genre["count"].count())
    reviews_count = movies_with_genre["count"].sum()
    genre_avgs.append(score["rating"])
    genre_counts.append(reviews_count)

genre_scores = pd.DataFrame(
    list(zip(genres, genre_avgs, genre_counts)), 
    columns=["genre", "avg_rating", "total_ratings"]
).dropna()
genre_scores.sort_values("avg_rating", inplace=True, ascending=False)

movie_totals = pd.DataFrame(
    list(zip(genres, movies_count, genre_avgs, genre_counts)),
    columns=["genre", "total_movies", "avg_rating", "total_ratings"]
).dropna()
movie_totals.sort_values("avg_rating", inplace=True, ascending=False)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Create the overlapping bar chart

In [10]:
from plotly.subplots import make_subplots
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Bar(y=genre_scores["avg_rating"], x=genre_scores["genre"],
                    name="Average Rating", yaxis='y'))

fig.add_trace(go.Bar(y=genre_scores["total_ratings"], x=genre_scores["genre"],
                    name="Total Ratings", yaxis='y2',
                    opacity=.5),
                secondary_y=True)

# Add figure title
fig.update_layout(title_text="Film Performance by Genre")

# Set x-axis title
fig.update_xaxes(title_text="Genre")

# Set y-axes titles
fig.update_yaxes(
    title_text="Average Rating", 
    secondary_y=False)
fig.update_yaxes(
    title_text="Count of Reviews", 
    secondary_y=True)

Lets now plot the amount of movies for each genre to see if there is a bias to any genre because of sheer amount of films.

In [11]:
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Bar(y=movie_totals["avg_rating"], x=movie_totals["genre"],
                    name="Average Rating", yaxis='y'))

fig.add_trace(go.Bar(y=movie_totals["total_movies"], x=movie_totals["genre"],
                    name="Film Count by Genre", yaxis='y2', opacity=.5),
                    secondary_y=True)

# Add figure title
fig.update_layout(title_text="Film Rating and Count by Genre")

# Set x-axis title
fig.update_xaxes(title_text="Genre")

# Set y-axes titles
fig.update_yaxes(
    title_text="Movie Count", 
    secondary_y=True)
fig.update_yaxes(
    title_text="Average Rating",
    secondary_y=False)

So the above graph shows the average ratings of films along with how many are made in that genre.\
Here we can see that movie genres with less films tend to be slightly more performing that genres with lots of films.

In [18]:
movie_totals["ratings_per_movie"] = movie_totals["total_ratings"] / movie_totals["total_movies"]
movie_totals

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Bar(y=movie_totals["avg_rating"], x=movie_totals["genre"],
                    name="Average Rating", yaxis='y'))

fig.add_trace(go.Bar(y=movie_totals["ratings_per_movie"], x=movie_totals["genre"],
                    name="Ratings Per Movie", yaxis='y2', opacity=.5),
                    secondary_y=True)

# Add figure title
fig.update_layout(title_text="Film Rating and Count by Genre")

# Set x-axis title
fig.update_xaxes(title_text="Genre")

# Set y-axes titles
fig.update_yaxes(
    title_text="Ratings Per Movie", 
    secondary_y=True)
fig.update_yaxes(
    title_text="Average Rating",
    secondary_y=False)

Now that we have seen some ideas on what influences a movies rating now lets see how well our audience does at rating films

In [13]:
# make a dictionary of the userIds and their ratings
user_ratings = ratings.groupby(by="userId").agg(["mean", "count"])

# print(user_ratings["rating"].head(5))

rating_mean_list = []

# loop through the user_ratings dataframe and for users with more then 20 reviews add their average rating to the list
for index, row in user_ratings.iterrows():
    # print(row["rating"]["mean"])
    if row["rating"]["count"] > 20:
        rating_mean_list.append(row["rating"]["mean"])

user_mean_avg = sum(rating_mean_list) / len(rating_mean_list)
print("The average user review is: " + str(user_mean_avg))

The average user review is: 3.6845912164039505


So we have now found the average user review to be *3.68* which is above the expected average of *3* so users may be all biased high on how they review movies.

This is likely because users are more likely to select movies that seem appealing to them, and therefore have a higher average rating than if they selected entirely randomly.

## User Review Spread over Time

First we must select the user with the most reviews

In [14]:
user_review_counts = ratings.groupby("userId").size()
most_active_user = user_review_counts.idxmax()
num_reviews = user_review_counts.loc[most_active_user]
print("User_id: {}   number_of_reviews: {}".format(most_active_user, num_reviews))


User_id: 72315   number_of_reviews: 32202


In [15]:
selected_user_reviews = ratings[ratings["userId"] == most_active_user]
selected_user_reviews = selected_user_reviews.sort_values("timestamp")
selected_user_reviews

Unnamed: 0,userId,movieId,rating,timestamp
11131331,72315,86057,4.5,1450162767
11135447,72315,116897,4.0,1450162767
11134148,72315,106782,4.0,1450162767
11127970,72315,52170,4.0,1450162768
11129917,72315,73881,4.0,1450162768
...,...,...,...,...
11145132,72315,173075,0.5,1570529429
11145138,72315,173095,1.0,1570529429
11145142,72315,173125,1.0,1570529429
11142388,72315,159287,2.0,1570529429


The above data shows that this user must be a bot, because they are rating multiple movies over the span of any given second

In [16]:
selected_user_reviews = ratings[ratings["userId"] == 3] # 3 manually selected because it had enough datapoints
selected_user_reviews = selected_user_reviews.sort_values("timestamp")
selected_user_reviews

Unnamed: 0,userId,movieId,rating,timestamp
266,3,356,4.0,1439472199
272,3,593,4.0,1439472203
298,3,1270,3.5,1439472211
254,3,1,4.0,1439472215
268,3,480,2.0,1439472219
...,...,...,...,...
758,3,88125,3.5,1566091623
819,3,106489,3.5,1566091626
734,3,81834,3.5,1566091630
380,3,4262,4.5,1566091872


In [17]:
#animated_hist_df = pd.DataFrame(columns=["timestamp", ".5", "1", "1.5", "2", "2.5", "3", "3.5", "4", "4.5", "5"])
df_arr = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
i = 1
for index, row in selected_user_reviews.iterrows():
    df_arr.append([row["timestamp"]])
    for j in range (1,11):
        if row["rating"] == j * .5: df_arr[i].append(df_arr[i-1][j]+1)
        else:                       df_arr[i].append(df_arr[i-1][j])
    i += 1

animated_hist_df = pd.DataFrame(df_arr, columns=["timestamp", ".5", "1", "1.5", "2", "2.5", "3", "3.5", "4", "4.5", "5"])
animated_hist_df = animated_hist_df.melt(id_vars=["timestamp"], var_name="rating", value_name="cumulative_count")

fig = px.bar(animated_hist_df, x="rating", y="cumulative_count", animation_frame="timestamp", range_y=[0, 300])
fig.show()

Show Rating Activity over time for Forrest Gump

In [25]:
import datetime

forrest_gump = ratings[ratings["movieId"] == 356]
forrest_gump.sort_values("timestamp", inplace=True)

forrest_gump["Date"] = forrest_gump.apply(
    lambda d:
    datetime.datetime.fromtimestamp(d["timestamp"]).date(),
    axis=1
)

forrest_gump = pd.DataFrame(forrest_gump.groupby('Date').size())
forrest_gump.reset_index(inplace=True)
forrest_gump = forrest_gump.rename(columns={0: "count"})

forrest_gump["cumulative"] = forrest_gump["count"].cumsum()

px.line(forrest_gump, x="Date", y="cumulative", title="Forrest Gump Activity",
    labels = {"cumulative": "Total Ratings"}
)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



![](MovieTagsWordCloud.png)