In [None]:
import pandas as pd

# Load the MovieLens 20M ratings and the movie catalogue.
ratings = pd.read_csv("row_data/MovieLens 20M Dataset/rating.csv")
movies = pd.read_csv("row_data/MovieLens 20M Dataset/movie.csv")

# Attach the movie columns to every rating row.
# validate="many_to_one" makes pandas raise MergeError if `movieId` is not
# unique in `movies`; without it a duplicated catalogue row would silently
# duplicate rating rows and skew every count/mean computed from this frame.
ratings_movies = pd.merge(
    ratings,
    movies,
    on="movieId",
    how="inner",
    validate="many_to_one",
)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
metadata = pd.read_csv("row_data/The Movies Dataset/movies_metadata.csv", low_memory=False)

In [None]:
# Print a summary of the merged frame: row count, columns, dtypes and memory usage.
ratings_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
 4   title      object 
 5   genres     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 915.5+ MB


In [None]:
# 1. Most-rated movies: count the ratings per movie and show the top 10.
#
# Note on reset_index(): leaving it out does not raise an error, but the
# result of size() is then a Series rather than a DataFrame.
# reset_index(name=...) converts it to a DataFrame and names the count column.
(
    ratings_movies
    .groupby(["movieId", "title"])
    .size()
    .reset_index(name="rating_count")
    .sort_values(by="rating_count", ascending=False)
    .head(10)
)

Unnamed: 0,movieId,title,rating_count
293,296,Pulp Fiction (1994),67310
352,356,Forrest Gump (1994),66172
315,318,"Shawshank Redemption, The (1994)",63366
587,593,"Silence of the Lambs, The (1991)",63299
476,480,Jurassic Park (1993),59715
257,260,Star Wars: Episode IV - A New Hope (1977),54502
108,110,Braveheart (1995),53769
583,589,Terminator 2: Judgment Day (1991),52244
2486,2571,"Matrix, The (1999)",51334
523,527,Schindler's List (1993),50054


In [None]:
# 2. Which genre occurs most often?
#
# `genres` holds several genres joined by "|". Split it into a list and
# explode so that every (rating row, genre) pair gets its own row.
# assign() returns a new frame, so `ratings_movies` itself is not modified.
df_genre = (
    ratings_movies
    .assign(genre=ratings_movies["genres"].str.split("|"))
    .explode("genre")
)

# Rows per genre, most frequent first. Because df_genre is built from the
# ratings table, each genre is counted once per rating row it appears in.
genre_count_df = (
    df_genre
    .groupby("genre")
    .size()
    .reset_index(name="genre_count")
    .sort_values(by="genre_count", ascending=False)
)
genre_count_df

Unnamed: 0,genre,genre_count
8,Drama,8857853
5,Comedy,7502234
1,Action,5614208
17,Thriller,5313506
2,Adventure,4380351
15,Romance,3802002
6,Crime,3298335
16,Sci-Fi,3150141
9,Fantasy,2111403
4,Children,1669249


In [None]:
# 3. Filter first, then find the genres with the highest mean rating.
#
# Keep only rows that have a real genre: drop missing values and the
# "(no genres listed)" placeholder.
df_without_blank = df_genre.loc[
    lambda frame: frame["genre"].notna() & frame["genre"].ne("(no genres listed)")
]

# Compute mean and count of `rating` per genre in one pass, best mean first.
# (A single statistic followed by
#  .reset_index(name='mean').sort_values('mean', ascending=False)
#  would also work, but only yields one statistic at a time.)
r = (
    df_without_blank
    .groupby("genre")["rating"]
    .agg(["mean", "count"])
    .sort_values(by="mean", ascending=False)
)
r

Unnamed: 0_level_0,mean,count
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Film-Noir,3.965381,216689
War,3.809531,1048618
Documentary,3.739718,244619
Crime,3.674528,3298335
Drama,3.674296,8857853
Mystery,3.663509,1557282
IMAX,3.655946,492366
Animation,3.617494,1140476
Western,3.570498,423714
Musical,3.558091,870915


In [None]:
# 4. Movies with the highest mean rating.
#
# A minimum-count threshold (here: at least 50 ratings) is applied so that
# movies with only one or two ratings cannot jump to first place.

# Per-movie mean rating and number of ratings; as_index=False keeps
# movieId/title as ordinary columns instead of moving them into the index.
movie_stats = ratings_movies.groupby(["movieId", "title"], as_index=False).agg(
    rating_mean=("rating", "mean"),
    rating_cnt=("rating", "count"),
)

# Keep only movies that meet the rating-count threshold, then rank by mean.
filled_by_rating_count = movie_stats.query("rating_cnt >= 50")
filled_by_rating_count.sort_values(by="rating_mean", ascending=False)

Unnamed: 0,movieId,title,rating_mean,rating_cnt
315,318,"Shawshank Redemption, The (1994)",4.446990,63366
843,858,"Godfather, The (1972)",4.364732,41355
49,50,"Usual Suspects, The (1995)",4.334372,47006
523,527,Schindler's List (1993),4.310175,50054
1195,1221,"Godfather: Part II, The (1974)",4.275641,27398
...,...,...,...,...
1746,1826,Barney's Great Adventure (1998),1.163484,419
4679,4775,Glitter (2001),1.124088,685
12003,54290,Bratz: The Movie (2007),1.105556,180
6373,6483,From Justin to Kelly (2003),0.973005,426
