In [2]:
import pandas as pd

In [3]:
# Load the rating events: a tab-separated file without a header row,
# so the column names are supplied explicitly.
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'unix_timestamp',
]
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [4]:
# Load the movie catalogue: a pipe-separated file without a header row,
# so the column names are supplied explicitly.
i_cols = [
    # descriptive columns
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    # one column per genre label
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

In [5]:
# Attach the movie title to every rating row. Only the id and title columns
# of `movies` are carried into the join (default inner join on the movie id).
movie_data = ratings.merge(
    movies[['movie id', 'movie title']],
    left_on='movie_id',
    right_on='movie id',
)

In [8]:
# Display the merged frame to check the join; note that both 'movie_id' and
# 'movie id' are present because the join keys have different names.
movie_data

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie id,movie title
0,196,242,3,881250949,242,Kolya (1996)
1,186,302,3,891717742,302,L.A. Confidential (1997)
2,22,377,1,878887116,377,Heavyweights (1994)
3,244,51,2,880606923,51,Legends of the Fall (1994)
4,166,346,1,886397596,346,Jackie Brown (1997)
...,...,...,...,...,...,...
99995,880,476,3,880175444,476,"First Wives Club, The (1996)"
99996,716,204,5,879795543,204,Back to the Future (1985)
99997,276,1090,1,874795795,1090,Sliver (1993)
99998,13,225,2,882399156,225,101 Dalmatians (1996)


In [6]:
# Popularity table: one row per movie title with the number of ratings it
# received and its average rating.
movie_popularity = (
    movie_data
    .groupby('movie title')['rating']
    .agg(rating_count='count', rating_mean='mean')
    .reset_index()
)

In [7]:
# Display the per-title popularity table (rating count and mean rating).
movie_popularity

Unnamed: 0,movie title,rating_count,rating_mean
0,'Til There Was You (1997),9,2.333333
1,1-900 (1994),5,2.600000
2,101 Dalmatians (1996),109,2.908257
3,12 Angry Men (1957),125,4.344000
4,187 (1997),41,3.024390
...,...,...,...
1659,Young Guns II (1990),44,2.772727
1660,"Young Poisoner's Handbook, The (1995)",41,3.341463
1661,Zeus and Roxanne (1997),6,2.166667
1662,unknown,9,3.444444


In [9]:
# Rank movies by how many ratings they received; titles with the same count
# are ordered by mean rating. Both keys are sorted in descending order.
popular_movies = movie_popularity.sort_values(
    by=['rating_count', 'rating_mean'],
    ascending=[False, False],
)

In [10]:
# Display the table ordered from most-rated to least-rated movie.
popular_movies

Unnamed: 0,movie title,rating_count,rating_mean
1398,Star Wars (1977),583,4.358491
333,Contact (1997),509,3.803536
498,Fargo (1996),508,4.155512
1234,Return of the Jedi (1983),507,4.007890
860,Liar Liar (1997),485,3.156701
...,...,...,...
1582,"Very Natural Thing, A (1974)",1,1.000000
1584,"Vie est belle, La (Life is Rosey) (1987)",1,1.000000
1610,Wend Kuuni (God's Gift) (1982),1,1.000000
1646,"Woman in Question, The (1950)",1,1.000000


In [11]:
# Keep only movies rated more than 100 times, then order them from the
# highest to the lowest average rating.
popular_mov = (
    movie_popularity
    .loc[lambda frame: frame['rating_count'] > 100]
    .sort_values(by='rating_mean', ascending=False)
)

In [12]:
# Display the frequently rated movies, best average rating first.
popular_mov

Unnamed: 0,movie title,rating_count,rating_mean
318,"Close Shave, A (1995)",112,4.491071
1281,Schindler's List (1993),298,4.466443
1652,"Wrong Trousers, The (1993)",118,4.466102
273,Casablanca (1942),243,4.456790
1317,"Shawshank Redemption, The (1994)",283,4.445230
...,...,...,...
1369,Spawn (1997),143,2.615385
471,Event Horizon (1997),127,2.574803
348,Crash (1996),128,2.546875
794,Jungle2Jungle (1997),132,2.439394


In [13]:
def popularity_high_quality(top_n=10, min_ratings=100, popularity=None):
    """Return the best-rated movies among those with enough ratings.

    Movies whose ``rating_count`` is strictly greater than ``min_ratings`` are
    kept, ordered by ``rating_mean`` from highest to lowest, and the first
    ``top_n`` rows are returned.

    Parameters
    ----------
    top_n : int, default 10
        Number of rows to return (passed to ``DataFrame.head``).
    min_ratings : int, default 100
        Exclusive lower bound on ``rating_count``; a movie needs more than
        this many ratings to be considered.
    popularity : pandas.DataFrame, optional
        Table with the columns ``'movie title'``, ``'rating_count'`` and
        ``'rating_mean'``. When omitted, the notebook-level
        ``movie_popularity`` table is used, as in the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The columns ``'movie title'``, ``'rating_count'`` and
        ``'rating_mean'`` for the selected movies; the original row index
        labels are preserved.
    """
    if popularity is None:
        # Backward-compatible default: read the table built earlier in the notebook.
        popularity = movie_popularity

    # Strict comparison: a movie with exactly `min_ratings` ratings is excluded.
    enough_ratings = popularity.loc[popularity['rating_count'] > min_ratings]

    ranked = enough_ratings.sort_values(by='rating_mean', ascending=False)

    return ranked[['movie title', 'rating_count', 'rating_mean']].head(top_n)

In [18]:
# Fetch the ten best-rated movies among the frequently rated ones and show
# them as plain text.
recommendations = popularity_high_quality(top_n=10)
print(recommendations)

                           movie title  rating_count  rating_mean
318              Close Shave, A (1995)           112     4.491071
1281           Schindler's List (1993)           298     4.466443
1652        Wrong Trousers, The (1993)           118     4.466102
273                  Casablanca (1942)           243     4.456790
1317  Shawshank Redemption, The (1994)           283     4.445230
1215                Rear Window (1954)           209     4.387560
1572        Usual Suspects, The (1995)           267     4.385768
1398                  Star Wars (1977)           583     4.358491
3                  12 Angry Men (1957)           125     4.344000
303                Citizen Kane (1941)           198     4.292929
