In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80

In [60]:
names = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=names, engine='python')
rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_table("datasets/movielens/ratings.dat", sep="::",
                        header=None, names=rnames, engine="python")

mnames = ["movie_id", "title", "genres"]
movies = pd.read_table("datasets/movielens/movies.dat", sep="::",
                       header=None, names=mnames, engine="python")


In [7]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [8]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [61]:
data = pd.merge(pd.merge(ratings, users), movies)
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [62]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


In [14]:
mean_ratings = data.pivot_table(values='rating', index='title', columns='gender', aggfunc=['mean', 'count'])
mean_ratings

Unnamed: 0_level_0,mean,mean,count,count
gender,F,M,F,M
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck (1971)",3.375000,2.761905,16.0,21.0
'Night Mother (1986),3.388889,3.352941,36.0,34.0
'Til There Was You (1997),2.675676,2.733333,37.0,15.0
"'burbs, The (1989)",2.793478,2.962085,92.0,211.0
...And Justice for All (1979),3.828571,3.689024,35.0,164.0
...,...,...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952,8.0,21.0
Zero Effect (1998),3.864407,3.723140,59.0,242.0
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000,,2.0
Zeus and Roxanne (1997),2.777778,2.357143,9.0,14.0


In [15]:
rating_by_title = data.groupby('title').size()
active_titles = rating_by_title[rating_by_title >= 250]
active_titles

title
'burbs, The (1989)                   303
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
                                    ... 
Young Guns (1988)                    562
Young Guns II (1990)                 369
Young Sherlock Holmes (1985)         379
Zero Effect (1998)                   301
eXistenZ (1999)                      410
Length: 1216, dtype: int64

In [17]:
mean_ratings = mean_ratings.loc[active_titles.index]

In [20]:
top_male_ratings = mean_ratings.sort_values(('mean', 'M'), ascending=False)
top_male_ratings

Unnamed: 0_level_0,mean,mean,count,count
gender,F,M,F,M
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"Godfather, The (1972)",4.314700,4.583333,483.0,1740.0
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628,106.0,522.0
"Shawshank Redemption, The (1994)",4.539075,4.560625,627.0,1600.0
Raiders of the Lost Ark (1981),4.332168,4.520597,572.0,1942.0
"Usual Suspects, The (1995)",4.513317,4.518248,413.0,1370.0
...,...,...,...,...
Speed 2: Cruise Control (1997),1.906667,1.863014,75.0,292.0
Superman IV: The Quest for Peace (1987),2.216216,1.847458,37.0,295.0
Super Mario Bros. (1993),2.163636,1.820339,55.0,295.0
Grease 2 (1982),2.243478,1.792553,115.0,188.0


In [21]:
mean_ratings = mean_ratings['mean']
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.500000
101 Dalmatians (1996),3.240000,2.911215
12 Angry Men (1957),4.184397,4.328421
...,...,...
Young Guns (1988),3.371795,3.425620
Young Guns II (1990),2.934783,2.904025
Young Sherlock Holmes (1985),3.514706,3.363344
Zero Effect (1998),3.864407,3.723140


In [24]:
mean_ratings['diff'] = abs(mean_ratings['M'] - mean_ratings['F'])
mean_ratings.sort_values('diff', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mean_ratings['diff'] = abs(mean_ratings['M'] - mean_ratings['F'])


gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,0.830782
"Good, The Bad and The Ugly, The (1966)",3.494949,4.221300,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Jumpin' Jack Flash (1986),3.254717,2.578358,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608
...,...,...,...
Robin Hood: Prince of Thieves (1991),3.161290,3.163347,0.002056
Trainspotting (1996),3.958974,3.960432,0.001457
Jerry Maguire (1996),3.758315,3.759424,0.001109
"Mystery, Alaska (1999)",3.434783,3.435780,0.000997


In [41]:
#  哪部电影的评分的标准差最大
movies = data.groupby('title').size()
index = movies.index[movies > 50]

In [46]:
rating_std_by_title = data.groupby('title')['rating'].std()

In [49]:
active_titles

title
'burbs, The (1989)                   303
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
                                    ... 
Young Guns (1988)                    562
Young Guns II (1990)                 369
Young Sherlock Holmes (1985)         379
Zero Effect (1998)                   301
eXistenZ (1999)                      410
Length: 1216, dtype: int64

In [52]:
rating_std_by_title = rating_std_by_title.loc[active_titles.index]
rating_std_by_title

title
'burbs, The (1989)                   1.107760
10 Things I Hate About You (1999)    0.989815
101 Dalmatians (1961)                0.982103
101 Dalmatians (1996)                1.098717
12 Angry Men (1957)                  0.812731
                                       ...   
Young Guns (1988)                    1.017437
Young Guns II (1990)                 1.071959
Young Sherlock Holmes (1985)         0.891176
Zero Effect (1998)                   1.042932
eXistenZ (1999)                      1.178568
Name: rating, Length: 1216, dtype: float64

In [53]:
rating_std_by_title.sort_values(ascending=False)

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
                                           ...   
Wrong Trousers, The (1993)               0.708666
Shawshank Redemption, The (1994)         0.700443
Great Escape, The (1963)                 0.692585
Rear Window (1954)                       0.688946
Close Shave, A (1995)                    0.667143
Name: rating, Length: 1216, dtype: float64

In [58]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998)
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998)
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999)
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973)


In [63]:
data['genres'] = data.pop('genres').str.split('|')
data.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),[Drama]
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),[Drama]
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),[Drama]
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),[Drama]
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),[Drama]


In [64]:
data.explode('genres')

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Western


In [67]:
movies['genres'].str.get_dummies('|')

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
