In [3]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
pd.set_option('notebook_repr_html', False )

xpath = "pydata-book-master/ch02/movielens/"

pd.set_option('precision', 3)

In [4]:
unames = ['user_Id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_csv(os.path.join(xpath, "users.dat"), header=None, names=unames,  sep='::', engine='python')
print(users.head(5))


   user_Id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455


In [7]:
rnames = ['user_Id', 'movie_Id', 'ratings', 'timestamp']
ratings = pd.read_csv(os.path.join(xpath, "ratings.dat"), header=None, names=rnames, sep='::', engine='python')
print(ratings.head(5))


   user_Id  movie_Id  ratings  timestamp
0        1      1193        5  978300760
1        1       661        3  978302109
2        1       914        3  978301968
3        1      3408        4  978300275
4        1      2355        5  978824291


In [9]:
mnames = ['movie_Id', 'title', 'genre']
movies = pd.read_csv(os.path.join(xpath, "movies.dat"), header=None, names=mnames, sep='::', engine='python')
print(movies.head(5))


   movie_Id                               title                         genre
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


In [33]:

alldata = pd.merge(pd.merge(ratings, users), movies)
# print(alldata.head(5))
alldata.ix[0]

user_Id                                            1
movie_Id                                        1193
ratings                                            5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genre                                          Drama
Name: 0, dtype: object

In [38]:
# Average movie ratings among the genders.
mean_ratings = alldata.pivot_table('ratings', columns='gender', index='title', aggfunc='mean')
print(mean_ratings.head(3))

gender                        F     M
title                                
$1,000,000 Duck (1971)     3.38  2.76
'Night Mother (1986)       3.39  3.35
'Til There Was You (1997)  2.68  2.73


In [39]:
ratings_by_title = alldata.groupby('title').size()
print(ratings_by_title.head(5))

title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64


In [41]:
# Average age among the genders renting movies index by genre and movie title. 
gender_avg_age = alldata.pivot_table('age', columns='gender', index='title')
print(gender_avg_age.head(5))
print("\n\n")

# Count reviews per movie for each genre
ratings_genre_title = alldata.groupby(['genre','title']).size()
print(ratings_genre_title.head())

print("\n\n")
# Then pick out rated movies with reviews >= 250 
active_titles = ratings_genre_title[ratings_genre_title >= 250]
print(active_titles.head(25))

gender                             F      M
title                                      
$1,000,000 Duck (1971)         34.88  28.33
'Night Mother (1986)           30.89  35.79
'Til There Was You (1997)      28.14  30.53
'burbs, The (1989)             27.50  28.07
...And Justice for All (1979)  38.23  35.51



genre   title                 
Action  American Strays (1996)      4
        Art of War, The (2000)    144
        Assassination (1987)       23
        Avalanche (1978)           17
        Bad Boys (1995)           362
dtype: int64



genre             title                                     
Action            Bad Boys (1995)                                362
                  Dr. No (1962)                                  646
                  First Blood (1982)                             397
                  For Your Eyes Only (1981)                      425
                  From Russia with Love (1963)                   771
                  Goldfinger (1964)           

In [30]:
# Add one more column to the mean rating showing number of reviews each movie recieved.
mean_ratings['#_rvws'] = alldata.groupby('title').size()

# Average ratings per movies according to gender
print(mean_ratings.ix[active_titles].head(5))

gender                                     F     M  #_rvws
title                                                     
Better Living Through Circuitry (1999)  3.67  3.85      16
Charlie, the Lonesome Cougar (1967)     3.29  3.06      23
Billy Madison (1995)                    3.14  3.19     355
Blank Check (1994)                      2.69  2.01      96
Cotton Mary (1999)                      3.00  3.00       4


In [125]:
# top movie among female viewers
mean_ratings.sort_index(by='F', ascending=False).head(5)

gender                                             F      M  #_rvws
title                                                              
Clean Slate (Coup de Torchon) (1981)               5  3.857      15
Ballad of Narayama, The (Narayama Bushiko) (1958)  5  3.429       8
Raw Deal (1948)                                    5  3.308      14
Bittersweet Motel (2000)                           5    NaN       1
Skipped Parts (2000)                               5  4.000       2

In [135]:
# Add one more column to Dataframe showign differences in Male & Female ratings
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']


# Moviews with wider discrepancy and popular among Females
sorted_by_diff = mean_ratings.sort_index(by='diff')
print(sorted_by_diff.head(15))

gender                                                 F     M  #_rvws  diff
title                                                                       
James Dean Story, The (1957)                        4.00  1.00       3 -3.00
Country Life (1994)                                 5.00  2.00       3 -3.00
Spiders, The (Die Spinnen, 1. Teil: Der Goldene...  4.00  1.00       4 -3.00
Babyfever (1994)                                    3.67  1.00       4 -2.67
Woman of Paris, A (1923)                            5.00  2.43       8 -2.57
Cobra (1925)                                        4.00  1.50       5 -2.50
Other Side of Sunday, The (S�ndagsengler) (1996)    5.00  2.93      16 -2.07
To Have, or Not (1995)                              4.00  2.00       2 -2.00
For the Moment (1994)                               5.00  3.00       5 -2.00
Phat Beach (1996)                                   3.00  1.00       4 -2.00
Crude Oasis, The (1995)                             3.00  1.00       2 -2.00

In [58]:

# Standard deviation of ratings grouped by title
rating_std_by_title = alldata.groupby('title')['ratings'].std()


# Movies with highest rating disagreement among viewers using Std Dev. 
rating_std_by_title.ix[active_titles].head(5)


title
Better Living Through Circuitry (1999)    0.9811
Charlie, the Lonesome Cougar (1967)       0.8149
Billy Madison (1995)                      1.2500
Blank Check (1994)                        1.0682
Cotton Mary (1999)                        1.4142
Name: ratings, dtype: float64

In [54]:

# top 10 movies with highest disagreement
pd.set_option('precision', 5)
rating_std_by_title.order(ascending=False)[:10]


title
Foreign Student (1994)                                             2.8284
Criminal Lovers (Les Amants Criminels) (1999)                      2.3094
Identification of a Woman (Identificazione di una donna) (1982)    2.1213
Sunset Park (1996)                                                 2.1213
Eaten Alive (1976)                                                 2.1213
Neon Bible, The (1995)                                             2.1213
Talk of Angels (1998)                                              2.1213
Tokyo Fist (1995)                                                  2.1213
Paralyzing Fear: The Story of Polio in America, A (1998)           2.1213
Better Living (1998)                                               2.1213
Name: ratings, dtype: float64