In [18]:
# SECTION 1 - IMPORT THE REQUIRED LIBRARIES

from IPython.display import HTML
import numpy as np
import bs4 # Beautiful Soup:library for pulling data out of HTML and XML files.
import time
import operator
import socket
import re # regular expressions

from pandas import Series
import pandas as pd
from pandas import DataFrame

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_context("talk")
sns.set_style("white")

In [19]:
# SECTION 2 - READ THE USER DATA

# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.user', 
    sep='|', names=u_cols)

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [20]:
# SECTION 3 - READ THE RATINGS

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.data', 
    sep='\t', names=r_cols)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [30]:
# SECTION 4 - READ THE MOVIE INFORMATION

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 
            'video_release_date', 'imdb_url']

movies = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.item', 
    sep='|', names=m_cols, usecols=range(5))

movies.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3: invalid continuation byte

In [25]:
# SECTION 5 - GET INFORMATION ABOUT DATA

print(movies.dtypes)
print(movies.describe())
# *** Why only those two columns? ***

NameError: name 'movies' is not defined

In [68]:
# SECTION 6 - SELECTING DATA

# DataFrame => group of Series with shared index
# single DataFrame column => Series

users.head()
users['occupation'].head()

## *** Where did the nice design go? ***
columns_you_want = ['occupation', 'sex'] 
print(users[columns_you_want].head())

#print(users.head())

#print(users.iloc[3])

   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F


In [69]:
# SECTION 7 - FILTERING DATA

# select users older than 25
oldUsers = users[users.age > 25]
oldUsers.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
1,2,53,F,other,94043
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201


In [54]:
# SECTION 8 - OVERVIEW QUIZ 1
# part 1

# show users aged 40 AND male
users[(users.age == 40) & (users.sex == 'M')].head(3)

Unnamed: 0,user_id,age,sex,occupation,zip_code
18,19,40,M,librarian,2138
82,83,40,M,other,44133
115,116,40,M,healthcare,97232


In [55]:
# OVERVIEW QUIZ 1
# part 2

## show users who are female and programmers
selected_users = users[(users.sex == 'F') & 
                       (users.occupation == 'programmer')]

## show statistic summary
print(selected_users.describe())

## alternatives:
# display as a mean age of all female programmers
print(selected_users.age.mean())
print(selected_users['age'].mean())

          user_id        age
count    6.000000   6.000000
mean   411.166667  32.166667
std    149.987222   5.115336
min    292.000000  26.000000
25%    313.000000  28.250000
50%    378.000000  32.000000
75%    416.750000  36.500000
max    698.000000  38.000000
32.166666666666664
32.166666666666664


In [71]:
# SECTION 9 - SPLIT_APPLY_COMBINE

print(ratings.head())
## split data into groups
#grouped_data = ratings.groupby('user_id')
grouped_data = ratings['movie_id'].groupby(ratings['user_id'])

## count & combine
ratings_per_user = grouped_data.count()

ratings_per_user.head(5)

   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


user_id
1    272
2     62
3     54
4     24
5    175
Name: movie_id, dtype: int64

In [72]:
# SECTION 10 - OVERVIEW QUIZ 2
# part 1

## split data
grouped_data = ratings['rating'].groupby(ratings['movie_id'])
## average and combine
average_ratings = grouped_data.mean()
print("Average ratings:")
print(average_ratings.head())
print

maximum_rating = average_ratings.max()
good_movie_ids = average_ratings[average_ratings == maximum_rating].index

Average ratings:
movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64


In [73]:
# OVERVIEW QUIZ 2
# part 2

print("Good movie IDs:")
print(good_movie_ids)
print

print("Best movie titles")
print(movies[movies.movie_id.isin(good_movie_ids)].title)
print

Good movie IDs:
Int64Index([814, 1122, 1189, 1201, 1293, 1467, 1500, 1536, 1599, 1653], dtype='int64', name='movie_id')
Best movie titles


NameError: name 'movies' is not defined

In [74]:
# OVERVIEW QUIZ 2
# part 3

how_many_ratings = grouped_data.count()
print("Number of ratings per movie")
print(how_many_ratings[average_ratings == maximum_rating])

Number of ratings per movie
movie_id
814     1
1122    1
1189    3
1201    1
1293    3
1467    2
1500    2
1536    1
1599    1
1653    1
Name: rating, dtype: int64


In [75]:
# SECTION 11 - PASSING A FUNCTION

average_ratings = grouped_data.apply(lambda f: f.mean())
average_ratings.head()

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64

In [76]:
# SECTION 12 - OVERVIEW QUIZ 3
# part 1

# get the average rating per user
grouped_data = ratings['rating'].groupby(ratings['user_id'])
average_ratings = grouped_data.mean()
average_ratings.head()

user_id
1    3.610294
2    3.709677
3    2.796296
4    4.333333
5    2.874286
Name: rating, dtype: float64

In [77]:
# OVERVIEW QUIZ 3
# part 2

# list all occupations and if they are male or female dominant
grouped_data = users['sex'].groupby(users['occupation'])
male_dominant_occupations = grouped_data.apply(lambda f: 
                                               sum(f == 'M') > sum(f == 'F'))
print(male_dominant_occupations)
print('\n')

occupation
administrator     True
artist            True
doctor            True
educator          True
engineer          True
entertainment     True
executive         True
healthcare       False
homemaker        False
lawyer            True
librarian        False
marketing         True
none              True
other             True
programmer        True
retired           True
salesman          True
scientist         True
student           True
technician        True
writer            True
Name: sex, dtype: bool




In [78]:
# OVERVIEW QUIZ 3
# part 3

print('number of male users: ')
print(sum(users['sex'] == 'M'))

print('number of female users: ')
print(sum(users['sex'] == 'F'))

number of male users: 
670
number of female users: 
273


In [79]:
# SECTION 13 - PANDAS WRAP-UP

#Create data frames

#Get sub-frames

#Filter data

#Use group-by

#Apply a user defined function
