In [1]:
import numpy as np
import pandas as pd

In [30]:
# Goal: recommend movies to a user, based on the movies they have already
# watched and the ratings they gave to those movies.
#
# pandas does the data preparation and analysis; the ratings come from the
# MovieLens dataset.
# NOTE(review): this is an absolute path on one machine - point it at your own
# copy of ml-latest-small before running.

dataFile = '/home/rydra/projects/anaconda-firststeps/ml-latest-small/ratings.csv'

# Read the ids as strings, the rating as a float and the timestamp as an integer.
data = pd.read_csv(
    dataFile,
    sep=',',
    dtype=dict(userId=str, movieId=str, rating=float, timestamp=int),
)

In [36]:
# `data` is now a pandas DataFrame. A DataFrame can be indexed, subset and
# otherwise manipulated in many different ways.

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [37]:
# Load the movie information. Only the first two columns of movies.csv are
# read, and movieId is parsed as a string.
# NOTE(review): this is an absolute path on one machine - point it at your own
# copy of ml-latest-small before running.
movieInfoFile = '/home/rydra/projects/anaconda-firststeps/ml-latest-small/movies.csv'
movieData = pd.read_csv(
    movieInfoFile,
    sep=',',
    usecols=[0, 1],
    dtype=dict(movieId=str),
)

In [38]:
# Preview the first five rows of the movie information.
movieData.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [39]:
# Combine the two datasets so that every rating row also carries the movie
# name (the equivalent of a SQL JOIN on movieId).
# This is an inner join (the pandas default): rows whose movieId has no match
# in the other frame are dropped.

merged_data = data.merge(movieData, on='movieId', how='inner')

In [41]:
# Preview the first five rows of the joined data.
merged_data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,31,2.5,1260759144,Dangerous Minds (1995)
1,7,31,3.0,851868750,Dangerous Minds (1995)
2,31,31,4.0,1273541953,Dangerous Minds (1995)
3,32,31,4.0,834828440,Dangerous Minds (1995)
4,36,31,3.0,847057202,Dangerous Minds (1995)


In [43]:
# Two ways of selecting a single column:
userIds = merged_data[['userId']]  # list of column names -> a pandas DataFrame
userIds2 = merged_data.userId      # attribute access     -> a pandas Series

# A cell only displays the value of its last expression, so both types are put
# in one tuple; as two separate statements the first result would be discarded.
type(userIds), type(userIds2)

Unnamed: 0,userId
0,1
1,7
2,31
3,32
4,36


In [44]:
# .loc is the label-based indexer we'll use heavily. It is used with square
# brackets (it is not called like a function) and accepts row and column
# labels, or boolean indexing.
# Label slices include both endpoints, so 0:10 selects labels 0 through 10.
row_labels = slice(0, 10)
selected_columns = ['userId']
merged_data.loc[row_labels, selected_columns]

Unnamed: 0,userId
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [47]:
# Boolean indexing: keep only the rows whose title is exactly
# 'Toy Story (1995)' (an equality test, not a substring match).
is_toy_story = merged_data['title'] == 'Toy Story (1995)'
toyStoryUsers = merged_data[is_toy_story]

toyStoryUsers.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title
31314,7,1,3.0,851866703,Toy Story (1995)
31315,9,1,4.0,938629179,Toy Story (1995)
31316,13,1,5.0,1331380058,Toy Story (1995)
31317,15,1,2.0,997938310,Toy Story (1995)
31318,19,1,3.0,855190091,Toy Story (1995)
