In [25]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Local filename under which the MovieLens "ml-latest-small" archive is saved.
ml_latest_small = "ml-latest-small.zip"

# IPython shell escape: download the archive with curl. `$ml_latest_small`
# interpolates the Python variable above as curl's output path (-o).
!curl -o  $ml_latest_small http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  896k  100  896k    0     0   593k      0  0:00:01  0:00:01 --:--:--  179k 593k


In [3]:
# Slice off the last four characters (the ".zip" suffix) of the archive filename.
ml_latest_small[:-4]

'ml-latest-small'

In [4]:
import zipfile

# Unpack every member of the downloaded archive into the current directory.
with zipfile.ZipFile(ml_latest_small) as archive:
    archive.extractall(path=".")

In [5]:
# Both CSVs use their first column (movieId) as the index, so the two tables
# can be combined index-to-index.
movies = pd.read_csv("ml-latest-small/movies.csv", index_col=0)
links = pd.read_csv("ml-latest-small/links.csv", index_col=0)
movies = movies.merge(links, left_index=True, right_index=True)

# Missing tmdbId values become the sentinel -1 so the column holds integers.
movies["tmdbId"] = movies["tmdbId"].fillna(-1).astype("int64")
movies.head()

Unnamed: 0_level_0,title,genres,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
5,Father of the Bride Part II (1995),Comedy,113041,11862


In [6]:
# Matches a four-digit year in parentheses at the very end of a string,
# e.g. the "(1995)" in "Toy Story (1995)". Raw string so that "\(" is a regex
# escape rather than an (invalid) Python string escape.
year = re.compile(r"\(([0-9]{4})\)$")


def get_year(date):
    """Extract the trailing "(YYYY)" year from a title string.

    Surrounding whitespace is ignored before matching.

    Args:
        date: Title text that may end with a parenthesised four-digit year.

    Returns:
        The year as an int, or -1 when the text does not end with "(YYYY)".
    """
    match = year.search(date.strip())
    if match:
        return int(match.group(1))
    return -1


def shave_year(title):
    """Remove the trailing "(YYYY)" suffix from a title.

    The title is stripped before matching, exactly as in get_year, so a title
    that yields a year always has that year removed. The cut is made at the
    position of the match rather than at a fixed character count, so any
    amount of whitespace before the parenthesis is handled.

    Args:
        title: Title text that may end with a parenthesised four-digit year.

    Returns:
        The title without the year suffix and without surrounding whitespace,
        or the title unchanged when it carries no trailing year.
    """
    stripped = title.strip()
    match = year.search(stripped)
    if match is None:
        return title
    return stripped[:match.start()].rstrip()

# Spot-check the trimming on a title that itself contains parentheses.
print(shave_year("Witchfinder General (Conquerer Worm, The) (1968)"))

# The year must be derived first: it is parsed from the original title text,
# which the following statement overwrites with the trimmed title.
movies["year"] = movies["title"].map(get_year)
movies["title"] = movies["title"].map(shave_year)
movies.head()

Witchfinder General (Conquerer Worm, The)


Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,114709,862,1995
2,Jumanji,Adventure|Children|Fantasy,113497,8844,1995
3,Grumpier Old Men,Comedy|Romance,113228,15602,1995
4,Waiting to Exhale,Comedy|Drama|Romance,114885,31357,1995
5,Father of the Bride Part II,Comedy,113041,11862,1995


In [7]:
# Discard rows whose year is the -1 sentinel (no year found in the title).
movies = movies.loc[movies["year"].ne(-1)]
movies.head()

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,114709,862,1995
2,Jumanji,Adventure|Children|Fantasy,113497,8844,1995
3,Grumpier Old Men,Comedy|Romance,113228,15602,1995
4,Waiting to Exhale,Comedy|Drama|Romance,114885,31357,1995
5,Father of the Bride Part II,Comedy,113041,11862,1995


In [8]:
def get_genre_set(g):
    """Split a pipe-delimited genre string into a list of genre names."""
    return g.split('|')
# Keep the per-movie genre lists in their own Series (indexed like `movies`),
# then remove the raw pipe-delimited column from the movie table in place.
genres = movies["genres"].map(get_genre_set)
movies.drop(columns=["genres"], inplace=True)

In [9]:
# Persist the cleaned movie table (index included) to CSV, then preview it.
movies.to_csv("movies.clean.csv")
movies.head()

Unnamed: 0_level_0,title,imdbId,tmdbId,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story,114709,862,1995
2,Jumanji,113497,8844,1995
3,Grumpier Old Men,113228,15602,1995
4,Waiting to Exhale,114885,31357,1995
5,Father of the Bride Part II,113041,11862,1995


In [10]:
# Flatten the per-movie genre lists into one [movieId, genre] row per pairing.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (index, value) pairs and is available in older releases as well.
genres_assignation = [
    [movie_id, genre_name]
    for movie_id, genre_list in genres.items()
    for genre_name in genre_list
]

# Long-format table indexed by movieId (a movie appears once per genre).
genres_df = pd.DataFrame(genres_assignation, columns=["movieId", "genre"]).set_index("movieId")
genres_df.to_csv("genres.csv")
genres_df.head()

Unnamed: 0_level_0,genre
movieId,Unnamed: 1_level_1
1,Adventure
1,Animation
1,Children
1,Comedy
1,Fantasy


In [11]:
import datetime

# Sample of a full ISO-8601 timestamp with offset, kept from the original cell
# as a reference: 2015-06-24T12:50:35.556+0100


def date_ms(timestamp):
    """Format a Unix timestamp as an ISO-8601-style string in UTC.

    Despite the name, the value is interpreted as seconds since the epoch
    (that is what datetime.fromtimestamp expects), not milliseconds.

    The conversion is pinned to UTC. Without an explicit tz, fromtimestamp
    uses the local timezone of the machine running the notebook, so the same
    input produces different strings on different machines and across
    daylight-saving changes, and the output carries no offset to tell them
    apart.

    Args:
        timestamp: Seconds since the Unix epoch (int or float).

    Returns:
        The moment formatted as 'YYYY-MM-DDTHH:MM:SS' in UTC.
    """
    moment = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
    return moment.strftime('%Y-%m-%dT%H:%M:%S')

2007-01-15T16:29:38


In [12]:
# Ratings are indexed by the first two CSV columns; a formatted rendering of
# the epoch `timestamp` column is added as `time` before exporting.
ratings = pd.read_csv("ml-latest-small/ratings.csv", index_col=[0, 1]).assign(
    time=lambda frame: frame["timestamp"].map(date_ms)
)
ratings.to_csv("ratings.csv")
ratings.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp,time
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,31,2.5,1260759144,2009-12-14T02:52:24
1,1029,3.0,1260759179,2009-12-14T02:52:59
1,1061,3.0,1260759182,2009-12-14T02:53:02
1,1129,2.0,1260759185,2009-12-14T02:53:05
1,1172,4.0,1260759205,2009-12-14T02:53:25
1,1263,2.0,1260759151,2009-12-14T02:52:31
1,1287,2.0,1260759187,2009-12-14T02:53:07
1,1293,2.0,1260759148,2009-12-14T02:52:28
1,1339,3.5,1260759125,2009-12-14T02:52:05
1,1343,2.0,1260759131,2009-12-14T02:52:11


In [39]:
# Tags are indexed by the first two CSV columns; a formatted rendering of the
# epoch `timestamp` column is added as `time` before exporting.
tags = pd.read_csv("ml-latest-small/tags.csv", index_col=[0, 1]).assign(
    time=lambda frame: frame["timestamp"].map(date_ms)
)
tags.to_csv("tags.csv")
tags.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tag,timestamp,time
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15,339,sandra 'boring' bullock,1138537770,2006-01-29T12:29:30
15,1955,dentist,1193435061,2007-10-26T22:44:21
15,7478,Cambodia,1170560997,2007-02-04T03:49:57
15,32892,Russian,1170626366,2007-02-04T21:59:26
15,34162,forgettable,1141391765,2006-03-03T13:16:05


In [40]:
# Sorted, de-duplicated union of the first index level (userId) of the
# ratings and tags tables.
users = np.union1d(
    ratings.index.levels[0].values,
    tags.index.levels[0].values,
)

users_df = pd.DataFrame({"userId": users})
users_df.to_csv("users.csv")
users_df.tail()

Unnamed: 0,userId
666,667
667,668
668,669
669,670
670,671
