In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [21]:
# Initial load of the raw movies and ratings tables from the ml-1m CSV files.
movies_csv_path = 'collData/ml-1m/movies.csv'
ratings_csv_path = 'collData/ml-1m/ratings.csv'
movies = pd.read_csv(movies_csv_path)
ratings = pd.read_csv(ratings_csv_path)

In [22]:
# Tidy up before persisting: order both tables by MovieID and rebuild a clean
# 0..n-1 index (the old index is discarded). The feather export itself is not
# part of this cell.
movies = movies.sort_values(by='MovieID').reset_index(drop=True)
ratings = ratings.sort_values(by='MovieID').reset_index(drop=True)

In [23]:
# Show the dtype of every column in the ratings table (plain-text output).
print(ratings.dtypes)

UserID       int64
MovieID      int64
Rating       int64
Timestamp    int64
dtype: object


In [24]:
# Show the dtype of every column in the movies table (plain-text output).
print(movies.dtypes)

MovieID     int64
Title      object
Genres     object
dtype: object


In [26]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [28]:
# Pull the four-digit release year out of the trailing "(YYYY)" of each title
# and keep it in its own column (still a string at this point).
# - Raw string: "\(" and "\d" are invalid escapes in a normal string literal
#   and trigger a warning on recent Python versions.
# - Anchored at the end of the title so a parenthesised number earlier in the
#   title is never mistaken for the release year.
# - expand=False returns a Series, which is what a single column expects.
movies['Year'] = movies['Title'].str.extract(r"\((\d{4})\)\s*$", expand=False)

In [29]:
# Check the newly added Year column.
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [31]:
# Parse the year strings into datetimes (each becomes January 1st of that year).
movies['Year'] = pd.to_datetime(movies['Year'], format='%Y')

In [32]:
# Year should now display as a full date (YYYY-01-01).
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995-01-01
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995-01-01
2,3,Grumpier Old Men (1995),Comedy|Romance,1995-01-01
3,4,Waiting to Exhale (1995),Comedy|Drama,1995-01-01
4,5,Father of the Bride Part II (1995),Comedy,1995-01-01


In [33]:
# Keep only the numeric year component of the datetime.
# Note: if any row had no parsable year, the missing values would force this
# column to float instead of int.
movies['Year'] = movies['Year'].dt.year

In [34]:
# Year should now be a plain number again.
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [35]:
# Remove the trailing " (YYYY)" from Title now that the year has its own column.
# A regex anchored at the end of the string is used instead of slicing off the
# last 7 characters: titles without the suffix are left untouched, and
# re-running the cell does not eat further into the title.
movies['Title'] = movies['Title'].str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)

In [36]:
# Confirm the year suffix is gone from Title.
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [44]:
# Collect every distinct genre label, in order of first appearance.
# Each Genres value is a '|'-delimited string: split it into a list, flatten
# the lists to one label per row, drop missing entries, then de-duplicate.
genre_lists = movies['Genres'].str.split('|')
genres_unique = genre_lists.explode().dropna().unique()

In [46]:
# Display the array of distinct genre labels.
genres_unique

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [47]:
# Wrap the distinct genre labels in a single-column DataFrame ('Genre').
# NOTE: this rebinds `genres_unique` from an array to a DataFrame.
genres_unique = pd.DataFrame(genres_unique, columns=['Genre'])

In [50]:
# Preview the genre lookup table.
genres_unique.head()

Unnamed: 0,Genre
0,Animation
1,Children's
2,Comedy
3,Adventure
4,Fantasy


In [51]:
# One-hot encode the '|'-delimited Genres string: one boolean column per genre,
# appended to the movies table (row alignment is by index).
genre_flags = movies['Genres'].str.get_dummies(sep='|').astype(bool)
movies = movies.join(genre_flags)

In [52]:
# Inspect the boolean genre columns joined onto movies.
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,Animation|Children's|Comedy,1995,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji,Adventure|Children's|Fantasy,1995,False,True,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men,Comedy|Romance,1995,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
3,4,Waiting to Exhale,Comedy|Drama,1995,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,5,Father of the Bride Part II,Comedy,1995,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [54]:
# The raw Genres string is redundant once the per-genre boolean columns exist.
movies = movies.drop(columns='Genres')

In [55]:
# Verify the Genres column is gone and only the boolean genre columns remain.
movies.head()

Unnamed: 0,MovieID,Title,Year,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,False,False,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji,1995,False,True,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men,1995,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,4,Waiting to Exhale,1995,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,Father of the Bride Part II,1995,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
