# Datasources declaration

## IMDB Datasource

In [1]:
from imdb import IMDb
# Shared IMDb client instance; getFilm() further down calls imdb.get_movie() on it.
imdb = IMDb()
# Disabled example: list (title, rating) for the first 25 entries of the top-250 list.
# [ (x.data['title'], x.data['rating']) for x in imdb.get_top250_movies() ][0:25]

## MovieLens Datasource

In [2]:
import urllib.request
import zipfile

# Location of the MovieLens 20M archive and the local paths used by the
# download/extract step below.
url = 'http://files.grouplens.org/datasets/movielens/ml-20m.zip'
download_path = './data/ml-20m.zip'
extract_path = './data/'

# The download and the extraction are commented out, so running this cell only
# defines the paths above. Uncomment both steps to fetch and unpack the archive.
# urllib.request.urlretrieve(url, download_path)

# with zipfile.ZipFile(download_path, 'r') as zip:
#     zip.extractall(extract_path)

In [3]:
# Maps a short dataset name to the path of its CSV file under ./data/ml-20m/.
# read() looks paths up in this dict by key.
# Note that the key 'movie' (singular) points at movies.csv (plural).
datasets = {
    'genome-scores': './data/ml-20m/genome-scores.csv',
    'genome-tags': './data/ml-20m/genome-tags.csv',
    'links': './data/ml-20m/links.csv',
    'movie': './data/ml-20m/movies.csv',
    'ratings': './data/ml-20m/ratings.csv',
    'tags': './data/ml-20m/tags.csv',
}

# Data Profiling

In [4]:
import pandas_profiling
import pandas as pd

def read(dataset):
    """Load one of the registered CSV files into a pandas DataFrame.

    Args:
        dataset: Key into the module-level ``datasets`` dict.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the mapped path.

    Raises:
        KeyError: If ``dataset`` is not a key of ``datasets``.
    """
    csv_path = datasets[dataset]
    print('Reading: ', csv_path)
    return pd.read_csv(csv_path)

In [5]:
from IPython.display import display

# for dataset_k in datasets:
#     df = read(dataset_k)
#     display(df.head())

# Cube creation

In [6]:
# Load the two tables used below: `links` is queried by getFilm() to map a
# movieId to its imdbId, and `ratings` is previewed in the Facts section.
links = read('links')
ratings = read('ratings')

Reading:  ./data/ml-20m/links.csv
Reading:  ./data/ml-20m/ratings.csv


## Facts

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


## Dimensions

### Time and Date

In [8]:
from datetime import datetime as dt

# Spanish month names, indexed by (month number - 1).
mesToTexto = [
    'enero',
    'febrero',
    'marzo',
    'abril',
    'mayo',
    'junio',
    'julio',
    'agosto',
    'septiembre',
    'octubre',
    'noviembre',
    'diciembre',
]

def getDate(ts):
    """Build the date-dimension record for a Unix timestamp.

    The timestamp is interpreted in UTC so the result does not depend on the
    timezone of the machine running the notebook.

    Args:
        ts: Seconds since the Unix epoch (int or float).

    Returns:
        dict with keys ``anyo`` (year), ``mes`` (month number, 1-12),
        ``dia`` (day of month) and ``mes_texto`` (Spanish month name).
    """
    # Local import keeps this cell self-contained; the notebook only imports
    # `datetime` (as `dt`) from the datetime module.
    from datetime import timezone

    moment = dt.fromtimestamp(ts, tz=timezone.utc)
    return {
        'anyo':  moment.year,
        'mes':  moment.month,
        'dia':  moment.day,
        'mes_texto': mesToTexto[moment.month-1]
    }

def getHour(ts):
    """Build the time-of-day dimension record for a Unix timestamp.

    The timestamp is interpreted in UTC so the result does not depend on the
    timezone of the machine running the notebook.

    Args:
        ts: Seconds since the Unix epoch (int or float).

    Returns:
        dict with keys ``hour``, ``minute`` and ``second``.
    """
    # Local import keeps this cell self-contained; the notebook only imports
    # `datetime` (as `dt`) from the datetime module.
    from datetime import timezone

    moment = dt.fromtimestamp(ts, tz=timezone.utc)
    return {
        'hour':  moment.hour,
        'minute':  moment.minute,
        'second':  moment.second,
    }

(getDate(1112486027), getHour(1112486027))

({'anyo': 2005, 'mes': 4, 'dia': 2, 'mes_texto': 'abril'},
 {'hour': 23, 'minute': 53, 'second': 47})

### Film

In [9]:
def getFilm(movie_id):
    """Fetch the film-dimension record for a MovieLens movie id.

    Looks up the IMDb id for ``movie_id`` in the module-level ``links``
    DataFrame (columns ``movieId`` and ``imdbId``) and fetches the movie
    through the module-level ``imdb`` client.

    Args:
        movie_id: MovieLens movie id, compared against ``links['movieId']``.

    Returns:
        dict with keys ``title``, ``year`` and ``_movieId`` (the input id).

    Raises:
        KeyError: If ``movie_id`` has no row in ``links``.
    """
    print('Finding movie id')
    # Single .loc call with row mask and column label instead of chained indexing.
    matches = links.loc[links['movieId'] == movie_id, 'imdbId']
    if matches.empty:
        raise KeyError('movieId %r not found in links' % (movie_id,))
    # Pass a plain int rather than a pandas Series to the IMDb client.
    imdb_id = int(matches.iloc[0])
    print('Fetching movie')
    movie = imdb.get_movie(imdb_id)
    print('Movie fetched')
    return {
        'title': movie.data['title'],
        'year': movie.data['year'],
        '_movieId': movie_id,
    }

getFilm(29)

Finding movie id
Fetching movie
Movie fetched


{'title': 'The City of Lost Children', 'year': 1995, '_movieId': 29}

# ETL

In [10]:
# NOTE(review): this dict repeats, entry for entry, the `datasets` definition
# from the "MovieLens Datasource" section earlier in the notebook.
# Maps a short dataset name to the path of its CSV file under ./data/ml-20m/.
datasets = {
    'genome-scores': './data/ml-20m/genome-scores.csv',
    'genome-tags': './data/ml-20m/genome-tags.csv',
    'links': './data/ml-20m/links.csv',
    'movie': './data/ml-20m/movies.csv',
    'ratings': './data/ml-20m/ratings.csv',
    'tags': './data/ml-20m/tags.csv',
}

In [11]:
import pandas_profiling
import pandas as pd

def read(dataset):
    """Load one of the registered CSV files into a pandas DataFrame.

    NOTE(review): same definition as the ``read`` in the "Data Profiling"
    section; this later one shadows it.

    Args:
        dataset: Key into the module-level ``datasets`` dict.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the mapped path.
    """
    csv_path = datasets[dataset]
    print('Reading: ', csv_path)
    return pd.read_csv(csv_path)

In [12]:
def extractRating(*args):
    """Turn a positional rating row into a dict, dropping the first field.

    Positions 1, 2 and 3 of ``args`` become ``movieId``, ``rating`` and
    ``timestamp``; position 0 is ignored (presumably the userId column of
    ratings.csv; confirm against the reader that feeds this step).

    Yields:
        A single dict with keys ``movieId``, ``rating`` and ``timestamp``.
    """
    movie_id, score, rated_at = args[1], args[2], args[3]
    yield {'movieId': movie_id, 'rating': score, 'timestamp': rated_at}

In [13]:
def addDate(rating):
    """Pipeline step: attach the date-dimension key to a rating row.

    Builds the date record from ``rating['timestamp']`` via ``getDate``. The
    record is not stored anywhere yet, so ``dateId`` receives a placeholder.
    The incoming dict is modified in place and then yielded.
    """
    date_record = getDate(int(rating['timestamp']))
    # TODO: insert date_record into the database and use the ID it returns.
    rating.update(dateId='dateidmock')
    yield rating

In [14]:
def addHour(rating):
    """Pipeline step: attach the hour-dimension key to a rating row.

    Builds the time-of-day record from ``rating['timestamp']`` via
    ``getHour``. The record is not stored anywhere yet, so ``hourId`` receives
    a placeholder. The incoming dict is modified in place and then yielded.
    """
    hour_record = getHour(int(rating['timestamp']))
    # TODO: check whether this hour already exists.
    # TODO: insert hour_record into the database and use the ID it returns.
    rating.update(hourId='houridmock')
    yield rating

In [16]:
links = read('links')
def addFilm(rating):
    """Pipeline step: attach the film-dimension key to a rating row.

    Resolves the film for ``rating['movieId']`` via ``getFilm``, which calls
    ``imdb.get_movie`` once per row. The film record is not stored anywhere
    yet, so ``filmId`` receives a placeholder. The incoming dict is modified
    in place and then yielded.
    """
    # TODO: first check whether the film already exists (SELECT ... WHERE movieID).
    film_record = getFilm(int(rating['movieId']))
    # TODO: insert film_record into the database and use the ID it returns.
    rating.update(filmId='filmidmock')
    yield rating

Reading:  ./data/ml-20m/links.csv


In [17]:
def load(rating):
    """Final pipeline step: print the enriched rating row to stdout."""
    print(rating)

In [18]:
import bonobo

# extract = bonobo.CsvReader(datasets['ratings'])

def extract():
    """Yield ten hard-coded rating rows used as sample input for the pipeline.

    Each row is a fresh dict with string-valued ``movieId``, ``rating`` and
    ``timestamp`` keys.
    """
    sample_rows = (
        ('2', '3.5', '1112486027'),
        ('29', '3.5', '1112484676'),
        ('32', '3.5', '1112484819'),
        ('47', '3.5', '1112484727'),
        ('50', '3.5', '1112484580'),
        ('112', '3.5', '1094785740'),
        ('151', '4.0', '1094785734'),
        ('223', '4.0', '1112485573'),
        ('253', '4.0', '1112484940'),
        ('260', '4.0', '1112484826'),
    )
    for movie_id, score, rated_at in sample_rows:
        yield {'movieId': movie_id, 'rating': score, 'timestamp': rated_at}

# Assemble the ETL pipeline as a single chain:
# extract -> addDate -> addHour -> addFilm -> load.
graph = bonobo.Graph()
graph.add_chain(
    extract,
    # extractRating is left out of the chain: the extract() stub above already
    # yields dicts with movieId / rating / timestamp keys.
#     extractRating,
    addDate,
    addHour,
    addFilm,
    load,
)

GraphRange(graph=<bonobo.structs.graphs.Graph object at 0x7f7f75546668>, input=0, output=4)

In [19]:
bonobo.run(graph)

BonoboWidget()

Finding movie id
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '2', 'rating': '3.5', 'timestamp': '1112486027', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}Fetching movie



INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '29', 'rating': '3.5', 'timestamp': '1112484676', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '32', 'rating': '3.5', 'timestamp': '1112484819', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '47', 'rating': '3.5', 'timestamp': '1112484727', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '50', 'rating': '3.5', 'timestamp': '1112484580', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '112', 'rating': '3.5', 'timestamp': '1094785740', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '151', 'rating': '4.0', 'timestamp': '1094785734', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '223', 'rating': '4.0', 'timestamp': '1112485573', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}Fetching movie



INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
Finding movie id
{'movieId': '253', 'rating': '4.0', 'timestamp': '1112484940', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}
Fetching movie


INFO:imdb.parser.http.piculet:using lxml html builder
INFO:imdb.parser.http.piculet:using lxml html builder


Movie fetched
{'movieId': '260', 'rating': '4.0', 'timestamp': '1112484826', 'dateId': 'dateidmock', 'hourId': 'houridmock', 'filmId': 'filmidmock'}


<bonobo.execution.contexts.graph.GraphExecutionContext at 0x7f7f54909cc0>