# Datasources declaration

## IMDB Datasource

In [1]:
from imdb import IMDb
# Shared IMDb client instance; getFilm() calls imdb.get_movie() on it.
imdb = IMDb()
# Manual check of the client, left disabled:
# [ (x.data['title'], x.data['rating']) for x in imdb.get_top250_movies() ][0:25]

## MovieLens Datasource

In [2]:
import urllib.request
import zipfile

# Location of the ml-20m archive and the local paths used to store and unpack it.
url = 'http://files.grouplens.org/datasets/movielens/ml-20m.zip'
download_path = './data/ml-20m.zip'
extract_path = './data/'

# One-off download, left disabled; uncomment to fetch the archive into download_path.
# urllib.request.urlretrieve(url, download_path)

# One-off extraction, left disabled; uncomment to unpack the archive into extract_path.
# with zipfile.ZipFile(download_path, 'r') as zip:
#     zip.extractall(extract_path)

In [3]:
# Logical dataset name -> path of its CSV file under ./data/ml-20m/.
# Note: the 'movie' key is singular while the file it points to is movies.csv.
datasets = {
    'genome-scores': './data/ml-20m/genome-scores.csv',
    'genome-tags': './data/ml-20m/genome-tags.csv',
    'links': './data/ml-20m/links.csv',
    'movie': './data/ml-20m/movies.csv',
    'ratings': './data/ml-20m/ratings.csv',
    'tags': './data/ml-20m/tags.csv',
}

# Data Profiling

In [4]:
import pandas_profiling
import pandas as pd

def read(dataset):
    """Load one of the registered CSV datasets into a DataFrame.

    Args:
        dataset: Key of the module-level ``datasets`` mapping (e.g. ``'links'``).

    Returns:
        pandas.DataFrame with the contents of the CSV file.

    Raises:
        KeyError: If ``dataset`` is not a key of ``datasets``.
    """
    csv_path = datasets[dataset]
    # Announce the file being loaded so the output shows which file is read.
    print('Reading: ', csv_path)
    return pd.read_csv(csv_path)

In [5]:
from IPython.display import display

# for dataset_k in datasets:
#     df = read(dataset_k)
#     display(df.head())

# Cube creation

In [6]:
# Load the two tables used below: links (read by getFilm) and ratings (the facts).
links = read('links')
ratings = read('ratings')

Reading:  ./data/ml-20m/links.csv
Reading:  ./data/ml-20m/ratings.csv


## Facts

In [7]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


## Dimensions

### Time and Date

In [8]:
from datetime import datetime as dt

# Spanish month names, indexed by (month number - 1).
mesToTexto = [
    'enero',
    'febrero',
    'marzo',
    'abril',
    'mayo',
    'junio',
    'julio',
    'agosto',
    'septiembre',
    'octubre',
    'noviembre',
    'diciembre',
]

def getDate(ts, tz=None):
    """Break a Unix timestamp into the calendar fields of the date dimension.

    Args:
        ts: Seconds since the Unix epoch (int or float).
        tz: Optional ``datetime.tzinfo`` in which to interpret ``ts``. The
            default ``None`` keeps the original behaviour of using the
            machine's local timezone, so the result then varies between
            hosts; pass ``datetime.timezone.utc`` for reproducible output.

    Returns:
        dict with keys ``'anyo'`` (year), ``'mes'`` (month number, 1-12),
        ``'dia'`` (day of month) and ``'mes_texto'`` (Spanish month name).
    """
    moment = dt.fromtimestamp(ts, tz)
    return {
        'anyo': moment.year,
        'mes': moment.month,
        'dia': moment.day,
        # datetime months are 1-based, the lookup list is 0-based.
        'mes_texto': mesToTexto[moment.month - 1],
    }

def getHour(ts, tz=None):
    """Break a Unix timestamp into the clock fields of the hour dimension.

    Args:
        ts: Seconds since the Unix epoch (int or float).
        tz: Optional ``datetime.tzinfo`` in which to interpret ``ts``. The
            default ``None`` keeps the original behaviour of using the
            machine's local timezone, so the result then varies between
            hosts; pass ``datetime.timezone.utc`` for reproducible output.

    Returns:
        dict with keys ``'hour'``, ``'minute'`` and ``'second'``.
    """
    moment = dt.fromtimestamp(ts, tz)
    return {
        'hour': moment.hour,
        'minute': moment.minute,
        'second': moment.second,
    }

# Smoke test on the timestamp of the first rating. No timezone is passed here,
# so the values shown depend on the local timezone of the machine running it.
(getDate(1112486027), getHour(1112486027))

({'anyo': 2005, 'mes': 4, 'dia': 3, 'mes_texto': 'abril'},
 {'hour': 1, 'minute': 53, 'second': 47})

### Film

In [9]:
def getFilm(movie_id):
    """Resolve a MovieLens movie id to its IMDb title and release year.

    The id is translated through the ``links`` table (``movieId`` ->
    ``imdbId``) and the movie is then fetched with ``imdb.get_movie``.

    Args:
        movie_id: MovieLens ``movieId`` as stored in ``links`` (callers pass
            an int).

    Returns:
        dict with keys ``'title'``, ``'year'`` and ``'_movieId'`` (the
        MovieLens id converted to ``int``).

    Raises:
        KeyError: If ``movie_id`` has no row in ``links``.
    """
    print('Finding movie id')
    # Select row and column in a single .loc call and reduce to a scalar: the
    # previous chained indexing handed a whole pandas Series to get_movie().
    matches = links.loc[links['movieId'] == movie_id, 'imdbId']
    if matches.empty:
        raise KeyError('movieId {!r} not found in links'.format(movie_id))
    imdbId = int(matches.iloc[0])
    print('Fetching movie')
    movie = imdb.get_movie(imdbId)
    print('Movie fetched')
    return {
        'title': movie.data['title'],
        'year': movie.data['year'],
        '_movieId': int(movie_id),
    }

# Example lookup for MovieLens id 29; this goes through imdb.get_movie().
getFilm(29)

Finding movie id
Fetching movie
Movie fetched


{'title': 'The City of Lost Children', 'year': 1995, '_movieId': 29}

# ETL

In [10]:
# NOTE(review): this repeats, entry for entry, the `datasets` mapping already
# declared in the MovieLens Datasource section; consider keeping a single definition.
# Logical dataset name -> path of its CSV file under ./data/ml-20m/.
datasets = {
    'genome-scores': './data/ml-20m/genome-scores.csv',
    'genome-tags': './data/ml-20m/genome-tags.csv',
    'links': './data/ml-20m/links.csv',
    'movie': './data/ml-20m/movies.csv',
    'ratings': './data/ml-20m/ratings.csv',
    'tags': './data/ml-20m/tags.csv',
}

In [11]:
import sqlite3

# Open the SQLite database file and create the cursor shared by the ETL steps below.
db = sqlite3.connect('./data.db')
cursor = db.cursor()

In [12]:
# Read the SQL script from modeloA.sql, run it on the shared cursor and commit.
with open('./modeloA.sql') as schema:
    query = schema.read()
cursor.executescript(query)
db.commit()

In [13]:
import pandas_profiling
import pandas as pd

def read(dataset):
    """Load one of the registered CSV datasets into a DataFrame.

    NOTE(review): an identical ``read`` is already defined in the Data
    Profiling section; this definition silently replaces it.

    Args:
        dataset: Key of the module-level ``datasets`` mapping (e.g. ``'links'``).

    Returns:
        pandas.DataFrame with the contents of the CSV file.

    Raises:
        KeyError: If ``dataset`` is not a key of ``datasets``.
    """
    csv_path = datasets[dataset]
    # Announce the file being loaded so the output shows which file is read.
    print('Reading: ', csv_path)
    return pd.read_csv(csv_path)

In [14]:
def extractRating(*args):
    """Wrap positional row fields into a single rating record.

    Args:
        *args: Row fields; positions 1, 2 and 3 are used as movie id, rating
            and timestamp. Position 0 is ignored.

    Yields:
        One dict with keys ``'movieId'``, ``'rating'`` and ``'timestamp'``.

    Raises:
        IndexError: On iteration, if fewer than four positional values were given.
    """
    record = dict(movieId=args[1], rating=args[2], timestamp=args[3])
    yield record

In [15]:
def addDate(rating):
    """Attach the ``Fecha`` row key to a rating, inserting the row when missing.

    The rating's timestamp is decoded with ``getDate``; the ``Fecha`` table is
    searched by (dia, mes, anyo), where ``mes`` holds the month *name*. If no
    row matches, one is inserted and committed.

    Args:
        rating: dict with at least a ``'timestamp'`` entry convertible by ``int()``.

    Yields:
        The same ``rating`` dict, mutated to carry ``'dateId'``.
    """
    parts = getDate(int(rating['timestamp']))
    day, month_name, year = parts['dia'], parts['mes_texto'], parts['anyo']

    cursor.execute(
        'SELECT * from Fecha WHERE dia=? AND mes=? AND anyo=?',
        (day, month_name, year),
    )
    existing = cursor.fetchone()

    if existing is not None:
        # The first column of the matched row is used as the key.
        rating['dateId'] = existing[0]
    else:
        label = "{}-{}-{}".format(year, month_name, day)
        cursor.execute(
            'INSERT INTO Fecha (fecha, dia, mes, anyo) VALUES (?, ?, ?, ?)',
            (label, day, month_name, year),
        )
        db.commit()
        rating['dateId'] = cursor.lastrowid
    yield rating

In [16]:
def addHour(rating):
    """Attach the ``Hora`` row key to a rating, inserting the row when missing.

    The rating's timestamp is decoded with ``getHour``; the ``Hora`` table is
    searched by (hora, minuto, segundo). If no row matches, one is inserted
    (storing the rating's timestamp value alongside) and committed.

    Args:
        rating: dict with at least a ``'timestamp'`` entry convertible by ``int()``.

    Yields:
        The same ``rating`` dict, mutated to carry ``'hourId'``.
    """
    clock = getHour(int(rating['timestamp']))
    hms = (clock['hour'], clock['minute'], clock['second'])

    cursor.execute(
        'SELECT * from Hora WHERE hora=? AND minuto=? AND segundo=?',
        hms,
    )
    existing = cursor.fetchone()

    if existing is not None:
        # The first column of the matched row is used as the key.
        rating['hourId'] = existing[0]
    else:
        cursor.execute(
            'INSERT INTO Hora (hora, minuto,segundo, timestamp) VALUES (?, ?, ?, ?)',
            hms + (rating['timestamp'],),
        )
        db.commit()
        rating['hourId'] = cursor.lastrowid
    yield rating

In [17]:
# Reload the links table that getFilm() reads (it is also loaded in the Cube creation section).
links = read('links')
def addFilm(rating):
    """Attach the ``Pelicula`` row key to a rating, inserting the row when missing.

    The ``Pelicula`` table is searched by ``idNatural`` (the MovieLens movie
    id) first. ``getFilm`` is called only when the film is not stored yet, so
    repeated ratings of the same film do not trigger another IMDb lookup.

    Args:
        rating: dict with at least a ``'movieId'`` entry convertible by ``int()``.

    Yields:
        The same ``rating`` dict, mutated to carry ``'filmId'``.
    """
    movie_id = int(rating['movieId'])
    cursor.execute('SELECT * from Pelicula WHERE idNatural=?', (movie_id,))
    data = cursor.fetchone()
    if data is None:
        # Unknown film: fetch its details once and store them.
        film = getFilm(movie_id)
        pelicula = (film['_movieId'], film['title'], film['year'])
        cursor.execute('INSERT INTO Pelicula (idNatural, titulo, anyoLanzamiento) VALUES (?, ?, ?)', pelicula)
        db.commit()
        rating['filmId'] = cursor.lastrowid
    else:
        # The first column of the matched row is used as the key.
        rating['filmId'] = data[0]
    yield rating

Reading:  ./data/ml-20m/links.csv


In [18]:
def load(rating):
    """Insert one fully keyed rating into the ``Voto`` table and commit.

    NOTE(review): the recorded run of this notebook ended with
    ``IntegrityError: UNIQUE constraint failed: Voto.clvPelicula``. Presumably
    it was raised by this INSERT when a film received a second vote; check the
    ``Voto`` definition in modeloA.sql.

    Args:
        rating: dict carrying ``'filmId'``, ``'hourId'``, ``'dateId'`` and
            ``'rating'``.

    Yields:
        A single ``None`` once the row has been committed.
    """
    row = tuple(rating[key] for key in ('filmId', 'hourId', 'dateId', 'rating'))
    cursor.execute('INSERT INTO Voto (clvPelicula, clvHora, clvFecha, nota) VALUES (?, ?, ?, ?)', row)
    db.commit()
    yield None

In [19]:
def extract():
    """Stream the ratings CSV as one dict per data row.

    The file at ``datasets['ratings']`` is read line by line. The first line
    (header) is skipped, blank lines are ignored, and an empty file simply
    yields nothing.

    Yields:
        dict with string values under ``'movieId'``, ``'rating'`` and
        ``'timestamp'`` (columns 1-3 of the row; column 0 is not kept).
    """
    with open(datasets['ratings']) as f:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator (a RuntimeError under PEP 479).
        next(f, None)
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing with IndexError.
                continue
            fields = line.split(',')
            yield {
                'movieId': fields[1],
                'rating': fields[2],
                'timestamp': fields[3].strip('\n'),
            }

In [20]:
# Drive every extracted rating through the pipeline stages in order
# (date key -> hour key -> film key -> insert). Each stage is a generator,
# so the stages are chained with nested loops.
for raw_rating in extract():
    for dated in addDate(raw_rating):
        print('DATE',dated)
        for timed in addHour(dated):
            print('HOUR',timed)
            for keyed in addFilm(timed):
                print('FILM',keyed)
                for _ in load(keyed):
                    pass

DATE {'movieId': '2', 'rating': '3.5', 'timestamp': '1112486027', 'dateId': 1}
HOUR {'movieId': '2', 'rating': '3.5', 'timestamp': '1112486027', 'dateId': 1, 'hourId': 1}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '2', 'rating': '3.5', 'timestamp': '1112486027', 'dateId': 1, 'hourId': 1, 'filmId': 1}
DATE {'movieId': '29', 'rating': '3.5', 'timestamp': '1112484676', 'dateId': 1}
HOUR {'movieId': '29', 'rating': '3.5', 'timestamp': '1112484676', 'dateId': 1, 'hourId': 2}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '29', 'rating': '3.5', 'timestamp': '1112484676', 'dateId': 1, 'hourId': 2, 'filmId': 2}
DATE {'movieId': '32', 'rating': '3.5', 'timestamp': '1112484819', 'dateId': 1}
HOUR {'movieId': '32', 'rating': '3.5', 'timestamp': '1112484819', 'dateId': 1, 'hourId': 3}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '32', 'rating': '3.5', 'timestamp': '1112484819', 'dateId': 1, 'hourId': 3, 'filmId': 3}
DATE {'movieId': '47', 'rati

Movie fetched
FILM {'movieId': '1089', 'rating': '3.5', 'timestamp': '1112484669', 'dateId': 1, 'hourId': 26, 'filmId': 26}
DATE {'movieId': '1090', 'rating': '4.0', 'timestamp': '1112485453', 'dateId': 1}
HOUR {'movieId': '1090', 'rating': '4.0', 'timestamp': '1112485453', 'dateId': 1, 'hourId': 27}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '1090', 'rating': '4.0', 'timestamp': '1112485453', 'dateId': 1, 'hourId': 27, 'filmId': 27}
DATE {'movieId': '1097', 'rating': '4.0', 'timestamp': '1112485701', 'dateId': 1}
HOUR {'movieId': '1097', 'rating': '4.0', 'timestamp': '1112485701', 'dateId': 1, 'hourId': 28}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '1097', 'rating': '4.0', 'timestamp': '1112485701', 'dateId': 1, 'hourId': 28, 'filmId': 28}
DATE {'movieId': '1136', 'rating': '3.5', 'timestamp': '1112484609', 'dateId': 1}
HOUR {'movieId': '1136', 'rating': '3.5', 'timestamp': '1112484609', 'dateId': 1, 'hourId': 29}
Finding movie id
Fetching movi

Movie fetched
FILM {'movieId': '1291', 'rating': '3.5', 'timestamp': '1112485525', 'dateId': 1, 'hourId': 50, 'filmId': 51}
DATE {'movieId': '1304', 'rating': '3.0', 'timestamp': '1094785720', 'dateId': 2}
HOUR {'movieId': '1304', 'rating': '3.0', 'timestamp': '1094785720', 'dateId': 2, 'hourId': 51}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '1304', 'rating': '3.0', 'timestamp': '1094785720', 'dateId': 2, 'hourId': 51, 'filmId': 52}
DATE {'movieId': '1321', 'rating': '4.0', 'timestamp': '1094786062', 'dateId': 2}
HOUR {'movieId': '1321', 'rating': '4.0', 'timestamp': '1094786062', 'dateId': 2, 'hourId': 52}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '1321', 'rating': '4.0', 'timestamp': '1094786062', 'dateId': 2, 'hourId': 52, 'filmId': 53}
DATE {'movieId': '1333', 'rating': '4.0', 'timestamp': '1112484990', 'dateId': 1}
HOUR {'movieId': '1333', 'rating': '4.0', 'timestamp': '1112484990', 'dateId': 1, 'hourId': 53}
Finding movie id
Fetching movi

Movie fetched
FILM {'movieId': '2174', 'rating': '4.0', 'timestamp': '1112485843', 'dateId': 1, 'hourId': 75, 'filmId': 76}
DATE {'movieId': '2193', 'rating': '4.0', 'timestamp': '1112485753', 'dateId': 1}
HOUR {'movieId': '2193', 'rating': '4.0', 'timestamp': '1112485753', 'dateId': 1, 'hourId': 76}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '2193', 'rating': '4.0', 'timestamp': '1112485753', 'dateId': 1, 'hourId': 76, 'filmId': 77}
DATE {'movieId': '2194', 'rating': '3.5', 'timestamp': '1112485486', 'dateId': 1}
HOUR {'movieId': '2194', 'rating': '3.5', 'timestamp': '1112485486', 'dateId': 1, 'hourId': 77}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '2194', 'rating': '3.5', 'timestamp': '1112485486', 'dateId': 1, 'hourId': 77, 'filmId': 78}
DATE {'movieId': '2253', 'rating': '3.5', 'timestamp': '1112486122', 'dateId': 1}
HOUR {'movieId': '2253', 'rating': '3.5', 'timestamp': '1112486122', 'dateId': 1, 'hourId': 78}
Finding movie id
Fetching movi

Movie fetched
FILM {'movieId': '3037', 'rating': '3.5', 'timestamp': '1112485563', 'dateId': 1, 'hourId': 100, 'filmId': 101}
DATE {'movieId': '3081', 'rating': '4.0', 'timestamp': '1094786143', 'dateId': 2}
HOUR {'movieId': '3081', 'rating': '4.0', 'timestamp': '1094786143', 'dateId': 2, 'hourId': 101}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '3081', 'rating': '4.0', 'timestamp': '1094786143', 'dateId': 2, 'hourId': 101, 'filmId': 102}
DATE {'movieId': '3153', 'rating': '4.0', 'timestamp': '1112485816', 'dateId': 1}
HOUR {'movieId': '3153', 'rating': '4.0', 'timestamp': '1112485816', 'dateId': 1, 'hourId': 102}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '3153', 'rating': '4.0', 'timestamp': '1112485816', 'dateId': 1, 'hourId': 102, 'filmId': 103}
DATE {'movieId': '3265', 'rating': '3.5', 'timestamp': '1112484525', 'dateId': 1}
HOUR {'movieId': '3265', 'rating': '3.5', 'timestamp': '1112484525', 'dateId': 1, 'hourId': 103}
Finding movie id
Fetc

Movie fetched
FILM {'movieId': '4878', 'rating': '3.5', 'timestamp': '1112484537', 'dateId': 1, 'hourId': 125, 'filmId': 126}
DATE {'movieId': '4896', 'rating': '4.0', 'timestamp': '1112485731', 'dateId': 1}
HOUR {'movieId': '4896', 'rating': '4.0', 'timestamp': '1112485731', 'dateId': 1, 'hourId': 126}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '4896', 'rating': '4.0', 'timestamp': '1112485731', 'dateId': 1, 'hourId': 126, 'filmId': 127}
DATE {'movieId': '4911', 'rating': '4.0', 'timestamp': '1112485961', 'dateId': 1}
HOUR {'movieId': '4911', 'rating': '4.0', 'timestamp': '1112485961', 'dateId': 1, 'hourId': 127}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '4911', 'rating': '4.0', 'timestamp': '1112485961', 'dateId': 1, 'hourId': 127, 'filmId': 128}
DATE {'movieId': '4915', 'rating': '3.0', 'timestamp': '1112486076', 'dateId': 1}
HOUR {'movieId': '4915', 'rating': '3.0', 'timestamp': '1112486076', 'dateId': 1, 'hourId': 128}
Finding movie id
Fetc

Movie fetched
FILM {'movieId': '6755', 'rating': '3.5', 'timestamp': '1094786067', 'dateId': 2, 'hourId': 149, 'filmId': 151}
DATE {'movieId': '6774', 'rating': '4.0', 'timestamp': '1112485786', 'dateId': 1}
HOUR {'movieId': '6774', 'rating': '4.0', 'timestamp': '1112485786', 'dateId': 1, 'hourId': 150}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '6774', 'rating': '4.0', 'timestamp': '1112485786', 'dateId': 1, 'hourId': 150, 'filmId': 152}
DATE {'movieId': '6807', 'rating': '3.5', 'timestamp': '1112484686', 'dateId': 1}
HOUR {'movieId': '6807', 'rating': '3.5', 'timestamp': '1112484686', 'dateId': 1, 'hourId': 151}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '6807', 'rating': '3.5', 'timestamp': '1112484686', 'dateId': 1, 'hourId': 151, 'filmId': 153}
DATE {'movieId': '6834', 'rating': '3.5', 'timestamp': '1112486060', 'dateId': 1}
HOUR {'movieId': '6834', 'rating': '3.5', 'timestamp': '1112486060', 'dateId': 1, 'hourId': 152}
Finding movie id
Fetc

Movie fetched
FILM {'movieId': '3', 'rating': '4.0', 'timestamp': '974820889', 'dateId': 3, 'hourId': 174, 'filmId': 176}
DATE {'movieId': '62', 'rating': '5.0', 'timestamp': '974820598', 'dateId': 3}
HOUR {'movieId': '62', 'rating': '5.0', 'timestamp': '974820598', 'dateId': 3, 'hourId': 175}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '62', 'rating': '5.0', 'timestamp': '974820598', 'dateId': 3, 'hourId': 175, 'filmId': 177}
DATE {'movieId': '70', 'rating': '5.0', 'timestamp': '974820691', 'dateId': 3}
HOUR {'movieId': '70', 'rating': '5.0', 'timestamp': '974820691', 'dateId': 3, 'hourId': 176}
Finding movie id
Fetching movie
Movie fetched
FILM {'movieId': '70', 'rating': '5.0', 'timestamp': '974820691', 'dateId': 3, 'hourId': 176, 'filmId': 178}
DATE {'movieId': '110', 'rating': '4.0', 'timestamp': '974820658', 'dateId': 3}
HOUR {'movieId': '110', 'rating': '4.0', 'timestamp': '974820658', 'dateId': 3, 'hourId': 177}
Finding movie id
Fetching movie
Movie fetched
F

IntegrityError: UNIQUE constraint failed: Voto.clvPelicula