In [33]:
from math import sqrt
import pandas as pd
import numpy as np

In [34]:
# Functions used to compute similarities and recommendation scores
def euclidiana(base, usuario1, usuario2):
    """Return a Euclidean-distance-based similarity between two entries of ``base``.

    Args:
        base: Mapping ``key -> {item: rating}``.
        usuario1: Key of the first entry in ``base``.
        usuario2: Key of the second entry in ``base``.

    Returns:
        ``0`` when the two entries have no rated item in common; otherwise
        ``1 / (1 + d)``, where ``d`` is the Euclidean distance between the
        two entries' ratings restricted to the items they share. Identical
        ratings on every shared item therefore give ``1.0``.
    """
    notas1 = base[usuario1]
    # Single pass: the squared differences double as the "shared items" check,
    # so no throwaway lookup structure is needed.
    diferencas = [(notas1[item] - base[usuario2][item]) ** 2
                  for item in notas1 if item in base[usuario2]]
    if not diferencas:
        return 0

    return 1 / (1 + sqrt(sum(diferencas)))


def getSimilares(base, usuario):
    """Rank every other key of ``base`` by its similarity to ``usuario``.

    Returns:
        At most 30 ``(similarity, key)`` tuples sorted in descending order.
        Whole tuples are compared, so entries with equal similarity are
        ordered by key, also descending.
    """
    pontuacoes = []
    for candidato in base:
        if candidato != usuario:
            pontuacoes.append((euclidiana(base, usuario, candidato), candidato))

    pontuacoes.sort(reverse=True)
    return pontuacoes[:30]


def getRecomendacoesUsuario(base, usuario):
    """Recommend items to ``usuario`` from the ratings of similar entries.

    For every item that ``usuario`` has not rated, the predicted score is the
    similarity-weighted average of the ratings given by the other entries of
    ``base``. Entries whose similarity is zero or negative are ignored.

    Returns:
        At most 30 ``(predicted_score, item)`` tuples, best score first.
    """
    soma_ponderada = {}
    soma_pesos = {}

    for vizinho in base:
        if vizinho == usuario:
            continue

        peso = euclidiana(base, usuario, vizinho)
        if peso <= 0:
            # No overlap with this neighbour: it carries no information.
            continue

        for filme in base[vizinho]:
            if filme in base[usuario]:
                continue
            soma_ponderada[filme] = soma_ponderada.get(filme, 0) + peso * base[vizinho][filme]
            soma_pesos[filme] = soma_pesos.get(filme, 0) + peso

    classificacao = []
    for filme, total in soma_ponderada.items():
        classificacao.append((total / soma_pesos[filme], filme))

    classificacao.sort(reverse=True)
    return classificacao[:30]


def carregarMovieLens(path='./ml-100k', encoding='ISO-8859-1'):
    """Load MovieLens ratings as ``{user_id: {movie_title: rating}}``.

    Reads ``<path>/u.item`` (pipe-separated; the first two fields are the
    movie id and title) and ``<path>/u.data`` (tab-separated; the first three
    fields are user id, movie id and rating).

    Args:
        path: Directory containing ``u.item`` and ``u.data``.
        encoding: Text encoding used to read both files. The titles contain
            non-ASCII characters, so relying on the platform default makes
            the load platform-dependent. The ISO-8859-1 default assumes the
            standard ml-100k distribution -- confirm against its README.

    Returns:
        Dict keyed by user id (str) whose values map movie title (str) to
        rating (float). Ratings are keyed by title, so two movie ids that
        share a title end up in the same entry (the later line wins).

    Raises:
        KeyError: If ``u.data`` references a movie id missing from ``u.item``.
    """
    filmes = {}
    # Context managers guarantee the handles are closed even on error.
    with open(path + '/u.item', encoding=encoding) as arquivo_filmes:
        for linha in arquivo_filmes:
            if not linha.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            id_filme, titulo = linha.split('|')[0:2]
            filmes[id_filme] = titulo

    base = {}
    with open(path + '/u.data', encoding=encoding) as arquivo_notas:
        for linha in arquivo_notas:
            if not linha.strip():
                continue
            usuario, id_filme, avaliacao = linha.split('\t')[0:3]
            base.setdefault(usuario, {})[filmes[id_filme]] = float(avaliacao)
    return base


def calculaItensSimilares(base):
    """Map every key of ``base`` to its ``getSimilares`` ranking.

    Returns:
        Dict ``key -> list of (similarity, other_key)`` as produced by
        ``getSimilares(base, key)``.
    """
    return {chave: getSimilares(base, chave) for chave in base}


def getRecomendacoesItens(baseUsuario, itensSimilares, usuario):
    """Recommend items to ``usuario`` using pre-computed item similarities.

    For every item the user rated, each similar item the user has not rated
    yet receives that rating weighted by the similarity. The predicted score
    of a candidate is the weighted sum divided by the sum of the weights.

    Args:
        baseUsuario: Mapping ``user -> {item: rating}``.
        itensSimilares: Mapping ``item -> iterable of (similarity, other_item)``.
        usuario: Key of the target user in ``baseUsuario``.

    Returns:
        At most 30 ``(predicted_score, item)`` tuples, best score first.

    Raises:
        KeyError: If ``usuario`` is not in ``baseUsuario`` or one of the
            user's rated items is not in ``itensSimilares``.
    """
    notasUsuario = baseUsuario[usuario]
    notas = {}
    totalSimilaridade = {}

    for item, nota in notasUsuario.items():
        for similaridade, item2 in itensSimilares[item]:
            if item2 in notasUsuario:
                continue
            # A non-positive similarity carries no information; skipping it
            # also keeps the weight total from being 0, which would make the
            # division below raise ZeroDivisionError.
            if similaridade <= 0:
                continue

            notas[item2] = notas.get(item2, 0) + nota * similaridade
            totalSimilaridade[item2] = totalSimilaridade.get(item2, 0) + similaridade

    ranking = [(total / totalSimilaridade[item2], item2) for item2, total in notas.items()]

    return sorted(ranking, reverse=True)[0:30]

In [35]:
# Load the MovieLens ratings as a nested dictionary: {user id: {movie title: rating}}
base = carregarMovieLens()

In [36]:
# Function to check whether a value can be converted to float
def isfloat(num):
    """Return True if ``float(num)`` succeeds, False otherwise.

    Values of an unsupported type (e.g. ``None`` or a list) make ``float``
    raise ``TypeError`` rather than ``ValueError``; both mean "not a float".
    Note that strings such as ``'nan'`` and ``'inf'`` are accepted by
    ``float`` and therefore yield True.
    """
    try:
        float(num)
    except (ValueError, TypeError):
        return False
    return True

    
# Invert the nested ratings dictionary so that outer and inner keys swap
# places: {user: {movie: rating}} -> {movie: {user: rating}}.
data = base
##########
df_normal = pd.DataFrame(data=data)    # outer keys become columns, inner keys become rows
df_invertido = df_normal.transpose()   # rows and columns swapped
# Combinations missing from `data` are NaN in the frame. Dropping them per
# column keeps only the real ratings, with no sentinel value to fill in and
# strip out again afterwards.
avaliacoesFilmes = {
    chave: df_invertido[chave].dropna().to_dict()
    for chave in df_invertido.columns
}
# The resulting dataset is called: avaliacoesFilmes

In [37]:
# Pre-compute the similarity ranking of every movie once, so that item-based
# recommendations can reuse it instead of recomputing similarities per request
itensSimilares = calculaItensSimilares(avaliacoesFilmes)

# Tests

### Users similarity

In [38]:
# Similarity between users '1' and '2', based on the movies both have rated
euclidiana(base, '1', '2')

0.16139047779640892

In [39]:
# Up to 30 users most similar to user '1', as (similarity, user id) tuples
getSimilares(base, '1')

[(1.0, '812'),
 (1.0, '418'),
 (1.0, '155'),
 (0.5, '729'),
 (0.5, '631'),
 (0.5, '351'),
 (0.5, '309'),
 (0.5, '273'),
 (0.4142135623730951, '876'),
 (0.4142135623730951, '485'),
 (0.4142135623730951, '111'),
 (0.36602540378443865, '687'),
 (0.36602540378443865, '105'),
 (0.3333333333333333, '895'),
 (0.3333333333333333, '811'),
 (0.3333333333333333, '685'),
 (0.3333333333333333, '531'),
 (0.3333333333333333, '39'),
 (0.3333333333333333, '356'),
 (0.3333333333333333, '341'),
 (0.3333333333333333, '282'),
 (0.3333333333333333, '260'),
 (0.3333333333333333, '107'),
 (0.3090169943749474, '9'),
 (0.3090169943749474, '842'),
 (0.3090169943749474, '820'),
 (0.3090169943749474, '696'),
 (0.3090169943749474, '547'),
 (0.3090169943749474, '520'),
 (0.3090169943749474, '516')]

In [40]:
# User-based recommendations for user '1': (predicted rating, movie title)
getRecomendacoesUsuario(base, '1')

[(5.000000000000001, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'They Made Me a Criminal (1939)'),
 (5.0, "Someone Else's America (1995)"),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Prefontaine (1997)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Aiqing wansui (1994)'),
 (4.999999999999999, 'Star Kid (1997)'),
 (4.709540786352371, 'Pather Panchali (1955)'),
 (4.623660893418929, "Some Mother's Son (1996)"),
 (4.605117943969987, 'Anna (1996)'),
 (4.542493480195261, 'Letter From Death Row, A (1998)'),
 (4.514686319613309, 'Close Shave, A (1995)'),
 (4.503301065093885, "Schindler's List (1993)"),
 (4.474029296757844, 'Casablanca (1942)'),
 (4.44823347681708, 'Third Man, The (1949)'),
 (4.4388914364740515, 'Faust (1994)'),
 (4.412532456819868, 'Everest (1998)'),
 (4.402793159814662, 'Rear Window (1954)'),
 (4.364687978517131, 'Bitter Sugar (Azucar Amargo) (1996)')

### Items similarity

In [41]:
# Similarity between two movies, based on the users who rated both
euclidiana(avaliacoesFilmes, 'Kolya (1996)', '101 Dalmatians (1996)')

0.1482675827043134

In [42]:
# Up to 30 movies most similar to '101 Dalmatians (1996)', as (similarity, title) tuples
getSimilares(avaliacoesFilmes, '101 Dalmatians (1996)')

[(1.0, 'What Happened Was... (1994)'),
 (1.0, 'Welcome To Sarajevo (1997)'),
 (1.0, 'War Room, The (1993)'),
 (1.0, 'Walk in the Sun, A (1945)'),
 (1.0, 'U.S. Marshalls (1998)'),
 (1.0, 'Twilight (1998)'),
 (1.0, 'Tom & Viv (1994)'),
 (1.0, 'Three Wishes (1995)'),
 (1.0, 'Temptress Moon (Feng Yue) (1996)'),
 (1.0, 'Surviving the Game (1994)'),
 (1.0, 'Safe (1995)'),
 (1.0, 'Run of the Country, The (1995)'),
 (1.0, 'Paradise Road (1997)'),
 (1.0, 'Nobody Loves Me (Keiner liebt mich) (1994)'),
 (1.0, 'No Escape (1994)'),
 (1.0, 'Nil By Mouth (1997)'),
 (1.0, 'My Crazy Life (Mi vida loca) (1993)'),
 (1.0, 'Mediterraneo (1991)'),
 (1.0, 'Man of No Importance, A (1994)'),
 (1.0, 'Madame Butterfly (1995)'),
 (1.0, 'Last Summer in the Hamptons (1995)'),
 (1.0, 'Johns (1996)'),
 (1.0, 'Hunted, The (1995)'),
 (1.0, 'Hard Eight (1996)'),
 (1.0, 'Frankie Starlight (1995)'),
 (1.0, 'Flesh and Bone (1993)'),
 (1.0, 'Fatal Instinct (1993)'),
 (1.0, 'Double vie de Véronique, La (Double Life of Veroni

In [43]:
# Item-based recommendations for user '1', using the pre-computed movie similarities
getRecomendacoesItens(base, itensSimilares, '1')

[(5.0, 'Winnie the Pooh and the Blustery Day (1968)'),
 (5.0, "What's Love Got to Do with It (1993)"),
 (5.0, 'Up Close and Personal (1996)'),
 (5.0, 'Three Musketeers, The (1993)'),
 (5.0, 'Third Man, The (1949)'),
 (5.0, 'Streetcar Named Desire, A (1951)'),
 (5.0, "Stephen King's The Langoliers (1995)"),
 (5.0, 'Stand by Me (1986)'),
 (5.0, 'Spellbound (1945)'),
 (5.0, 'Speed (1994)'),
 (5.0, 'Somewhere in Time (1980)'),
 (5.0, 'Something to Talk About (1995)'),
 (5.0, 'Some Folks Call It a Sling Blade (1993)'),
 (5.0, 'Sliver (1993)'),
 (5.0, 'Shine (1996)'),
 (5.0, 'Shaggy Dog, The (1959)'),
 (5.0, 'Seventh Seal, The (Sjunde inseglet, Det) (1957)'),
 (5.0, 'Rosewood (1997)'),
 (5.0, 'Romeo Is Bleeding (1993)'),
 (5.0, 'Rob Roy (1995)'),
 (5.0, 'Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982)'),
 (5.0, 'Restoration (1995)'),
 (5.0, 'Quest, The (1996)'),
 (5.0, 'Pollyanna (1960)'),
 (5.0, 'Paths of Glory (1957)'),
 (5.0, 'Other Voices, Other Rooms (1997)'),
 (5.0, '