# Activity
## Our choice of 10 for K was arbitrary - what effect do different K values have on the results?

Our distance metric was also somewhat arbitrary - we just took the cosine distance between the genres and added it to the difference between the normalized popularity scores. Can you improve on that?

In [1]:
# Importação de bibliotecas
import pandas as pd
import numpy as np

# Dados:

In [60]:
# Directory holding the two data files read below (u.data and u.item).
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the data before running on another machine.
DATA_DIR = '/home/ronyus/Documentos/card - 10/Aquivos de Código/More Data Mining and Machine Learning Techniques/Dados'

# Ratings file: tab-separated; only the first three columns are read.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(DATA_DIR + '/u.data', sep='\t', names=r_cols, usecols=range(3))

# Per-movie aggregates: number of ratings ('size') and mean rating ('mean').
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})

# Min-max scale the per-movie rating counts so they can serve as a
# popularity score between 0 (fewest ratings) and 1 (most ratings).
movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
movieNormalizedNumRatings = movieNumRatings.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# movieDict maps movie ID -> (name, genre vector, normalized popularity, mean rating).
movieDict = {}
with open(DATA_DIR + '/u.item', encoding='ISO-8859-1') as f:
    for line in f:
        # Walk the file line by line, pulling the fields out of each
        # pipe-separated record.
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Genres are a vector of 0/1 flags, one position per genre.
        # NOTE(review): the original comment says 19 positions, while this
        # slice admits up to 20 fields - confirm against the u.item layout.
        genres = np.array([int(flag) for flag in fields[5:25]])
        movieDict[movieID] = (
            name,
            genres,
            movieNormalizedNumRatings.loc[movieID].get('size'),
            movieProperties.loc[movieID].rating.get('mean')
        )

# Atividade:

In [61]:
import operator

def ComputeDistance(movieA, movieB, genre_weight=0.6, rating_weight=0.3, popularity_weight=0.1):
    """Return a weighted dissimilarity score between two movie records.

    Each record is indexed positionally: [1] is the genre vector, [2] the
    normalized popularity score and [3] the mean rating (index 0 is not read).

    The score is the weighted sum of three terms:
      * genre distance: 1 - cosine similarity of the genre vectors, or 1.0
        when either vector has zero norm (cosine similarity is undefined);
      * rating difference: absolute difference of the mean ratings divided
        by 5.0 (presumably the top of the rating scale - confirm);
      * popularity difference: absolute difference of the popularity scores.

    Args:
        movieA: first movie record.
        movieB: second movie record.
        genre_weight: weight of the genre distance term (default 0.6).
        rating_weight: weight of the rating difference term (default 0.3).
        popularity_weight: weight of the popularity difference term
            (default 0.1).

    Returns:
        The combined distance; smaller means more similar.
    """
    # Cosine similarity between the genre vectors.
    dot_product = np.dot(movieA[1], movieB[1])
    norm_A = np.linalg.norm(movieA[1])
    norm_B = np.linalg.norm(movieB[1])

    if norm_A == 0 or norm_B == 0:
        genre_distance = 1.0
    else:
        # Rounding can push the ratio marginally outside [-1, 1] (e.g. for
        # identical vectors), which would yield a slightly negative distance;
        # clip so the genre distance is never below zero.
        cosine_similarity = np.clip(dot_product / (norm_A * norm_B), -1.0, 1.0)
        genre_distance = 1 - cosine_similarity

    # Popularity difference.
    popularity_diff = abs(movieA[2] - movieB[2])

    # Rating difference, scaled down by 5.0.
    rating_diff = abs(movieA[3] - movieB[3]) / 5.0

    # Weighted combination of the three terms.
    return (
        genre_weight * genre_distance
        + rating_weight * rating_diff
        + popularity_weight * popularity_diff
    )

def getNeighbors(movieID, K, distance_metric, movies=None):
    """Return the IDs of the K movies closest to `movieID`.

    Args:
        movieID: key of the reference movie.
        K: maximum number of neighbours to return. If fewer than K other
            movies exist, all of them are returned; K <= 0 yields [].
        distance_metric: callable taking (reference_record, other_record)
            and returning a sortable distance.
        movies: mapping of movie ID -> record. Defaults to the module-level
            `movieDict` when omitted.

    Returns:
        A list of movie IDs ordered from nearest to farthest. Ties keep the
        iteration order of the mapping (the sort is stable).

    Raises:
        KeyError: if `movieID` is not present in the mapping.
    """
    if movies is None:
        movies = movieDict

    # Look the reference record up once instead of on every iteration.
    target = movies[movieID]
    if K <= 0:
        return []

    distances = [
        (other_id, distance_metric(target, record))
        for other_id, record in movies.items()
        if other_id != movieID
    ]
    distances.sort(key=operator.itemgetter(1))

    # Slicing (rather than indexing range(K)) tolerates K larger than the
    # number of candidates.
    return [other_id for other_id, _ in distances[:K]]


In [62]:
# Neighbourhood sizes to compare, and the movie whose neighbours are inspected.
k_values = [3, 5, 10, 15, 20]
TARGET_MOVIE_ID = 1

for K in k_values:
    print(f"\nK = {K}")
    neighbors = getNeighbors(TARGET_MOVIE_ID, K, ComputeDistance)

    # Average over the neighbours actually returned rather than the requested
    # K, so the mean stays correct if fewer than K neighbours are available.
    neighborRatings = [movieDict[neighbor][3] for neighbor in neighbors]
    avgRating = sum(neighborRatings) / len(neighborRatings) if neighborRatings else float('nan')
    print(f"avgRating: {avgRating}")

    # One line per neighbour: title followed by its mean rating.
    for neighbor in neighbors:
        print(movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))


K = 3
avgRating: 3.4863130780939002
Aladdin (1992) 3.8127853881278537
Aladdin and the King of Thieves (1996) 2.8461538461538463
Winnie the Pooh and the Blustery Day (1968) 3.8

K = 5
avgRating: 3.647653433414996
Aladdin (1992) 3.8127853881278537
Aladdin and the King of Thieves (1996) 2.8461538461538463
Winnie the Pooh and the Blustery Day (1968) 3.8
Pinocchio (1940) 3.6732673267326734
Grand Day Out, A (1992) 4.106060606060606

K = 10
avgRating: 3.523175471133854
Aladdin (1992) 3.8127853881278537
Aladdin and the King of Thieves (1996) 2.8461538461538463
Winnie the Pooh and the Blustery Day (1968) 3.8
Pinocchio (1940) 3.6732673267326734
Grand Day Out, A (1992) 4.106060606060606
Wrong Trousers, The (1993) 4.466101694915254
Sword in the Stone, The (1963) 3.3292682926829267
Matilda (1996) 3.210526315789474
Home Alone (1990) 3.0875912408759123
Goofy Movie, A (1995) 2.9

K = 15
avgRating: 3.394006623586503
Aladdin (1992) 3.8127853881278537
Aladdin and the King of Thieves (1996) 2.84615384615