In [163]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math as m

# 1.

In [164]:
df = pd.read_csv('rating.csv')
df.head(8)

Unnamed: 0,Name,Lady,Snake,Luck,Superman,Dupree,Night
0,Lisa Rose,2.5,3.5,3.0,3.5,2.5,3.0
1,Gene Seymour,3.0,3.5,1.5,5.0,3.5,3.0
2,Micheal Philips,2.5,3.0,,3.5,,4.0
3,Claudia Puig,,3.5,3.0,4.0,2.5,4.5
4,Mich Lasalle,3.0,4.0,2.0,3.0,2.0,3.0
5,Jack Matthews,3.0,4.0,,5.0,3.5,3.0
6,Toby,,4.5,,4.0,1.0,
7,Anne,1.5,,4.0,,2.0,


# 2.

## 2.a

In [165]:
def sim_distanceManhattan(data, user1, user2):
    distance = 0
    common_items = 0
    for item in data.columns:
        if not pd.isna(data.loc[user1, item]) and not pd.isna(data.loc[user2, item]):
            distance += abs(data.loc[user1, item] - data.loc[user2, item])
            common_items += 1
    if common_items == 0:
        return np.inf
    return distance

print("Manhattan distance between Lisa Rose and Anne:", sim_distanceManhattan(df.set_index('Name'), 'Lisa Rose', 'Anne'))

Manhattan distance between Lisa Rose and Anne: 2.5


In [166]:
def sim_distanceEuclidean(data, user1, user2):
    distance = 0
    common_items = 0
    for item in data.columns:
        if not pd.isna(data.loc[user1, item]) and not pd.isna(data.loc[user2, item]):
            distance += (data.loc[user1, item] - data.loc[user2, item]) ** 2
            common_items += 1
    if common_items == 0:
        return np.inf
    return np.sqrt(distance)

print("Euclidean distance between Lisa Rose and Anne:", sim_distanceEuclidean(df.set_index('Name'), 'Lisa Rose', 'Anne'))
print("Euclidean distance between Lisa Rose and Gene Seymour:", sim_distanceEuclidean(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

Euclidean distance between Lisa Rose and Anne: 1.5
Euclidean distance between Lisa Rose and Gene Seymour: 2.3979157616563596


## 2.b

In [167]:
def recommendNearestNeighbor(data, user):
    distances = []
    for other_user in data.index:
        if other_user != user:
            distance = sim_distanceManhattan(data, user, other_user)
            distances.append((distance, other_user))
    distances.sort()
    nearest_neighbor = distances[0][1]
    
    recommendations = {}
    for item in data.columns:
        if pd.isna(data.loc[user, item]) and not pd.isna(data.loc[nearest_neighbor, item]):
            recommendations[item] = data.loc[nearest_neighbor, item]
    
    return recommendations

print("Recommendations for Lisa Rose:", recommendNearestNeighbor(df.set_index('Name'), 'Lisa Rose'))
print("Recommendations for Toby:", recommendNearestNeighbor(df.set_index('Name'), 'Toby'))

Recommendations for Lisa Rose: {}
Recommendations for Toby: {'Lady': np.float64(1.5), 'Luck': np.float64(4.0)}


## 2.d & 2.e
Here 2.d is done before 2.c to be able to use the pearson correlation coefficient in the recomandation computation

In [168]:
def data_double_sum(data, u1, u2):
    sum = 0
    for moovie in data.columns:
        if not pd.isna(data.loc[u1, moovie]) and not pd.isna(data.loc[u2, moovie]):
            sum += data.loc[u1, moovie] * data.loc[u2, moovie]
    return sum

print("Double sum between Lisa Rose and Gene Seymour:", data_double_sum(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

def data_simple_sum(data, u1, u2):
    sum = 0
    for moovie in data.columns:
        if not pd.isna(data.loc[u1, moovie]) and not pd.isna(data.loc[u2, moovie]):
            sum += data.loc[u1, moovie]
    return sum

print("Simple sum between Lisa Rose and Gene Seymour:", data_simple_sum(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

def data_squared_sum(data, u1, u2):
    sum = 0
    for moovie in data.columns:
        if not pd.isna(data.loc[u1, moovie]) and not pd.isna(data.loc[u2, moovie]):
            sum += data.loc[u1, moovie] ** 2
    return sum

print("Squared sum between Lisa Rose and Gene Seymour:", data_squared_sum(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

def data_length(data, u1, u2):
    sum = 0
    for moovie in data.columns:
        if not pd.isna(data.loc[u1, moovie]) and not pd.isna(data.loc[u2, moovie]):
            sum += 1
    return sum

print("Length between Lisa Rose and Gene Seymour:", data_length(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

# Here i decided to simplify a bit the math of the denominator know sqrt(a)\*sqrt(b) =  sqrt(a\*b)
def pearson_correlation(data, u1, u2):
    numerator = data_double_sum(data, u1, u2) - ( (data_simple_sum(data, u1, u2) * data_simple_sum(data, u2, u1) ) / data_length(data, u1, u2))
    # print("Numerator:", numerator)
    denominator = m.sqrt((data_squared_sum(data, u1, u2) - (data_simple_sum(data, u1, u2) ** 2) / data_length(data, u1, u2)) 
                         * (data_squared_sum(data, u2, u1) - (data_simple_sum(data, u2, u1) ** 2) / data_length(data, u1, u2)))
    # print("Denominator:", denominator)
    if denominator == 0:
        return 0
    return numerator / denominator

print("Pearson correlation between Lisa Rose and Gene Seymour:", pearson_correlation(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

Double sum between Lisa Rose and Gene Seymour: 59.5
Simple sum between Lisa Rose and Gene Seymour: 18.0
Squared sum between Lisa Rose and Gene Seymour: 55.0
Length between Lisa Rose and Gene Seymour: 6
Pearson correlation between Lisa Rose and Gene Seymour: 0.39605901719066977


In [169]:
def cosine_similarity(data, u1, u2):
    numerator = data_double_sum(data, u1, u2)
    # print("Numerator:", numerator)
    denominator = m.sqrt(data_squared_sum(data, u1, u2) *(data_squared_sum(data, u2, u1)) )
    # print("Denominator:", denominator)
    if denominator == 0:
        return 0
    return numerator / denominator

print("cosine_similarity between Lisa Rose and Gene Seymour:", cosine_similarity(df.set_index('Name'), 'Lisa Rose', 'Gene Seymour'))

cosine_similarity between Lisa Rose and Gene Seymour: 0.9606463013980242


## 2.c

### 2.c.step1

In [170]:
def total(data, moovie, user, type='manhattan'):
    total = 0
    for other_user in data.index:
        if other_user != user and not pd.isna(data.loc[other_user, moovie]):
            if type == 'euclidean':
                total += data.loc[other_user, moovie] / ( 1+ sim_distanceEuclidean(data, user, other_user))
            if type == 'manhattan':
                total += data.loc[other_user, moovie] / ( 1+ sim_distanceManhattan(data, user, other_user))
            if type == 'pearson':
                total += data.loc[other_user, moovie] / ( 1+ pearson_correlation(data, user, other_user))
            if type == 'cosine':
                total += data.loc[other_user, moovie] / ( 1+ cosine_similarity(data, user, other_user))
            else:
                total =-1

    return total

print('= ' + str( total(df.set_index('Name'), 'Night', 'Anne')))

= -1


In [171]:
def s(data, moovie, user, type='manhattan'):
    sum = 0
    for other_user in data.index:
        if other_user != user and not pd.isna(data.loc[other_user, moovie]):
            if type == 'euclidean':
                sum += 1 / ( 1+ sim_distanceEuclidean(data, user, other_user))
            if type == 'manhattan':
                sum += 1 / ( 1+ sim_distanceManhattan(data, user, other_user))
            if type == 'pearson':
                sum += 1 / ( 1+ pearson_correlation(data, user, other_user))
            if type == 'cosine':
                sum += 1 / ( 1+ cosine_similarity(data, user, other_user))
            else:
                sum =-1

    return sum

print('= ' + str( s(df.set_index('Name'), 'Night', 'Anne')))

= -1


In [172]:
def s_prime(data, moovie, user, type='manhattan'):
    return total(data, moovie, user, type) / s(data, moovie, user, type)

print('Predicted rating for Anne on Night: ' + str( s_prime(df.set_index('Name'), 'Night', 'Anne')))

Predicted rating for Anne on Night: 1.0


### 2.c.step2

In [173]:
def best_recommend(data, user, moovie_list, type='manhattan'):
    recommendations = {}
    for moovie in moovie_list:
        if pd.isna(data.loc[user, moovie]):
            predicted_rating = s_prime(data, moovie, user, type)
            recommendations[moovie] = float(predicted_rating)

    # Trier les recommandations par note prédite décroissante
    recommendations = dict(sorted(recommendations.items(), key=lambda item: item[1], reverse=True))
    return recommendations

moovie_list = {"Snake", "Superman", "Night"}
print("Best recommendations for Anne:", best_recommend(df.set_index('Name'), 'Anne', moovie_list, type='manhattan'))
print("Best recommendations for Anne:", best_recommend(df.set_index('Name'), 'Anne', moovie_list, type='euclidean'))

Best recommendations for Anne: {'Superman': 1.0, 'Snake': 1.0, 'Night': 1.0}
Best recommendations for Anne: {'Superman': 1.0, 'Snake': 1.0, 'Night': 1.0}


In [174]:
def total_exp(data, moovie, user, type='manhattan'):
    total = 0
    for other_user in data.index:
        if other_user != user and not pd.isna(data.loc[other_user, moovie]):
            if type == 'euclidean':
                total += data.loc[other_user, moovie] /  m.exp(-sim_distanceEuclidean(data, user, other_user))
            if type == 'manhattan':
                total += data.loc[other_user, moovie] / m.exp(-sim_distanceManhattan(data, user, other_user))
            if type == 'pearson':
                total += data.loc[other_user, moovie] / m.exp(-pearson_correlation(data, user, other_user))
            if type == 'cosine':
                total += data.loc[other_user, moovie] / m.exp(-cosine_similarity(data, user, other_user))
            else:
                total =-1

    return total

print('= ' + str( total_exp(df.set_index('Name'), 'Night', 'Anne')))

= -1


In [175]:
def s_exp(data, moovie, user, type='manhattan'):
    sum = 0
    for other_user in data.index:
        if other_user != user and not pd.isna(data.loc[other_user, moovie]):
            if type == 'euclidean':
                sum += 1 /  m.exp(-sim_distanceEuclidean(data, user, other_user))
            if type == 'manhattan':
                sum += 1 /  m.exp(-sim_distanceManhattan(data, user, other_user))
            if type == 'pearson':
                sum += 1 /  m.exp(-pearson_correlation(data, user, other_user))
            if type == 'cosine':
                sum += 1 /  m.exp(-cosine_similarity(data, user, other_user))
            else:
                sum =-1

    return sum

print('= ' + str( s_exp(df.set_index('Name'), 'Night', 'Anne')))

= -1


In [176]:
def s_prime_exp(data, moovie, user, type='manhattan'):
    return total_exp(data, moovie, user, type) / s_exp(data, moovie, user, type)

print('Predicted rating for Anne on Night: ' + str( s_prime_exp(df.set_index('Name'), 'Night', 'Anne')))

Predicted rating for Anne on Night: 1.0


In [177]:
def best_recommend_exp(data, user, moovie_list, type='manhattan'):
    recommendations = {}
    for moovie in moovie_list:
        if pd.isna(data.loc[user, moovie]):
            predicted_rating = s_prime_exp(data, moovie, user, type)
            recommendations[moovie] = float(predicted_rating)

    # Trier les recommandations par note prédite décroissante
    recommendations = dict(sorted(recommendations.items(), key=lambda item: item[1], reverse=True))
    return recommendations

moovie_list = {"Snake", "Superman", "Night"}
print("Best recommendations for Anne:", best_recommend_exp(df.set_index('Name'), 'Anne', moovie_list, type='manhattan'))
print("Best recommendations for Anne:", best_recommend_exp(df.set_index('Name'), 'Anne', moovie_list, type='euclidean'))

Best recommendations for Anne: {'Superman': 1.0, 'Snake': 1.0, 'Night': 1.0}
Best recommendations for Anne: {'Superman': 1.0, 'Snake': 1.0, 'Night': 1.0}


## 2.d & 2.e sequel

In [178]:
moovie_list = {"Snake", "Superman", "Night"}
print("Best recommendations for Anne:", best_recommend(df.set_index('Name'), 'Anne', moovie_list, type='pearson'))
print("Best recommendations for Anne:", best_recommend(df.set_index('Name'), 'Anne', moovie_list, type='cosine'))

Best recommendations for Anne: {'Superman': 1.0, 'Snake': 1.0, 'Night': 1.0}
Best recommendations for Anne: {'Superman': 4.0053503123050636, 'Snake': 3.7124560447528605, 'Night': 3.402857861715499}
 {'Superman': 1.0, 'Snake': 1.0, 'Night': 1.0}
Best recommendations for Anne: {'Superman': 4.0053503123050636, 'Snake': 3.7124560447528605, 'Night': 3.402857861715499}


# 3

In [179]:
df = pd.read_csv('ex3.csv')
df.head(8)

Unnamed: 0,Artist,Angelica,Bill,Chan,Dan,Hailey,Jordyn,Sam,Veronica
0,Blues Traveler,3.5,2.0,5.0,3.0,,,5.0,3.0
1,Broken Bells,2.0,3.5,1.0,4.0,4.0,4.5,2.0,
2,Deadmau5,,4.0,1.0,4.5,1.0,4.0,,
3,Norah Jones,4.5,,3.0,,4.0,5.0,3.0,5.0
4,Phoenix,5.0,2.0,5.0,3.0,,5.0,5.0,4.0
5,Slightly Stoopid,1.5,3.5,1.0,4.5,,4.5,4.0,2.5
6,The Strokes,2.5,,,4.0,4.0,4.0,5.0,3.0
7,Vampire Weekend,2.0,3.0,,2.0,1.0,4.0,,


In [180]:
song_list = df['Artist'].tolist()
print(song_list)

['Blues Traveler', 'Broken Bells', 'Deadmau5', 'Norah Jones', 'Phoenix', 'Slightly Stoopid', 'The Strokes', 'Vampire Weekend']


In [None]:
# Voir la structure complète du DataFrame
print("Colonnes:", df.columns.tolist())
print("\nPremières lignes:")
print(df.head())
print("\nIndex actuel:", df.index.tolist()[:5])

In [None]:
# Transposer le DataFrame : les artistes deviennent les colonnes, les personnes l'index
df_transposed = df.set_index('Artist').T

artist_list = df['Artist'].tolist()
print("Best recommendations for Veronica:", best_recommend(df_transposed, 'Veronica', artist_list, type='pearson'))

KeyError: 'Veronica'