## First similarity study -> Collaborative filtering with different formulas for distance

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy import sparse
from scipy.spatial.distance import pdist, squareform

#### Loading the datasources

In [2]:
# Load the semicolon-separated project details export and list its columns
electionDetailsDF = pd.read_csv('projectDetails.csv', delimiter=";")
print("Election details", electionDetailsDF.columns, "\n")

Election details Index(['ID', 'project_id', 'project_title', 'created_at', 'checked_out_at',
       'project_url', 'vote_finished'],
      dtype='object') 



In [3]:
##Creating the voter x project matrix (rows: ID, columns: project_id)
##NOTE(review): crosstab yields co-occurrence counts, so the matrix is binary
##only if every (ID, project_id) pair appears at most once - TODO confirm
voter_matrix = pd.crosstab(
    index=electionDetailsDF['ID'],
    columns=electionDetailsDF['project_id'],
)

print(voter_matrix)

project_id  4    5    6    7    8    9    10   11   12   13   ...  194  195  \
ID                                                            ...             
9             0    0    0    0    0    0    0    0    0    0  ...    0    0   
11            0    0    0    0    0    0    0    0    0    0  ...    0    0   
12            0    0    0    0    0    0    0    0    0    0  ...    0    0   
13            0    1    0    0    0    0    0    0    0    0  ...    0    0   
15            0    0    0    0    0    0    0    0    0    0  ...    0    0   
...         ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
5228          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5229          0    0    0    1    0    0    0    1    0    0  ...    0    0   
5230          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5231          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5232          0    0    0    0    0    0    0    0  

**Item-item collaborative filtering**

In [28]:
##Euclidean (L2) norm of every row, used as the normalization factor
magnitude = np.sqrt(voter_matrix.pow(2).sum(axis=1))

##Scale each row to unit length (voter_matrix now holds the normalized matrix)
voter_matrix = voter_matrix.div(magnitude, axis=0)

def calculate_similarity_cosine(data_items):
    """
    Item-item cosine similarity.

    Every column of ``data_items`` is treated as one item vector. The result
    is a square DataFrame whose index and columns are both
    ``data_items.columns``.
    """
    item_labels = data_items.columns
    # Items live in the columns, so transpose the sparse matrix to put one
    # item per row before handing it to cosine_similarity.
    item_rows = sparse.csr_matrix(data_items).T
    return pd.DataFrame(
        data=cosine_similarity(item_rows),
        index=item_labels,
        columns=item_labels,
    )

def calculate_similarity_jaccard(data_items):
    """
    Calculate the column-wise (item-item) Jaccard similarity.

    Every column of ``data_items`` is one item. Any non-zero entry counts as
    "present", so the result is the same for a 0/1 matrix and for a
    row-normalized version of it.

    Returns a square DataFrame (index and columns are ``data_items.columns``)
    holding ``1 - jaccard_distance`` for every pair of items.
    """
    item_labels = data_items.columns
    # Items are the columns, so transpose before the pairwise computation
    # (pdist compares rows). Binarize because Jaccard is a set/boolean metric.
    item_presence = data_items.T.to_numpy() != 0
    distances = pdist(item_presence, metric='jaccard')
    similarities = 1 - squareform(distances)
    return pd.DataFrame(similarities, index=item_labels, columns=item_labels)

def calculate_similarity_pearson(data_items):
    """
    Item-item Pearson correlation.

    Correlates every pair of columns of ``data_items`` and returns the
    resulting square DataFrame (index and columns are the column labels).
    """
    return data_items.corr(method='pearson')

def calculate_similarity_euclidean(data_items):
    """
    Item-item similarity derived from the Euclidean distance between columns.

    The distance d is mapped to a similarity with 1 / (1 + d), so identical
    columns score 1 and the score shrinks towards 0 as columns move apart.
    """
    item_labels = data_items.columns
    # pdist compares rows, hence the transpose to compare items (columns)
    distance_matrix = squareform(pdist(data_items.T, metric='euclidean'))
    return pd.DataFrame(
        1 / (1 + distance_matrix),
        index=item_labels,
        columns=item_labels,
    )

def calculate_similarity_manhattan(data_items):
    """
    Item-item similarity derived from the Manhattan (city-block) distance
    between columns, mapped to a similarity with 1 / (1 + distance).
    """
    item_labels = data_items.columns
    # pdist compares rows, hence the transpose to compare items (columns)
    distance_matrix = squareform(pdist(data_items.T, metric='cityblock'))
    return pd.DataFrame(
        1 / (1 + distance_matrix),
        index=item_labels,
        columns=item_labels,
    )

def calculate_similarity_tanimoto(data_items):
    """
    Tanimoto (generalized Jaccard) similarity between columns.

    For two item vectors a and b the score is
    a.b / (|a|^2 + |b|^2 - a.b); a tiny constant is added to the denominator
    so all-zero columns do not divide by zero.
    """
    item_labels = data_items.columns
    item_vectors = data_items.values.T           # one item per row
    gram = item_vectors @ item_vectors.T         # pairwise dot products
    squared_norms = np.sum(item_vectors ** 2, axis=1)[:, np.newaxis]
    union_term = squared_norms + squared_norms.T - gram
    return pd.DataFrame(
        gram / (union_term + 1e-10),
        index=item_labels,
        columns=item_labels,
    )

def calculate_similarity_adjusted_cosine(data_items):
    """
    Adjusted cosine similarity between columns.

    Each row is centred on its own mean (missing values become 0 afterwards)
    and the cosine similarity is then computed between the centred columns.
    """
    item_labels = data_items.columns
    # Centre every row (user) on that row's mean value
    row_means = data_items.mean(axis=1)
    centred = data_items.sub(row_means, axis=0).fillna(0)
    # Transpose so that items, not users, are the vectors being compared
    item_rows = sparse.csr_matrix(centred).T
    return pd.DataFrame(
        cosine_similarity(item_rows),
        index=item_labels,
        columns=item_labels,
    )

def calculate_similarity_hamming(data_items):
    """
    Hamming similarity between columns (items).

    The score is 1 minus the fraction of rows in which two columns hold
    different values. Intended for binary data.
    """
    item_labels = data_items.columns
    # pdist compares rows, hence the transpose to compare items (columns)
    distance_matrix = squareform(pdist(data_items.T, metric='hamming'))
    return pd.DataFrame(
        1 - distance_matrix,
        index=item_labels,
        columns=item_labels,
    )

def compare_similarity_metrics(data_items,project_id,top_n=11):
    """
    Rank the projects most similar to ``project_id`` under every metric.

    Parameters
    ----------
    data_items : DataFrame
        Matrix whose columns are the projects being compared; it is passed
        unchanged to each ``calculate_similarity_*`` function.
    project_id :
        Column label of the project to find neighbours for.
    top_n : int
        Size of the ranking *including* the project itself, so
        ``top_n - 1`` neighbours are reported per metric.

    Returns
    -------
    DataFrame with one column per metric; each cell is a
    ``(other_project_id, similarity)`` tuple with the similarity rounded to
    4 decimals, ordered from most to least similar.
    """
    # Jaccard is not part of the comparison; it used to be computed here and
    # then discarded, which only cost time.
    similarity_functions = {
        "cosine": calculate_similarity_cosine,
        "pearson": calculate_similarity_pearson,
        "euclidean": calculate_similarity_euclidean,
        "manhattan": calculate_similarity_manhattan,
        "tanimoto": calculate_similarity_tanimoto,
        "hamming": calculate_similarity_hamming,
    }
    n_neighbours = max(top_n - 1, 0)

    similarity_comparator = {}
    for metric_name, similarity_function in similarity_functions.items():
        # Use the matrix that was passed in, not a notebook-level global
        scores = similarity_function(data_items).loc[project_id]
        # Remove the project itself explicitly instead of assuming it ranks
        # first: ties at the maximum or a zero/constant column break that.
        neighbours = scores.drop(project_id).nlargest(n_neighbours).round(4)
        # Wrapping in a Series lets metrics with fewer valid neighbours
        # (e.g. NaN correlations) be padded instead of raising.
        similarity_comparator[metric_name] = pd.Series(list(neighbours.items()), dtype=object)

    return pd.DataFrame(similarity_comparator)
    
# Nearest neighbours of project 84 under every metric
print(compare_similarity_metrics(data_items=voter_matrix, project_id=84))


          cosine        pearson      euclidean      manhattan       tanimoto  \
0    (87, 0.677)   (87, 0.6731)   (87, 0.2109)   (87, 0.0524)   (87, 0.5116)   
1   (86, 0.6361)   (86, 0.6313)   (86, 0.2008)     (86, 0.04)   (86, 0.4663)   
2   (90, 0.1316)   (90, 0.1245)   (50, 0.1736)    (50, 0.026)   (90, 0.0656)   
3   (81, 0.0966)   (81, 0.0889)  (176, 0.1736)   (176, 0.026)   (81, 0.0455)   
4    (82, 0.087)   (82, 0.0777)  (187, 0.1736)   (187, 0.026)   (82, 0.0441)   
5   (89, 0.0742)   (89, 0.0697)  (181, 0.1731)  (181, 0.0259)   (83, 0.0317)   
6   (83, 0.0658)   (83, 0.0578)   (55, 0.1725)  (178, 0.0257)   (89, 0.0267)   
7  (186, 0.0355)  (186, 0.0305)  (168, 0.1725)   (55, 0.0256)   (186, 0.013)   
8   (74, 0.0221)   (183, 0.014)   (46, 0.1715)  (168, 0.0256)   (74, 0.0104)   
9  (183, 0.0214)   (74, 0.0131)  (178, 0.1715)   (46, 0.0252)  (183, 0.0102)   

         hamming  
0   (87, 0.9945)  
1   (86, 0.9916)  
2   (50, 0.9861)  
3  (176, 0.9861)  
4  (178, 0.9861)  
5  (1

#### Checking the highest similarity using only distance

In [26]:
# project_ids=list(voter_matrix.columns)
# h_cosine={
#     "project_id":0,
#     "sim_val":0,
#     "most_sim_project_id":0
# }
# # h_jaccard={
# #     "sim_val":0,
# #     "project_id":0
# # }
# h_pearson={
#     "project_id":0,
#     "sim_val":0,
#     "most_sim_project_id":0
# }
# h_euclidean={
#     "project_id":0,
#     "sim_val":0,
#     "most_sim_project_id":0
# }
# h_manhattan={
#     "project_id":0,
#     "sim_val":0,
#     "most_sim_project_id":0
# }
# h_tanimoto={
#     "project_id":0,
#     "sim_val":0,
#     "most_sim_project_id":0
# }
# h_hamming={
#     "project_id":0,
#     "sim_val":0,
#     "most_sim_project_id":0
# }

# for project_id in project_ids:
#     # print(project_id)
#     similarities = compare_similarity_metrics(voter_matrix,project_id,top_n=2)
#     #print(similarities)
#     cosine = similarities['cosine']
#     pearson = similarities['pearson']
#     euclidean = similarities['euclidean']
#     manhattan = similarities['manhattan']
#     tanimoto = similarities['tanimoto']
#     hamming = similarities['hamming']

#     ##Check cosine highest
#     if cosine[0][1] > h_cosine['sim_val']:
#         h_cosine['project_id']=project_id
#         h_cosine['sim_val'] = cosine[0][1]
#         h_cosine['most_sim_project_id']=cosine[0][0]

#     ##Check pearson highest
#     if pearson[0][1] > h_pearson['sim_val']:
#         h_pearson['project_id']=project_id
#         h_pearson['sim_val'] = pearson[0][1]
#         h_pearson['most_sim_project_id']=pearson[0][0]

#     ##Check euclidean highest
#     if euclidean[0][1] > h_euclidean['sim_val']:
#         h_euclidean['project_id']=project_id
#         h_euclidean['sim_val'] = euclidean[0][1]
#         h_euclidean['most_sim_project_id']=euclidean[0][0]

#     ##Check manhattan highest
#     if manhattan[0][1] > h_manhattan['sim_val']:
#         h_manhattan['project_id']=project_id
#         h_manhattan['sim_val'] = manhattan[0][1]
#         h_manhattan['most_sim_project_id']=manhattan[0][0]

#     ##Check tanimoto highest
#     if tanimoto[0][1] > h_tanimoto['sim_val']:
#         h_tanimoto['project_id']=project_id
#         h_tanimoto['sim_val'] = tanimoto[0][1]
#         h_tanimoto['most_sim_project_id']=tanimoto[0][0]

#     ##Check hamming highest
#     if hamming[0][1] > h_hamming['sim_val']:
#         h_hamming['project_id']=project_id
#         h_hamming['sim_val'] = hamming[0][1]
#         h_hamming['most_sim_project_id']=hamming[0][0]

#     # if project_id == 5:
#     #     break


In [27]:
# # print(project_ids)
# print(h_cosine)
# print(h_pearson)
# print(h_euclidean)
# print(h_manhattan)
# print(h_tanimoto)
# print(h_hamming)

{'project_id': 84, 'sim_val': 0.677, 'most_sim_project_id': 87}
{'project_id': 84, 'sim_val': 0.6731, 'most_sim_project_id': 87}
{'project_id': 50, 'sim_val': 0.4641, 'most_sim_project_id': 176}
{'project_id': 50, 'sim_val': 0.3022, 'most_sim_project_id': 176}
{'project_id': 84, 'sim_val': 0.5116, 'most_sim_project_id': 87}
{'project_id': 50, 'sim_val': 0.9991, 'most_sim_project_id': 176}


The comparison code two cells above (commented out because it takes a long time to run) produced the following results:
```
cosine -> {'project_id': 84, 'sim_val': 0.677, 'most_sim_project_id': 87}
pearson -> {'project_id': 84, 'sim_val': 0.6731, 'most_sim_project_id': 87}
euclidean -> {'project_id': 50, 'sim_val': 0.4641, 'most_sim_project_id': 176}
manhattan -> {'project_id': 50, 'sim_val': 0.3022, 'most_sim_project_id': 176}
tanimoto -> {'project_id': 84, 'sim_val': 0.5116, 'most_sim_project_id': 87}
hamming -> {'project_id': 50, 'sim_val': 0.9991, 'most_sim_project_id': 176}
```

We can see that cosine, pearson and tanimoto produce the same result (project 84 is most similar to project 87), differing only in the similarity value. The same holds for euclidean, manhattan and hamming, which all select projects 50 and 176.

## Second similarity study -> calculating distances after embedding

The idea of this second study is to compare how well different embedding techniques capture project similarity:
1- TF-IDF
2- Word2Vec