### Imports

In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

### Data loading

In [2]:
electionDetailsDF = pd.read_csv('projectDetails.csv', sep=";")
print("Election details",electionDetailsDF.columns,"\n")

Election details Index(['ID', 'project_id', 'project_title', 'created_at', 'checked_out_at',
       'project_url', 'vote_finished'],
      dtype='object') 



In [3]:
voter_matrix = pd.crosstab(electionDetailsDF['ID'], electionDetailsDF['project_id'])

data_items = voter_matrix.copy()

print(voter_matrix)

project_id  4    5    6    7    8    9    10   11   12   13   ...  194  195  \
ID                                                            ...             
9             0    0    0    0    0    0    0    0    0    0  ...    0    0   
11            0    0    0    0    0    0    0    0    0    0  ...    0    0   
12            0    0    0    0    0    0    0    0    0    0  ...    0    0   
13            0    1    0    0    0    0    0    0    0    0  ...    0    0   
15            0    0    0    0    0    0    0    0    0    0  ...    0    0   
...         ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
5228          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5229          0    0    0    1    0    0    0    1    0    0  ...    0    0   
5230          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5231          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5232          0    0    0    0    0    0    0    0  

In [4]:
##Matrix normalization

# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_items).sum(axis=1))

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
data_items_normalized = data_items.divide(magnitude, axis='index')

print(data_items_normalized)

project_id  4        5    6        7    8    9    10       11   12   13   ...  \
ID                                                                        ...   
9           0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
11          0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
12          0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
13          0.0  0.57735  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
15          0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
...         ...      ...  ...      ...  ...  ...  ...      ...  ...  ...  ...   
5228        0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
5229        0.0  0.00000  0.0  0.57735  0.0  0.0  0.0  0.57735  0.0  0.0  ...   
5230        0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
5231        0.0  0.00000  0.0  0.00000  0.0  0.0  0.0  0.00000  0.0  0.0  ...   
5232        0.0  0.00000  0.

**We can get user similarity by cosine or pearson based on the similarity calculus we want to carry on**

In [5]:
##Identify similar users using pearson correlation
user_similarity = data_items_normalized.T.corr()
user_similarity.head()

ID,9,11,12,13,15,16,17,18,19,20,...,5222,5223,5224,5226,5227,5228,5229,5230,5231,5232
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,1.0,1.0,-0.008792,-0.008792,-0.008792,0.574427,-0.008792,-0.007161,0.574427,-0.005051,...,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792
11,1.0,1.0,-0.008792,-0.008792,-0.008792,0.574427,-0.008792,-0.007161,0.574427,-0.005051,...,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792,-0.008792
12,-0.008792,-0.008792,1.0,-0.015306,0.323129,-0.015306,-0.015306,-0.012466,-0.015306,-0.008792,...,-0.015306,0.323129,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306
13,-0.008792,-0.008792,-0.015306,1.0,-0.015306,-0.015306,-0.015306,-0.012466,-0.015306,-0.008792,...,-0.015306,-0.015306,0.323129,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306
15,-0.008792,-0.008792,0.323129,-0.015306,1.0,-0.015306,-0.015306,-0.012466,-0.015306,0.574427,...,-0.015306,0.323129,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306,-0.015306


In [6]:
# User similarity matrix using cosine similarity
user_similarity_cosine = cosine_similarity(data_items_normalized)
print(user_similarity_cosine)

[[1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [9]:
##Input of similar users threshold
n = 10

picked_user_id = 5218

##Voter similarity threshold
user_similarity_threshold = 0.3

# Get top n similar users
similar_users = user_similarity[user_similarity[picked_user_id]>user_similarity_threshold][picked_user_id].sort_values(ascending=False)[:n]

# Print out top n similar users
print(f'The similar users for user {picked_user_id} are', similar_users)

The similar users for user 5218 are ID
5218    1.000000
1377    0.661565
1073    0.661565
3024    0.661565
2190    0.661565
2621    0.661565
1871    0.661565
1269    0.661565
4302    0.661565
3479    0.661565
Name: 5218, dtype: float64


In [None]:
picked_projects_ids = data_items_normalized