#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy import sparse
from scipy.spatial.distance import pdist, squareform

#### Loading the datasources

In [2]:
electionDetailsDF = pd.read_csv('projectDetails.csv', sep=";")
print("Election details",electionDetailsDF.columns,"\n")

Election details Index(['ID', 'project_id', 'project_title', 'created_at', 'checked_out_at',
       'project_url', 'vote_finished'],
      dtype='object') 



In [3]:
##Creating the binary matrix
voter_matrix = pd.crosstab(electionDetailsDF['ID'], electionDetailsDF['project_id'])

print(voter_matrix)

project_id  4    5    6    7    8    9    10   11   12   13   ...  194  195  \
ID                                                            ...             
9             0    0    0    0    0    0    0    0    0    0  ...    0    0   
11            0    0    0    0    0    0    0    0    0    0  ...    0    0   
12            0    0    0    0    0    0    0    0    0    0  ...    0    0   
13            0    1    0    0    0    0    0    0    0    0  ...    0    0   
15            0    0    0    0    0    0    0    0    0    0  ...    0    0   
...         ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
5228          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5229          0    0    0    1    0    0    0    1    0    0  ...    0    0   
5230          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5231          0    0    0    0    0    0    0    0    0    0  ...    0    0   
5232          0    0    0    0    0    0    0    0  

**Item-item collaborative filtering**

In [6]:
##Magnitude vector to apply normalization
magnitude = np.sqrt(np.square(voter_matrix).sum(axis=1))

##Unit vector (normalized matrix)
voter_matrix = voter_matrix.divide(magnitude, axis='index')

def calculate_similarity(data_items):
    """
    Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sim = pd.DataFrame(data=similarities, index = data_items.columns, columns=data_items.columns)
    return sim

def calculate_similarity_jaccard(data_items):
    """
    Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    distances = pdist(data_items, metric='jaccard')
    similarities = 1 - squareform(distances)
    return pd.DataFrame(similarities, index=data_items.index, columns=data_items.index)

data_matrix = calculate_similarity(voter_matrix)
data_matrix_test = calculate_similarity_jaccard(voter_matrix)

print(data_matrix.loc[9].nlargest(11))
print(data_matrix_test.loc[9].nlargest(11))

project_id
9      1.000000
7      0.202053
5      0.123059
4      0.093903
12     0.087337
27     0.068525
11     0.066182
19     0.062021
6      0.060111
126    0.059572
21     0.057758
Name: 9, dtype: float64
ID
9      1.0
11     1.0
28     1.0
29     1.0
40     1.0
43     1.0
47     1.0
73     1.0
89     1.0
277    1.0
422    1.0
Name: 9, dtype: float64
