# User–user similarity on MovieLens 1M: end-to-end notebook

In [None]:
from google.colab import drive
import pandas as pd

## Fetch data

In [None]:
# Single place to change when the dataset folder moves (defaults to the Colab Drive mount).
DATA_DIR = "/content/drive/MyDrive/main/ml-1m"

# Pre-processed user / movie tables and the train / test split of the ratings.
users = pd.read_csv(DATA_DIR + "/users_processed.csv")
movies = pd.read_csv(DATA_DIR + "/movies_processed.csv")
ratings_train = pd.read_csv(DATA_DIR + "/ratings_train_set.csv")
ratings_test = pd.read_csv(DATA_DIR + "/ratings_test_set.csv")

In [None]:
users.head()

Unnamed: 0.1,Unnamed: 0,user_id,age_group,occupation,zip,M,F,gender_switch,occupation_one_hot,age_group_one_hot
0,0,u1,1,10,48067,0,1,0,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,1 0 0 0 0 0 0
1,1,u2,56,16,70072,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0,0 0 0 0 0 0 1
2,2,u3,25,15,55117,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0,0 0 1 0 0 0 0
3,3,u4,45,7,2460,1,0,1,0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0,0 0 0 0 1 0 0
4,4,u5,25,20,55455,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1,0 0 1 0 0 0 0


In [None]:
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
users = users.drop(columns=["Unnamed: 0","occupation","zip"], errors="ignore")

In [None]:
movies.head()

Unnamed: 0.1,Unnamed: 0,id,title,year,genre
0,0,m1,Toy Story,1995,0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
1,1,m2,Jumanji,1995,0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
2,2,m3,Grumpier Old Men,1995,0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
3,3,m4,Waiting to Exhale,1995,0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
4,4,m5,Father of the Bride Part II,1995,0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0


## Graph

A bipartite graph with two partite sets: one for users and one for movies. An edge connects a user to each movie that user rated, and the edge weight is the rating.

In [None]:
# Extract each column once. Looking values up with `Series[i]` inside a Python
# loop costs three label lookups per rating, which is very slow on this table.
user_ids = ratings_train["user_id"].tolist()
movie_ids = ratings_train["movie_id"].tolist()
rating_values = ratings_train["rating"].tolist()

# One (user, movie, rating) triple per row of the training set.
edge_list = list(zip(user_ids, movie_ids, rating_values))

# Distinct users / movies that appear in the training ratings.
U = list(set(user_ids))
M = list(set(movie_ids))
print("Number of users: ", len(U))
print("Number of movies: ", len(M))
print("Number of existing edges:",len(edge_list))

Number of users:  4476
Number of movies:  3672
Number of existing edges: 750157


In [None]:
import networkx as nx
from networkx.algorithms import bipartite

# Undirected graph holding both users and movies as nodes.
g = nx.Graph()
# Tag the two partite sets with a `bipartite` node attribute: 0 for users, 1 for movies.
g.add_nodes_from(U,bipartite=0)
g.add_nodes_from(M,bipartite=1)
# Each (user, movie, rating) triple becomes an edge whose 'rating' attribute stores the rating.
g.add_weighted_edges_from(edge_list, weight='rating')
print("Number of nodes:", g.number_of_nodes())
print("Number of edges:", g.number_of_edges())

Number of nodes: 8148
Number of edges: 750157


## CN-based topological metrics

In [None]:
from math import sqrt, log

def common_neighbors(g,u,v):
    """Return how many nodes of ``g`` are adjacent to both ``u`` and ``v``."""
    shared = set(g.neighbors(u)) & set(g.neighbors(v))
    return len(shared)

def jaccard_coefficient(g,u,v):
    """Jaccard coefficient of the neighbourhoods of ``u`` and ``v``.

    Computed as |N(u) & N(v)| / |N(u) | N(v)|.

    Returns 0.0 when neither node has any neighbour, where the ratio would
    otherwise be 0/0 and raise ZeroDivisionError.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    union = u_nbr | v_nbr
    if not union:
        return 0.0
    return len(u_nbr & v_nbr)/len(union)

def sorensen_index(g,u,v):
    """Common-neighbour count divided by the summed degrees of ``u`` and ``v``.

    Computed as |N(u) & N(v)| / (|N(u)| + |N(v)|).

    Returns 0.0 when both nodes are isolated, where the ratio would
    otherwise raise ZeroDivisionError.

    NOTE(review): the textbook Sorensen index carries a factor 2 in the
    numerator; this function returns half of that. The scale is left as-is
    because previously exported feature files were presumably computed with
    it -- confirm before changing.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    degree_sum = len(u_nbr) + len(v_nbr)
    if degree_sum == 0:
        return 0.0
    return len(u_nbr & v_nbr)/degree_sum

def leicht_holme_nerman(g,u,v):
    """Leicht-Holme-Newman index: |N(u) & N(v)| / (|N(u)| * |N(v)|).

    Returns 0.0 when either node is isolated, where the degree product is
    zero and the ratio would otherwise raise ZeroDivisionError.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    degree_product = len(u_nbr)*len(v_nbr)
    if degree_product == 0:
        return 0.0
    return len(u_nbr & v_nbr)/degree_product

def salton_cosine_similarity(g,u,v):
    """Salton (cosine) index: |N(u) & N(v)| / sqrt(|N(u)| * |N(v)|).

    Returns 0.0 when either node is isolated, where the denominator is zero
    and the ratio would otherwise raise ZeroDivisionError.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    degree_product = len(u_nbr)*len(v_nbr)
    if degree_product == 0:
        return 0.0
    return len(u_nbr & v_nbr)/sqrt(degree_product)

def hub_promoted_index(g,u,v):
    """Hub promoted index: |N(u) & N(v)| / min(|N(u)|, |N(v)|).

    Returns 0.0 when either node is isolated, where the smaller degree is
    zero and the ratio would otherwise raise ZeroDivisionError.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    smaller_degree = min(len(u_nbr),len(v_nbr))
    if smaller_degree == 0:
        return 0.0
    return len(u_nbr & v_nbr)/smaller_degree

def hub_depressed_index(g,u,v):
    """Hub depressed index: |N(u) & N(v)| / max(|N(u)|, |N(v)|).

    Returns 0.0 when both nodes are isolated, where the larger degree is
    zero and the ratio would otherwise raise ZeroDivisionError.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    larger_degree = max(len(u_nbr),len(v_nbr))
    if larger_degree == 0:
        return 0.0
    return len(u_nbr & v_nbr)/larger_degree

# def preferential_attachment(g,u,v):
#     u_nbr = set(g.neighbors(u))
#     v_nbr = set(g.neighbors(v))
#     return len(u_nbr)*len(v_nbr)

def resource_allocation(g,u,v):
    """Resource-allocation index of ``u`` and ``v``.

    Sums 1/degree(w) over every node ``w`` adjacent to both ``u`` and ``v``;
    returns 0 when they share no neighbour.
    """
    shared = set(g.neighbors(u)) & set(g.neighbors(v))
    total = 0
    for w in shared:
        degree = len(set(g.neighbors(w)))
        total = total + 1.0/degree
    return total

def adamic_adar(g,u,v):
    """Adamic-Adar index of ``u`` and ``v``.

    Sums 1/log(degree(w)) over every node ``w`` adjacent to both ``u`` and
    ``v``; returns 0 when they share no neighbour.

    Shared neighbours of degree 1 are skipped: log(1) == 0, so including them
    would raise ZeroDivisionError. That can only happen when ``u`` and ``v``
    are the same node.
    """
    u_nbr = set(g.neighbors(u))
    v_nbr = set(g.neighbors(v))
    cn = u_nbr.intersection(v_nbr)
    aa = 0
    for i in cn:
        degree = len(set(g.neighbors(i)))
        if degree > 1:
            aa += 1/log(degree)
    return aa

## Movie, Rating and Genre Similarities


In [None]:
from scipy import spatial
import numpy as np

def common_movie_rating_similarity(g,u,v): 
  """Cosine similarity between the ratings ``u`` and ``v`` gave to shared movies.

  Only movies adjacent to both users are considered; the 'rating' edge
  attribute supplies the values. Returns 0 when there is no shared movie.
  """
  shared_movies = set(g.neighbors(u)) & set(g.neighbors(v))
  if not shared_movies:
    return 0
  ratings_u = []
  ratings_v = []
  # Walk the shared set once so both rating lists follow the same movie order.
  for movie in shared_movies:
    ratings_u.append(g[u][movie]['rating'])
    ratings_v.append(g[v][movie]['rating'])
  distance = spatial.distance.cosine(ratings_u, ratings_v)
  return 1 - distance

In [None]:
similarity = common_movie_rating_similarity(g,'u697','u5232')
print(similarity)

NetworkXError: ignored

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Parse every movie's space-separated genre flags once, keyed by movie id.
# This replaces a full scan of `movies` plus a `to_string()` round trip for
# every (user, movie) edge.
genre_by_movie = {
    movie_id: np.array([int(flag) for flag in genre.split()])
    for movie_id, genre in zip(movies['id'], movies['genre'])
}
N_GENRES = 18  # length of each genre flag vector

genre_overviews = []
for i, user in enumerate(U):
  if i%500==0:
    print("{}/{}".format(i,len(U)))
  # Count how many of the user's rated movies carry each genre flag.
  u_genre = np.zeros(N_GENRES, dtype=int)
  for m in g.neighbors(user):
    u_genre = u_genre + genre_by_movie[m]

  # Normalize to a distribution; an all-zero count stays all-zero instead of
  # turning into NaNs through 0/0.
  total = u_genre.sum()
  u_genre = u_genre/float(total) if total else u_genre.astype(float)
  genre_overviews.append([round(x,5) for x in u_genre])

# Build the frame in one go: writing `df[col][i] = value` inside the loop is
# chained assignment and is not guaranteed to update the frame.
user_genre = pd.DataFrame({'user_id': U, 'genre_overview': genre_overviews})
user_genre.head()

In [None]:
# user_genre.to_csv("drive/My Drive/main/ml-1m/user_genre_overview.csv")
user_genre = pd.read_csv("drive/My Drive/main/ml-1m/user_genre_overview.csv")

In [None]:
x = user_genre.loc[user_genre['user_id']=='u-1']
len(x)

0

In [None]:
MOVIE_GENRES = ['Action','Adventure','Animation',"Children's",'Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
def genre_similarity(u,v):
  """Cosine similarity between the genre profiles of users ``u`` and ``v``.

  Profiles are read from the ``genre_overview`` column of ``user_genre``.
  Returns 0 if either user has no row there. A profile stored as a string
  such as "[0.1, 0.2, ...]" is parsed into a float array before comparison.
  """
  missing = object()

  def _profile(user_id):
    # First matching row wins; `missing` signals that the user is absent.
    rows = user_genre.loc[user_genre['user_id']==user_id]
    if len(rows)>0:
      return rows.iloc[0]['genre_overview']
    return missing

  def _as_vector(profile):
    # Profiles loaded from CSV come back as the text form of a list.
    if type(profile) is str:
      numbers = profile.lstrip("[").rstrip("]").split(',')
      return np.array(list(map(float, numbers)))
    return profile

  u_g = _profile(u)
  if u_g is missing:
    return 0
  v_g = _profile(v)
  if v_g is missing:
    return 0

  u_g = _as_vector(u_g)
  v_g = _as_vector(v_g)
  return 1 - spatial.distance.cosine(u_g, v_g)

In [None]:
genre_similarity('u697','u5232')

0.5674165814675588

In [None]:
user_genre.head()

Unnamed: 0.1,Unnamed: 0,user_id,genre_overview
0,0,u4165,"[0.03297, 0.0, 0.05495, 0.04396, 0.50549, 0.02..."
1,1,u4536,"[0.17857, 0.04762, 0.0119, 0.02381, 0.09524, 0..."
2,2,u3009,"[0.2381, 0.06349, 0.0, 0.0, 0.1746, 0.07937, 0..."
3,3,u166,"[0.09401, 0.07051, 0.01998, 0.047, 0.16099, 0...."
4,4,u3596,"[0.04082, 0.03265, 0.01633, 0.01224, 0.16735, ..."


## User Feature Similarity

In [None]:
users.head()

Unnamed: 0,user_id,age_group,M,F,gender_switch,occupation_one_hot,age_group_one_hot
0,u1,1,0,1,0,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,1 0 0 0 0 0 0
1,u2,56,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0,0 0 0 0 0 0 1
2,u3,25,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0,0 0 1 0 0 0 0
3,u4,45,1,0,1,0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0,0 0 0 0 1 0 0
4,u5,25,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1,0 0 1 0 0 0 0


In [None]:
# Load the refined user table and attach the one-hot occupation strings from `users`.
users_refined = pd.read_csv("/content/drive/MyDrive/main/ml-1m/users_refined.csv")
# Column assignment aligns on the row index, so this presumably relies on both
# tables listing users in the same order -- TODO confirm.
users_refined["occupation_one_hot"] = users["occupation_one_hot"]
# Drop the raw occupation code and the index column saved with the CSV.
users_refined = users_refined.drop(columns=["occupation","Unnamed: 0"])
users_refined.head()

NameError: ignored

In [None]:
# Build one integer feature vector per row of `users_refined`:
# [age_group, gender_switch, zip_zones, *occupation one-hot flags].
# Iterating over `users_refined` itself (rather than range(len(users))) keeps
# the number of vectors equal to the number of rows they are assigned to.
fv = []
feature_rows = zip(
    users_refined["age_group"],
    users_refined["gender_switch"],
    users_refined["zip_zones"],
    users_refined["occupation_one_hot"],
)
for age_group, gender_switch, zip_zone, occupation in feature_rows:
  feature_vector = [int(age_group), int(gender_switch), int(zip_zone)]
  feature_vector += [int(flag) for flag in occupation.split()]
  fv.append(feature_vector)
users_refined["feature_vector"] = fv
users_refined.head()

Unnamed: 0,user_id,age_group,gender_switch,zip_zones,occupation_one_hot,feature_vector
0,u1,0,0,9,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,"[0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,u2,6,1,14,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0,"[6, 1, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,u3,2,1,11,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0,"[2, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,u4,4,1,0,0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0,"[4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,u5,2,1,11,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1,"[2, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# users_refined.to_csv("/content/drive/MyDrive/main/ml-1m/users_with_feature_vector.csv", index=False)

In [None]:
from scipy import spatial
import numpy as np
import ast
def user_feature_similarity(u,v):
  """Cosine similarity between the feature vectors of users ``u`` and ``v``.

  Vectors are read from ``users_refined["feature_vector"]``. Both storage
  forms are accepted: a real list (as built in memory) or its text form
  "[0, 1, ...]" (as read back from CSV). Previously only the text form
  worked, because ``ast.literal_eval`` rejects a list.

  Raises IndexError if either user id is not present in ``users_refined``.
  """
  def _feature_vector(user_id):
    # First matching row wins, as before.
    row = users_refined.index[users_refined['user_id'] == user_id].to_list()[0]
    feature = users_refined["feature_vector"][row]
    if isinstance(feature, str):
      feature = ast.literal_eval(feature)
    return np.array(feature)

  u_feature = _feature_vector(u)
  v_feature = _feature_vector(v)
  return (1 - spatial.distance.cosine(u_feature,v_feature))

In [None]:
user_feature_similarity('u1','u5')

0.9701221217219766

In [None]:
users_refined = pd.read_csv("/content/drive/MyDrive/main/ml-1m/users_with_feature_vector.csv")
users_refined.head(3)

Unnamed: 0,user_id,age_group,gender_switch,zip_zones,occupation_one_hot,feature_vector
0,u1,0,0,9,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,"[0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,u2,6,1,14,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0,"[6, 1, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,u3,2,1,11,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0,"[2, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
user_feature_similarity('u634','u3377') 

0.0

In [None]:
users_refined[users_refined["user_id"]=='u634']

Unnamed: 0,user_id,age_group,gender_switch,zip_zones,occupation_one_hot,feature_vector
633,u634,0,0,9,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,"[0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [None]:
users_refined[users_refined["user_id"]=='u3377']

Unnamed: 0,user_id,age_group,gender_switch,zip_zones,occupation_one_hot,feature_vector
3376,u3377,2,1,0,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0,"[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Similar Users

In [None]:
from itertools import combinations

# Keep the node list `U` intact and store the pairs under their own name.
# Rebinding `U` to the pair list would make every later use of `U` as
# "list of user ids" iterate over tuples on a fresh top-to-bottom run.
user_pairs = list(combinations(U,2))

# Split the pairs into five consecutive chunks; the last chunk takes any remainder.
u_len = len(user_pairs)
step_size = u_len//5
u1 = pd.DataFrame(user_pairs[:step_size])
print('u1 ',len(u1))
u2 = pd.DataFrame(user_pairs[step_size:2*step_size])
print('u2 ',len(u2))
u3 = pd.DataFrame(user_pairs[2*step_size:3*step_size])
print('u3 ',len(u3))
u4 = pd.DataFrame(user_pairs[3*step_size:4*step_size])
print('u4 ',len(u4))
u5 = pd.DataFrame(user_pairs[4*step_size:])
print('u5 ',len(u5))
# Sanity check: the chunk sizes must add up to the total number of pairs.
print(len(u1)+len(u2)+len(u3)+len(u4)+len(u5))
print(len(user_pairs))

u1  2003010
u2  2003010
u3  2003010
u4  2003010
u5  2003010
10015050
10015050


In [None]:
# Write each chunk of user pairs to its own CSV (without the row index).
u1.to_csv("/content/drive/MyDrive/main/ml-1m/user_group_colab.csv",index = False)
u2.to_csv("/content/drive/MyDrive/main/ml-1m/user_sneha_colab.csv",index = False)
u3.to_csv("/content/drive/MyDrive/main/ml-1m/user_sneha_pycharm.csv",index = False)
u4.to_csv("/content/drive/MyDrive/main/ml-1m/user_srijeet_colab.csv",index = False)
u5.to_csv("/content/drive/MyDrive/main/ml-1m/user_srijeet_anaconda.csv",index = False)

In [None]:
len(U)  # training set has 4476 users

4476

In [None]:
u1 = pd.read_csv("/content/drive/MyDrive/main/ml-1m/may30_transfer_file/user_srijeet_anaconda.csv")
# u1 = list(u1['1'])
len(u1)

2003010

In [None]:
u1.head()

Unnamed: 0,0,1
0,u2254,u2223
1,u2254,u2511
2,u2254,u3194
3,u2254,u1455
4,u2254,u3685


In [None]:
import warnings
warnings.filterwarnings("ignore")
from itertools import combinations


feature_columns = ["node1", "node2", "cmrs", "genre", "cn","aa","hpi", "jc", "ra", "si","lhn","scs","hdi","user_feature_sim"]

# One record per user pair; turned into a DataFrame in a single step afterwards.
user_user_links_dict = []
count = 0
# Only the second half of the pair table is processed in this run.
second_half = range(len(u1)//2,len(u1))
for i in second_half:
    u = u1['0'][i]
    v = u1['1'][i]
    if count%10000 == 0:
      print("Done with {} node-pairs".format(count))

    # Key order fixes the column order of the resulting frame.
    user_user_links_dict.append({
        "node1": u,
        "node2": v,
        "user_feature_sim": user_feature_similarity(u,v),
        "cn": common_neighbors(g,u,v),
        "jc": jaccard_coefficient(g,u,v),
        "genre": genre_similarity(u,v),
        "cmrs": common_movie_rating_similarity(g,u,v),
        "aa": adamic_adar(g,u,v),
        "hpi": hub_promoted_index(g,u,v),
        "ra": resource_allocation(g,u,v),
        "si": sorensen_index(g,u,v),
        "lhn": leicht_holme_nerman(g,u,v),
        "scs": salton_cosine_similarity(g,u,v),
        "hdi": hub_depressed_index(g,u,v),
    })
    count+=1

user_user_links = pd.DataFrame.from_dict(user_user_links_dict)

user_user_links.head()

Done with 0 node-pairs
Done with 10000 node-pairs
Done with 20000 node-pairs
Done with 30000 node-pairs
Done with 40000 node-pairs
Done with 50000 node-pairs
Done with 60000 node-pairs
Done with 70000 node-pairs
Done with 80000 node-pairs
Done with 90000 node-pairs
Done with 100000 node-pairs
Done with 110000 node-pairs
Done with 120000 node-pairs
Done with 130000 node-pairs
Done with 140000 node-pairs
Done with 150000 node-pairs
Done with 160000 node-pairs
Done with 170000 node-pairs
Done with 180000 node-pairs
Done with 190000 node-pairs
Done with 200000 node-pairs
Done with 210000 node-pairs
Done with 220000 node-pairs
Done with 230000 node-pairs
Done with 240000 node-pairs
Done with 250000 node-pairs
Done with 260000 node-pairs
Done with 270000 node-pairs
Done with 280000 node-pairs
Done with 290000 node-pairs
Done with 300000 node-pairs
Done with 310000 node-pairs
Done with 320000 node-pairs
Done with 330000 node-pairs
Done with 340000 node-pairs
Done with 350000 node-pairs
Done w

Unnamed: 0,node1,node2,user_feature_sim,cn,jc,genre,cmrs,aa,hpi,ra,si,lhn,scs,hdi
0,u2821,u451,0.868711,9,0.03913,0.886202,0.961151,1.352425,0.3,0.015471,0.037657,0.001435,0.11366,0.043062
1,u2821,u3980,0.828571,0,0.0,0.554629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,u2821,u101,0.797081,2,0.014925,0.578206,0.993884,0.263363,0.066667,0.001007,0.014706,0.000629,0.035466,0.018868
3,u2821,u152,0.824786,4,0.08,0.86311,0.949947,0.556525,0.166667,0.003051,0.074074,0.005556,0.149071,0.133333
4,u2821,u2690,0.886405,1,0.019231,0.764847,1.0,0.132832,0.043478,0.000538,0.018868,0.001449,0.038069,0.033333


In [None]:
len(user_user_links_dict)

1001505

In [None]:
#user_user_links.to_csv("drive/My Drive/main/ml-1m/user_user_links_srijeet_anaconda.csv")

In [None]:
# Scale Adamic-Adar scores by their maximum, keeping the raw values in `un_normed_aa`.
# The raw column is saved only once, so re-running this cell does not replace
# it with already-normalized values.
if "un_normed_aa" not in user_user_links.columns:
  user_user_links["un_normed_aa"] = user_user_links["aa"]
max_aa = user_user_links["un_normed_aa"].max()
# Leave the scores untouched if the maximum is 0 instead of dividing by zero.
aa = user_user_links["un_normed_aa"]/max_aa if max_aa else user_user_links["un_normed_aa"]
user_user_links["aa"] = aa

In [None]:
# Weight of each similarity column in the combined score.
hyperparameters = {"cmrs":0.2,"genre":0.2,"cn":0.,"aa":0.1,"hpi":0.025,"jc":0.025,"ra":0.01,"si":0.01,"lhn":0.01,"scs":0.01,"hdi":0.01,"user_feature_sim":0.4}

# Weighted sum of whole columns. The terms are added in dictionary order, so
# each row gets the same value as the former row-by-row Python loop, without
# one chained `df[key][i]` lookup per cell.
net_sim = sum(weight*user_user_links[key] for key, weight in hyperparameters.items())

user_user_links["weighted_similarity"] = net_sim
user_user_links.head()

In [None]:
from statistics import mean, median
import matplotlib.pyplot as plt

# Summary statistics and a histogram for every column after the two node-id columns.
for column in user_user_links.columns[2:]:
  values = user_user_links[column]
  print(column)
  print("max: ", max(values))
  print("min: ", min(values))
  print("mean: ", mean(values))
  print("median: ", median(values))
  plt.hist(values)
  plt.xlabel(column)
  plt.show()

## Users content

In [None]:
users = pd.read_csv("/content/drive/MyDrive/main/ml-1m/users_processed.csv")
users.head()

Unnamed: 0.1,Unnamed: 0,user_id,age_group,occupation,zip,M,F,gender_switch,occupation_one_hot,age_group_one_hot
0,0,u1,1,10,48067,0,1,0,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,1 0 0 0 0 0 0
1,1,u2,56,16,70072,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0,0 0 0 0 0 0 1
2,2,u3,25,15,55117,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0,0 0 1 0 0 0 0
3,3,u4,45,7,2460,1,0,1,0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0,0 0 0 0 1 0 0
4,4,u5,25,20,55455,1,0,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1,0 0 1 0 0 0 0


In [None]:
users_refined = pd.read_csv("/content/drive/MyDrive/main/ml-1m/users_refined.csv")

In [None]:
users_refined = users_refined.drop(columns="occupation")

In [None]:
users_refined["occupation"] = users["occupation_one_hot"]
users_refined.head()

In [None]:
# One integer feature vector per user:
# [age_group, gender_switch, zip_zones, *occupation one-hot flags].
fv = []
for i in range(len(users_refined)):
  leading_fields = [
      str(users_refined["age_group"][i]),
      str(users_refined["gender_switch"][i]),
      str(users_refined["zip_zones"][i]),
  ]
  occupation_flags = users_refined["occupation"][i].split()
  feature_vector = [int(token) for token in leading_fields + occupation_flags]
  fv.append(feature_vector)
users_refined["feature_vector"] = fv

In [None]:
users_refined.head()

In [None]:
# users_refined["feature_vector"][1]
users = users_refined.drop(columns="Unnamed: 0")

In [None]:
from scipy import spatial
import numpy as np
def user_feature_similarity(u,v):
  """Cosine similarity between two feature vectors ``u`` and ``v``.

  Both arguments are array-likes of numbers (the vectors themselves, not user ids).
  """
  vector_u, vector_v = np.array(u), np.array(v)
  distance = spatial.distance.cosine(vector_u, vector_v)
  return 1 - distance

In [None]:
user_user_links = pd.read_csv("drive/My Drive/main/ml-1m/user_user_links_genre90_rating90_jc25.csv")

In [None]:
user_user_links = user_user_links.drop(columns=["Unnamed: 0"])
user_user_links.head()

Unnamed: 0,node1,node2,cmrs,genre,cn,aa,hpi,jc,ra,si,lhn,scs,hdi
0,u3592,u1998,0.979146,0.989543,145,21.829119,0.407303,0.255282,0.233991,0.203366,0.001141,0.406733,0.406162
1,u3592,u411,0.962918,0.98648,211,32.403812,0.591036,0.270166,0.379729,0.212702,0.000931,0.443161,0.332283
2,u4005,u1546,0.956481,0.963056,119,17.021363,0.495833,0.257019,0.123637,0.204467,0.00145,0.415363,0.347953
3,u4005,u2683,0.971309,0.975549,109,15.310465,0.454167,0.280928,0.098996,0.219316,0.001767,0.438889,0.424125
4,u5957,u1246,0.961352,0.95857,168,25.258028,0.702929,0.261275,0.270967,0.207152,0.001229,0.454373,0.293706


In [None]:
# Map each user id to its feature vector once. The former per-pair lookup
# scanned the whole `users_refined` table twice for every row of
# `user_user_links`. drop_duplicates keeps the first row per user id, which
# matches the former "first matching index" behaviour.
feature_by_user = (
    users_refined.drop_duplicates("user_id")
    .set_index("user_id")["feature_vector"]
    .to_dict()
)

user_feature_sim = []
node_pairs = zip(user_user_links["node1"], user_user_links["node2"])
for i, (u, v) in enumerate(node_pairs):
  # A user id missing from `users_refined` raises KeyError here.
  user_sim = user_feature_similarity(feature_by_user[u], feature_by_user[v])
  user_feature_sim.append(user_sim)
  if i%10000==0:
    print("Done with: ",i)
user_user_links["user_feature_sim"] = user_feature_sim
user_user_links.head()


Done with:  0
Done with:  10000
Done with:  20000
Done with:  30000
Done with:  40000


Unnamed: 0,node1,node2,cmrs,genre,cn,aa,hpi,jc,ra,si,lhn,scs,hdi,user_feature_sim
0,u3592,u1998,0.979146,0.989543,145,21.829119,0.407303,0.255282,0.233991,0.203366,0.001141,0.406733,0.406162,0.81986
1,u3592,u411,0.962918,0.98648,211,32.403812,0.591036,0.270166,0.379729,0.212702,0.000931,0.443161,0.332283,0.980231
2,u4005,u1546,0.956481,0.963056,119,17.021363,0.495833,0.257019,0.123637,0.204467,0.00145,0.415363,0.347953,0.825723
3,u4005,u2683,0.971309,0.975549,109,15.310465,0.454167,0.280928,0.098996,0.219316,0.001767,0.438889,0.424125,0.725866
4,u5957,u1246,0.961352,0.95857,168,25.258028,0.702929,0.261275,0.270967,0.207152,0.001229,0.454373,0.293706,0.060523


In [None]:
user_user_links["user_feature_sim"].describe()

count    44514.000000
mean         0.830771
std          0.247365
min          0.000000
25%          0.790809
50%          0.952563
75%          0.989764
max          1.000000
Name: user_feature_sim, dtype: float64

In [None]:
# Weight of each similarity column in the combined score.
hyperparameters = {"cmrs":0.2,"genre":0.2,"cn":0.,"aa":0.1,"hpi":0.025,"jc":0.025,"ra":0.01,"si":0.01,"lhn":0.01,"scs":0.01,"hdi":0.01,"user_feature_sim":0.4}

# Weighted sum of whole columns. The terms are added in dictionary order, so
# each row gets the same value as the former row-by-row Python loop, without
# one chained `df[key][i]` lookup per cell.
net_sim = sum(weight*user_user_links[key] for key, weight in hyperparameters.items())

user_user_links["weighted_similarity"] = net_sim
user_user_links.head()

Done with: 0
Done with: 10000
Done with: 20000
Done with: 30000
Done with: 40000


Unnamed: 0,node1,node2,cmrs,genre,cn,aa,hpi,jc,ra,si,lhn,scs,hdi,user_feature_sim,weighted_similarity,un_normed_aa
0,u3592,u1998,0.979146,0.989543,145,0.100805,0.407303,0.255282,0.233991,0.203366,0.001141,0.406733,0.406162,0.81986,0.760841,21.829119
1,u3592,u411,0.962918,0.98648,211,0.149639,0.591036,0.270166,0.379729,0.212702,0.000931,0.443161,0.332283,0.980231,0.832154,32.403812
2,u4005,u1546,0.956481,0.963056,119,0.078604,0.495833,0.257019,0.123637,0.204467,0.00145,0.415363,0.347953,0.825723,0.751807,17.021363
3,u4005,u2683,0.971309,0.975549,109,0.070703,0.454167,0.280928,0.098996,0.219316,0.001767,0.438889,0.424125,0.725866,0.716997,15.310465
4,u5957,u1246,0.961352,0.95857,168,0.11664,0.702929,0.261275,0.270967,0.207152,0.001229,0.454373,0.293706,0.060523,0.456237,25.258028


In [None]:
user_user_links["aa"].describe()

count    44514.000000
mean         0.173852
std          0.096330
min          0.005495
25%          0.110127
50%          0.165587
75%          0.226487
max          1.000000
Name: aa, dtype: float64

In [None]:
user_user_links["weighted_similarity"].describe()

count    44514.000000
mean         0.766836
std          0.100452
min          0.407697
25%          0.749513
50%          0.809303
75%          0.827061
max          0.965916
Name: weighted_similarity, dtype: float64

In [None]:
user_user_links.to_csv("/content/drive/MyDrive/main/ml-1m/user_user_link_final.csv", index_label=False)
uul = pd.read_csv("/content/drive/MyDrive/main/ml-1m/user_user_link_final.csv")
uul.head()

Unnamed: 0,node1,node2,cmrs,genre,cn,aa,hpi,jc,ra,si,lhn,scs,hdi,user_feature_sim,weighted_similarity,un_normed_aa
0,u3592,u1998,0.979146,0.989543,145,0.100805,0.407303,0.255282,0.233991,0.203366,0.001141,0.406733,0.406162,0.81986,0.760841,21.829119
1,u3592,u411,0.962918,0.98648,211,0.149639,0.591036,0.270166,0.379729,0.212702,0.000931,0.443161,0.332283,0.980231,0.832154,32.403812
2,u4005,u1546,0.956481,0.963056,119,0.078604,0.495833,0.257019,0.123637,0.204467,0.00145,0.415363,0.347953,0.825723,0.751807,17.021363
3,u4005,u2683,0.971309,0.975549,109,0.070703,0.454167,0.280928,0.098996,0.219316,0.001767,0.438889,0.424125,0.725866,0.716997,15.310465
4,u5957,u1246,0.961352,0.95857,168,0.11664,0.702929,0.261275,0.270967,0.207152,0.001229,0.454373,0.293706,0.060523,0.456237,25.258028


## Top-N similar users


In [None]:
users.head()

Unnamed: 0,user_id,age_group,gender_switch,zip_zones,occupation,feature_vector
0,u1,0,0,9,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0,"[0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,u2,6,1,14,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0,"[6, 1, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,u3,2,1,11,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0,"[2, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,u4,4,1,0,0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0,"[4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,u5,2,1,11,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1,"[2, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
from scipy import spatial
import numpy as np
def user_feature_similarity(u,v):
    """Cosine similarity between the feature vectors of user ids ``u`` and ``v``.

    Vectors are looked up in ``users["feature_vector"]`` by ``user_id`` and
    converted to float arrays before comparison.
    """
    def _stored_vector(user_id):
        # First row whose user_id matches.
        row_label = users.index[users['user_id']== user_id][0]
        return users["feature_vector"][row_label]

    raw_u = _stored_vector(u)
    raw_v = _stored_vector(v)
    vector_u = np.array(raw_u).astype('float')
    vector_v = np.array(raw_v).astype('float')
    similarity = 1 - spatial.distance.cosine(vector_u,vector_v)
    return similarity

In [None]:
def cn_based_similarity(g,u,v):
  """Compute all common-neighbour based metrics for ``u`` and ``v`` in one pass.

  Returns the tuple ``(cn, jc, si, lhn, scs, hpi, hdi, ra, aa)``:
  common-neighbour count, Jaccard, summed-degree ratio, Leicht-Holme-Newman,
  Salton cosine, hub promoted, hub depressed, resource allocation and
  Adamic-Adar.

  Any ratio whose denominator is zero (isolated nodes) is reported as 0.0
  instead of raising ZeroDivisionError. Shared neighbours of degree 1 are
  skipped in the Adamic-Adar sum because log(1) == 0.

  NOTE(review): ``si`` omits the factor 2 of the textbook Sorensen index;
  left unchanged here -- confirm the intended scale.
  """
  u_nbr = set(g.neighbors(u))
  v_nbr = set(g.neighbors(v))
  cn_set = u_nbr.intersection(v_nbr)
  cn = len(cn_set)
  k_u, k_v = len(u_nbr), len(v_nbr)

  def _ratio(denominator):
    # cn / denominator, with 0.0 standing in for an undefined ratio.
    return cn/denominator if denominator else 0.0

  jc = _ratio(len(u_nbr.union(v_nbr)))
  si = _ratio(k_u+k_v)
  lhn = _ratio(k_u*k_v)
  scs = _ratio(sqrt(k_u*k_v))
  hpi = _ratio(min(k_u,k_v))
  hdi = _ratio(max(k_u,k_v))

  ra = 0
  aa = 0
  for i in cn_set:
    degree = len(set(g.neighbors(i)))
    ra += 1/float(degree)
    if degree > 1:
      aa += 1/log(degree)
  return cn, jc, si, lhn, scs, hpi, hdi, ra, aa

In [None]:
sim_metrics = ["cmrs", "genre", "cn", "aa", "hpi", "jc", "ra", "si", "lhn", "scs", "hdi", "user_feature"]
hyperparameters = {"cmrs":0.2,"genre":0.2,"cn":0.,"aa":0.1,"hpi":0.025,"jc":0.025,"ra":0.01,"si":0.01,"lhn":0.01,"scs":0.01,"hdi":0.01,"user_feature":0.4}

def net_similarity(g,u,v, genre_threshold=0.90, cmrs_threshold=0.90):
    """Weighted overall similarity between users ``u`` and ``v``.

    The user-feature and genre similarities always contribute. The more
    expensive terms are computed in a cascade:

    * common-movie rating similarity only if the raw genre similarity
      reaches ``genre_threshold``;
    * the common-neighbour metrics only if the raw rating similarity
      reaches ``cmrs_threshold`` and the users share at least one movie.

    Every term is multiplied by its weight from ``hyperparameters`` before
    being summed. Terms that are skipped contribute 0.

    The thresholds are applied to the raw similarities. Comparing the
    already-weighted values (at most 0.2 for genre and cmrs) against 0.90,
    and testing ``cn != 0`` after multiplying by its weight of 0, meant the
    cascade could never be entered.

    NOTE(review): ``aa`` is used unnormalized here, while the DataFrame
    pipeline divides it by its maximum first -- confirm which scale is
    intended for the 0.1 weight.
    """
    cmrs = cn = aa = hpi = jc = ra = si = lhn = scs = hdi = 0
    user_feature = user_feature_similarity(u,v)*hyperparameters["user_feature"]

    raw_genre = genre_similarity(u,v)
    genre = raw_genre*hyperparameters["genre"]
    if raw_genre >= genre_threshold:
        raw_cmrs = common_movie_rating_similarity(g,u,v)
        cmrs = raw_cmrs*hyperparameters["cmrs"]
        if raw_cmrs >= cmrs_threshold:
            raw_cn = common_neighbors(g,u,v)
            cn = raw_cn*hyperparameters["cn"]

            if raw_cn != 0:
                aa = adamic_adar(g,u,v)*hyperparameters["aa"]
                hpi = hub_promoted_index(g,u,v)*hyperparameters["hpi"]
                jc = jaccard_coefficient(g,u,v)*hyperparameters["jc"]
                ra = resource_allocation(g,u,v)*hyperparameters["ra"]
                si = sorensen_index(g,u,v)*hyperparameters["si"]
                lhn = leicht_holme_nerman(g,u,v)*hyperparameters["lhn"]
                scs = salton_cosine_similarity(g,u,v)*hyperparameters["scs"]
                hdi = hub_depressed_index(g,u,v)*hyperparameters["hdi"]

    net_sim = cmrs + genre + cn + aa + hpi + jc + ra + si + lhn + scs + hdi + user_feature
    return net_sim

In [None]:
import heapq
# For each user, rank all other users by net_similarity and keep the `n` most similar.
# NOTE(review): assumes `U` is the list of user ids at this point -- verify it
# has not been rebound to something else by an earlier cell.
n = 10
user_id = []
top_n = []
# count = 0
for u in U:
    # Similarity of `u` to every other user, keyed by the other user's id.
    sim_users = {}
    for v in U:
      if v!=u:
        sim_users[v] = net_similarity(g,u,v)
    # Ids of the n users with the highest similarity to `u`.
    top_n_sim_users = heapq.nlargest(n, sim_users, key=sim_users.get)
    top_n.append(top_n_sim_users)
    user_id.append(u)
    # count += 1
    # if count%500 == 0:
    #     print("Done with ", count)
    # Stops after the first user, so only one ranking is produced below.
    break
# print(sim_users)

print(top_n_sim_users)

# user_similar_users = pd.DataFrame(columns=["user", "top_n_similar_users"])
# user_similar_users["user"] = user_id
# user_similar_users["top_n_similar_users"] = top_n
# user_similar_users.head()

['u1090', 'u5962', 'u3628', 'u5134', 'u1064', 'u2490', 'u1352', 'u5259', 'u96', 'u259']


In [None]:
sim_users['u96']

0.5904346112232448

In [None]:
common_movie_rating_similarity(g,'u5527','u1979')

0

In [None]:
uul = pd.read_csv("/content/drive/MyDrive/main/ml-1m/user_user_link_final.csv")

In [None]:
uul.head()

Unnamed: 0,node1,node2,cmrs,genre,cn,aa,hpi,jc,ra,si,lhn,scs,hdi,user_feature_sim,weighted_similarity,un_normed_aa
0,u3592,u1998,0.979146,0.989543,145,0.100805,0.407303,0.255282,0.233991,0.203366,0.001141,0.406733,0.406162,0.81986,0.760841,21.829119
1,u3592,u411,0.962918,0.98648,211,0.149639,0.591036,0.270166,0.379729,0.212702,0.000931,0.443161,0.332283,0.980231,0.832154,32.403812
2,u4005,u1546,0.956481,0.963056,119,0.078604,0.495833,0.257019,0.123637,0.204467,0.00145,0.415363,0.347953,0.825723,0.751807,17.021363
3,u4005,u2683,0.971309,0.975549,109,0.070703,0.454167,0.280928,0.098996,0.219316,0.001767,0.438889,0.424125,0.725866,0.716997,15.310465
4,u5957,u1246,0.961352,0.95857,168,0.11664,0.702929,0.261275,0.270967,0.207152,0.001229,0.454373,0.293706,0.060523,0.456237,25.258028


In [None]:
len(set(uul["node1"]).union(set(uul["node2"])))

2723