In [1]:
#Homophily is a network characteristic. Homophily occurs when nodes that are neighbors in a network also share a characteristic
#more often than nodes that are not network neighbors.  In this case study, we will investigate homophily of several 
#characteristics of individuals connected in social networks in rural India.

In [8]:
from collections import Counter
import numpy as np

def marginal_prob(chars):
    """
    Compute the marginal probability of each characteristic value.

    chars maps an individual to that individual's characteristic. The result
    is a dict mapping every distinct characteristic value to the fraction of
    individuals that have it (an empty input yields an empty dict).
    """
    counts = Counter(chars.values())
    # Summed through NumPy so the resulting fractions are NumPy floats.
    total = np.sum(np.array(list(counts.values())))
    return {value: count / total for value, count in counts.items()}
    
def chance_homophily(chars):
    """
    Compute the chance homophily of a characteristic.

    This is the sum of the squared marginal probabilities of the values in
    chars, i.e. the probability that two individuals drawn independently
    from the marginal distribution share the same value.
    """
    probabilities = list(marginal_prob(chars).values())
    squared = np.square(probabilities)
    return np.sum(squared)

# Toy example: one of three people likes red and two like blue,
# so the chance homophily is (1/3)**2 + (2/3)**2 = 5/9.
favorite_colors = dict(ankit="red", xiaoyu="blue", mary="blue")

color_homophily = chance_homophily(favorite_colors)
print(color_homophily)

0.5555555555555556


In [6]:
#OR:
#from collections import Counter
#import numpy as np

#def marginal_prob(chars):
#    frequencies = dict(Counter(chars.values()))
#    sum_frequencies = sum(frequencies.values())
#    return {char: freq / sum_frequencies for char, freq in frequencies.items()}
                
#def chance_homophily(chars):
#    marginal_probs = marginal_prob(chars)
#    return np.sum(np.square(list(marginal_probs.values())))

#favorite_colors = {
#    "ankit":  "red",
#    "xiaoyu": "blue",
#    "mary":   "blue"
#}

#color_homophily = chance_homophily(favorite_colors)
#print(color_homophily)
#0.5555555555555556

In [30]:
import pandas as pd

# Load the individual-level survey data; the first CSV column becomes the index.
df = pd.read_csv(
    "https://courses.edx.org/asset-v1:HarvardX+PH526x+2T2019+type@asset+block@individual_characteristics.csv",
    low_memory=False,
    index_col=0,
)

# One frame per village, selected by the value of the "village" column.
df1 = df.loc[df["village"] == 1]
df2 = df.loc[df["village"] == 2]
# Positional alternative (presumably relies on each village having 203 rows -- verify):
#df1 = pd.DataFrame(df.iloc[:203])
#df2 = pd.DataFrame(df.iloc[203:406])

df1.head()

Unnamed: 0,village,adjmatrix_key,pid,hhid,resp_id,resp_gend,resp_status,age,religion,caste,...,privategovt,work_outside,work_outside_freq,shgparticipate,shg_no,savings,savings_no,electioncard,rationcard,rationcard_colour
0,1,5,100201,1002,1,1,Head of Household,38,HINDUISM,OBC,...,PRIVATE BUSINESS,Yes,0.0,No,,No,,Yes,Yes,GREEN
1,1,6,100202,1002,2,2,Spouse of Head of Household,27,HINDUISM,OBC,...,,,,No,,No,,Yes,Yes,GREEN
2,1,23,100601,1006,1,1,Head of Household,29,HINDUISM,OBC,...,OTHER LAND,No,,No,,No,,Yes,Yes,GREEN
3,1,24,100602,1006,2,2,Spouse of Head of Household,24,HINDUISM,OBC,...,PRIVATE BUSINESS,No,,Yes,1.0,Yes,1.0,Yes,No,
4,1,27,100701,1007,1,1,Head of Household,58,HINDUISM,OBC,...,OTHER LAND,No,,No,,No,,Yes,Yes,GREEN


In [59]:
def _chars_by_pid(frame, column):
    """
    Return a dict mapping each respondent's "pid" to their value in `column`.

    Built from the columns directly, so it does not depend on the frame's
    index labels or on how many rows the frame has.
    """
    return frame.set_index("pid")[column].to_dict()


# pid -> characteristic lookups for village 1.
sex1 = _chars_by_pid(df1, "resp_gend")
caste1 = _chars_by_pid(df1, "caste")
religion1 = _chars_by_pid(df1, "religion")

# pid -> characteristic lookups for village 2 (no hard-coded row range needed).
sex2 = _chars_by_pid(df2, "resp_gend")
caste2 = _chars_by_pid(df2, "caste")
religion2 = _chars_by_pid(df2, "religion")

In [53]:
# Spot-check: look up the caste stored in caste2 for pid 202802.
caste2[202802]

'OBC'

In [61]:
#OR
#sex1 = df1.set_index("pid")["resp_gend"].to_dict()
#caste1 = df1.set_index("pid")["caste"].to_dict()
#religion1 = df1.set_index("pid")["religion"].to_dict()

#sex2 = df2.set_index("pid")["resp_gend"].to_dict()
#caste2 = df2.set_index("pid")["caste"].to_dict()
#religion2 = df2.set_index("pid")["religion"].to_dict()

In [62]:
# Chance homophily of each characteristic, per village.
sex_1 = chance_homophily(sex1)
sex_2 = chance_homophily(sex2)
caste_1 = chance_homophily(caste1)
caste_2 = chance_homophily(caste2)
religion_1 = chance_homophily(religion1)
religion_2 = chance_homophily(religion2)

# Report each value on the line after its label.
for label, value in [
    ("sex1: ", sex_1),
    ("sex2: ", sex_2),
    ("caste1: ", caste_1),
    ("caste2: ", caste_2),
    ("religion1: ", religion_1),
    ("religion2: ", religion_2),
]:
    print(label)
    print(value)

sex1: 
0.5027299861680701
sex2: 
0.5005945303210464
caste1: 
0.6741488509791551
caste2: 
0.425368244800893
religion1: 
0.9804896988521925
religion2: 
1.0


In [65]:
def homophily(G, chars, IDs):
    """
    Compute the observed homophily of a network for one characteristic.

    Parameters:
        G: graph object whose edges() yields (n1, n2) node pairs.
        chars: dict mapping an individual's ID to their characteristic.
        IDs: dict mapping each node of G to the corresponding individual ID.

    Returns:
        The fraction of edges whose two endpoints share the same
        characteristic, counted only over edges where both endpoints have
        an entry in chars. Returns NaN when no such edge exists, since the
        proportion is then undefined.

    Raises:
        KeyError: if a node that has to be looked up is missing from IDs.
    """
    num_same_ties = 0
    num_ties = 0
    # Every pair yielded by G.edges() is an edge already, so no extra
    # has_edge() check is needed.
    for n1, n2 in G.edges():
        # Skip ties where either endpoint has no recorded characteristic.
        if IDs[n1] not in chars or IDs[n2] not in chars:
            continue
        num_ties += 1
        if chars[IDs[n1]] == chars[IDs[n2]]:
            num_same_ties += 1
    if num_ties == 0:
        # Avoid ZeroDivisionError on graphs with no usable ties.
        return float("nan")
    return num_same_ties / num_ties


In [90]:
# Locations of the per-village key files.
data_filepath1 = "https://courses.edx.org/asset-v1:HarvardX+PH526x+2T2019+type@asset+block@key_vilno_1.csv"
data_filepath2 = "https://courses.edx.org/asset-v1:HarvardX+PH526x+2T2019+type@asset+block@key_vilno_2.csv"

# Read each key file as a DataFrame, using its first column as the index.
pid1, pid2 = (
    pd.read_csv(path, index_col=0)
    for path in (data_filepath1, data_filepath2)
)

In [77]:
# Inspect the row at position 100 of the village-1 key table.
pid1.iloc[100]

0    102205
Name: 100, dtype: int64

In [86]:
import networkx as nx
# Adjacency matrices of the two village networks (first CSV column is the index).
A1 = np.array(
    pd.read_csv(
        "https://courses.edx.org/asset-v1:HarvardX+PH526x+2T2019+type@asset+block@adj_allVillageRelationships_vilno1.csv",
        index_col=0,
    )
)
A2 = np.array(
    pd.read_csv(
        "https://courses.edx.org/asset-v1:HarvardX+PH526x+2T2019+type@asset+block@adj_allVillageRelationships_vilno2.csv",
        index_col=0,
    )
)
G1 = nx.to_networkx_graph(A1)
G2 = nx.to_networkx_graph(A2)

# Re-read the key files as plain dicts (row label -> value of the column
# named "0"); this replaces the DataFrame versions of pid1 / pid2.
pid1, pid2 = (
    pd.read_csv(path, dtype=int)["0"].to_dict()
    for path in (data_filepath1, data_filepath2)
)

# Observed homophily for every characteristic in both villages,
# each value printed on the line after its label.
for label, graph, traits, node_ids in [
    ("sex1: ", G1, sex1, pid1),
    ("sex2: ", G2, sex2, pid2),
    ("caste1: ", G1, caste1, pid1),
    ("caste2: ", G2, caste2, pid2),
    ("religion1: ", G1, religion1, pid1),
    ("religion2: ", G2, religion2, pid2),
]:
    print(label)
    print(homophily(graph, traits, node_ids))

sex1: 
0.5908629441624366
sex2: 
0.5658073270013568
caste1: 
0.7959390862944162
caste2: 
0.8276797829036635
religion1: 
0.9908629441624366
religion2: 
1.0


In [87]:
#The results show that the observed homophily is higher than the chance homophily (or equal to it, as for religion in village 2).
#Observed homophily is measured only over pairs of nodes connected by an edge, so connected nodes share a characteristic
#more often than would be expected by chance.