# Project 1
### DATA 620
### David Moste &emsp; Euclid Zhang &emsp; Samuel Reeves  
### 6/4/2021

Import required libraries

In [91]:
import requests
import networkx as nx
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
pd.options.mode.chained_assignment = None 

Load the edges data

In [2]:
# Download the raw edge list. A timeout keeps the cell from hanging forever, and
# raise_for_status() fails fast on an HTTP error instead of letting an error page
# be parsed as edge data further down.
edges_http = requests.get(
    "https://raw.githubusercontent.com/ezaccountz/Data_620/main/project%201/0.edges",
    timeout=30,
)
edges_http.raise_for_status()
response = edges_http.text

In [7]:
# One record per line. splitlines() handles both a missing final newline (the old
# `[:-1]` slice would silently drop the last record in that case) and CRLF endings;
# blank lines are skipped so they cannot turn into malformed rows.
lines = [line for line in response.splitlines() if line.strip()]

In [8]:
# Each line is "<node1> <node2>"; split it into its two endpoint IDs (kept as strings).
edge_rows = [str.split(edge_line, " ") for edge_line in lines]
data_full = pd.DataFrame(edge_rows, columns=['node1', 'node2'])
data_full

Unnamed: 0,node1,node2
0,236,186
1,122,285
2,24,346
3,271,304
4,176,9
...,...,...
5033,171,58
5034,326,20
5035,85,75
5036,98,332


Load the features data
The features of the nodes are anonymized. Feature IDs 77 and 78 are the features that represent gender.

In [9]:
# Download the node-feature matrix. The timeout prevents an indefinite hang and
# raise_for_status() stops the notebook on an HTTP error rather than parsing an
# error page as feature rows.
features_http = requests.get(
    "https://raw.githubusercontent.com/ezaccountz/Data_620/main/project%201/0.feat",
    timeout=30,
)
features_http.raise_for_status()
response = features_http.text

# One node per line; splitlines() tolerates a missing trailing newline and CRLF
# endings, and blank lines are skipped so they cannot become malformed rows.
lines = [line for line in response.splitlines() if line.strip()]

# Space-separated values; every column is left as a string at this stage.
feature_full = pd.DataFrame(data=[line.split(" ") for line in lines])
feature_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,215,216,217,218,219,220,221,222,223,224
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,343,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
343,344,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
344,345,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
345,346,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Since the two gender features are mutually exclusive, we only need to use one of them. We select feature 77, which is at column index 78 because column 0 holds the node ID.

In [72]:
# Column 0 is used as the node ID and column 78 as the gender flag.
# Build the two-column frame in a single chain so that nothing is assigned into a
# subset of feature_full (which would depend on the chained-assignment warning
# being silenced), and cast the string values to integers in one vectorized step.
feature_gender = (
    feature_full[[0, 78]]
    .rename(columns={0: 'Node ID', 78: 'Gender'})
    .astype(int)
)
feature_gender

Unnamed: 0,Node ID,Gender
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
342,343,0
343,344,0
344,345,1
345,346,0


Create a graph

In [53]:
# Build the graph from the edge table, using 'node1' and 'node2' as the edge endpoints.
g = nx.from_pandas_edgelist(data_full, source='node1', target='node2')

Run summary

In [105]:
# Per-node metrics keyed by node; each dictionary entry becomes one column of the
# summary table, and the node identifiers become its row index.
node_metrics = {
    'DEGREE': dict(g.degree()),
    'DEGREE_CENTRALITY': nx.degree_centrality(g),
    'EIGENVECTOR': nx.eigenvector_centrality(g),
    'CLOSENESS_CENTRALITY': nx.closeness_centrality(g),
    'BETWEENNESS_CENTRALITY': nx.betweenness_centrality(g),
}
summarys = pd.DataFrame(node_metrics)

Convert the row index back to Node ID and sort them by the Node ID

In [106]:
# Move the node identifiers out of the row index into an 'index' column, convert
# them to integers so they sort numerically, order the rows, and rename the column.
summarys = (
    summarys
    .reset_index()
    .assign(index=lambda frame: [int(node) for node in frame['index']])
    .sort_values(by='index')
    .rename(columns={'index': 'Node ID'})
)
summarys

Unnamed: 0,Node ID,DEGREE,DEGREE_CENTRALITY,EIGENVECTOR,CLOSENESS_CENTRALITY,BETWEENNESS_CENTRALITY
125,1,16,0.048193,0.021892,0.301867,0.004335
197,2,9,0.027108,0.000002,0.200411,0.000177
31,3,16,0.048193,0.047858,0.298711,0.005137
293,4,9,0.027108,0.000029,0.204187,0.011466
209,5,12,0.036145,0.016386,0.292592,0.008219
...,...,...,...,...,...,...
149,343,17,0.051205,0.000007,0.231914,0.008575
58,344,8,0.024096,0.021933,0.280325,0.000165
96,345,15,0.045181,0.045091,0.301289,0.000452
5,346,26,0.078313,0.009060,0.284127,0.004653


Left-join the gender indicator onto the summary table, matching on Node ID

In [107]:
# Look up each row's gender flag by its Node ID; rows of the summary table are
# all kept (left join) and keep their existing row index.
gender_by_node = feature_gender.set_index('Node ID')
summarys = summarys.join(gender_by_node, on='Node ID', how='left')
summarys

Unnamed: 0,Node ID,DEGREE,DEGREE_CENTRALITY,EIGENVECTOR,CLOSENESS_CENTRALITY,BETWEENNESS_CENTRALITY,Gender
125,1,16,0.048193,0.021892,0.301867,0.004335,1
197,2,9,0.027108,0.000002,0.200411,0.000177,0
31,3,16,0.048193,0.047858,0.298711,0.005137,0
293,4,9,0.027108,0.000029,0.204187,0.011466,0
209,5,12,0.036145,0.016386,0.292592,0.008219,0
...,...,...,...,...,...,...,...
149,343,17,0.051205,0.000007,0.231914,0.008575,0
58,344,8,0.024096,0.021933,0.280325,0.000165,0
96,345,15,0.045181,0.045091,0.301289,0.000452,1
5,346,26,0.078313,0.009060,0.284127,0.004653,0


Hypothesis tests of the centrality measurements between the group Gender = 0 and the group Gender = 1
The p-values are:

In [108]:
# Row masks for the two gender groups; they are the same for every metric.
in_group_0 = summarys['Gender'] == 0
in_group_1 = summarys['Gender'] == 1

# Skip the first column (Node ID) and the last column (Gender); everything in
# between is a centrality metric. Print each metric name followed by the p-value
# of the two-sample t-test between the groups.
for metric in summarys.columns[1:-1]:
    print(metric)
    metric_values = summarys[metric]
    _, p_value = ttest_ind(metric_values[in_group_0], metric_values[in_group_1])
    print(p_value)

DEGREE
0.44785766197029886
DEGREE_CENTRALITY
0.4478576619702975
EIGENVECTOR
0.4125448361237033
CLOSENESS_CENTRALITY
0.08679832799079408
BETWEENNESS_CENTRALITY
0.276876807301128


For DEGREE, DEGREE_CENTRALITY, EIGENVECTOR and BETWEENNESS_CENTRALITY, there is no strong evidence to reject the null hypothesis that the mean measurements are the same across the two groups. For CLOSENESS_CENTRALITY (p ≈ 0.087), we can reject the null hypothesis at the 10% level, though not at the 5% level. This suggests that the people in one gender may be closer to others in the network.