## Build a contact network

# Download dataset

In [1]:
#!wget https://lp-prod-resources.s3.amazonaws.com/628/66549/2021-06-25-19-30-14/PeopleLocations.csv

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import datetime as dt
from sklearn import preprocessing
from geopy.distance import geodesic 

In [3]:
# read in csv to pd.DataFrame
df = pd.read_csv("PeopleLocations.csv", sep = ";", 
                dtype={'id': np.str_, 'Lat': np.float32, 'Lon': np.float32})

In [4]:
# convert type to category
df["Covid19"] = df["Covid19"].astype("category")

In [5]:
# check the dtypes
df.dtypes

ID           object
Lat         float32
Lon         float32
Date         object
Time         object
Covid19    category
dtype: object

In [6]:
# rename the first column 
df.rename(columns={"ID": "IDcol"}, inplace=True)

In [7]:
# store the header of the first colum of dataframe df in variable "IDcol"
IDcol = df.columns[0]

# determine a list of different individuals for which there is at least one record in the csv file
uniquepart = df[IDcol].unique()

In [8]:
# a bit of the data
df.head(10)

Unnamed: 0,IDcol,Lat,Lon,Date,Time,Covid19
0,Person1,60.18539,25.009689,09-06-2021,13:52:09,n
1,Person2,60.185387,25.009678,09-06-2021,13:52:09,n
2,Person3,60.18539,25.009695,09-06-2021,13:52:09,n
3,Person4,60.18539,25.009689,09-06-2021,13:52:09,y
4,Person5,60.185387,25.009672,09-06-2021,13:52:09,n
5,Person6,60.185394,25.009706,09-06-2021,13:52:09,n
6,Person7,60.185383,25.009668,09-06-2021,13:52:09,y
7,Person8,60.185387,25.009686,09-06-2021,13:52:09,n
8,Person9,60.185379,25.009634,09-06-2021,13:52:09,n
9,Person10,60.185387,25.009678,09-06-2021,13:52:09,n


In [21]:
nrfeatures=6

# store the first colum of dataframe df in variable "IDcol"

IDcol = df.columns[0]

# determine a list of different individuals for which there is at least one record in the csv file

uniquepart = df[IDcol].unique()       

# count the number of different individuals. this will be the number of nodes in the contace network

nrnodes = len(uniquepart)

To build the contact network we add an edge between nodes representing individuals for which we can find location recording which are closer than 2 meters. 

In [25]:
# create networkx object `G` by adding nodes for each individual with a record in "PeopleLocations.csv"

G = nx.Graph()

# we use a label encoder used to transfrom values 'y'/'n' for Covid19 infection to values 1 and 0
le = preprocessing.LabelEncoder()                
le.fit(["n", "y"])

# iterate over individuals represnted by network nodes indexed by nodeidx=0,1,...

for nodeidx in range(nrnodes): 
    
    # read in identifier of individual from list `uniquepart` and store in variable "personid"
    personid = uniquepart[nodeidx]
    
    # create dataframe "dmydf" by selecting all rows from dataframe `df` with attribute `ID` equal to `personid`
    dmydf = pd.DataFrame(df.loc[df['IDcol'] == personid].copy())
    # create dataframe "dmydf_features" by selecting all rows from dataframe `df` with attribute `ID` equal to `personid`
    dmydf_features = pd.DataFrame(df.loc[df['IDcol'] == personid].copy())
    
    # reset index of dataframe dmydf 
    dmydf.reset_index(drop=True, inplace=True) 
    # reset index of dataframe dmydf_features 
    dmydf_features.reset_index(drop=True, inplace=True) 
    
    # read in latitude of first location recording in `dmydf` and store in variable `latitude`
    latitude=dmydf.loc[0,['Lat']][0]
    
    # read in longitude of first location recording in `dmydf` and store in variable `longitude`
    longitude=dmydf.loc[0,['Lon']][0]
    
    # read in Covid19 infection status of first location recording in `dmydf` and store in variable `valtmp`
    valtmp=dmydf.loc[0,['Covid19']][0]
    
    # use le.transform() to map the infection status `valtmp` as `y`->1 and `n`-> 0
    infected=le.transform([valtmp])
    
    # read in the date of the recording and store in variable date_tmp
    date_tmp = dt.datetime.strptime(dmydf.loc[0,['Date']][0], '%d-%m-%Y').date() 
    
    # read in the time of the recording and store in variable time_tmp
    time_tmp = dt.datetime.strptime(dmydf.loc[0,['Time']][0], '%H:%M:%S').time()
    
    # combine date and time of location racording using `datetime.combine()
    mydatetime = dt.datetime.combine(date_tmp, time_tmp)
    
    # add a node with index `nodeidx`
    G.add_node(nodeidx)
    # set the node attribute "name" to the string stored in "personid"
    G.nodes[nodeidx]['name']= personid
    # set the node attribute "coords" to a numpy array with entries "latitude" and "longitude"
    G.nodes[nodeidx]['coords']= np.array([latitude,longitude])
    # set the node attribute "timestamp" to the value of "mydatetime"
    G.nodes[nodeidx]['timestamp'] = mydatetime
    # set the node attribute "y" equal to 1 if individual has been reported as Covid-19 infected and 0 otherwise
    G.nodes[nodeidx]['y'] = infected[0] 
    # set the node attribute "w" to a numpy array of shape (6,) and entries all zero
    G.nodes[nodeidx]['w'] = np.zeros(nrfeatures)    
    # set the node attribute "b" to 0.0
    G.nodes[nodeidx]['b'] = 0.0  

    # read in the features x1,...,x6 from dataframe "dmydf_features" and store in numpy array "dmyvec"
    dmyvec = np.zeros(nrfeatures)
    for iterfeature in range(nrfeatures):
        keytmp = "x%d"% (iterfeature+1)
        dmyvec[iterfeature]=dmydf_features.loc[0,[keytmp]][0]
    
    # set the node attribute "x" to the numpy array "dmyvec"
    G.nodes[nodeidx]['x'] = dmyvec


KeyError: "None of [Index(['x1'], dtype='object')] are in the [index]"

In [None]:
# two nested for-loops over node indices 0,1,...,nrnodes-1 
# the loop variables are named "nodeidx1" and "nodeidx2"

for nodeidx1 in range(nrnodes): 
    for nodeidx2 in range(nrnodes): 
        # test if nodeidx1 is different from nodeidx2
        if nodeidx1!=nodeidx2 : 
            # compute the geodesic distance between individualas "nodeidx1" and "nodeidx2" in meters 
            nodedist=geodesic(G.nodes[nodeidx1]['coords'],G.nodes[nodeidx2]['coords']).meters
            # if distance is below two meters connect invididuals by and edge. 
            if  nodedist<2: 
                G.add_edge(nodeidx1,nodeidx2)

In [None]:
# Create new graph object "SubGraph" using G.subgraph() consisting of nodes 0,1,2,3,4
SubGraph = G.subgraph([0,1,2,3,4])

# read out node attribute `b`from all nodes in "SubGraph" and store in variable "labels"
labels = nx.get_node_attributes(SubGraph, 'b') 

# plot "SubGraph" using nx.draw_networkx() with "labels" as node labels 
nx.draw_networkx(SubGraph,labels = labels) 

Personalized Diagnosis

his milestone requires you to learn personalized predictors for a Covid-19 infection. To this end you will the combine the gradient descent algorithm for logistic regression with a network averaging method for aggregating local gradients computed for each individual. 

More formally, we assign each invidiual $i$ a linear classifier with weight vector $\mathbf{w}^{(i)}=\big(w^{(i)}_{1},\ldots,w^{(i)}_{6}\big)^{T}$ and intercept (bias) term $b^{(i)}$. Given an individual $i$ with features $\mathbf{x}^{(i)}$ (extracted from an audio recording) we diagnose a Covid-19 infection if $\mathbf{w}^{T} \mathbf{x}^{(i)} +b^{(i)} \geq0$. To learn the weight vector and  intercept term for the node $i$ that belongs to the component $\mathcal{C}$ of the contact network, we use a sufficient number of gradient descent steps
$$ \mathbf{w}^{(k+1)} = \mathbf{w}^{(k)} - \alpha \mathbf{g}^{(k)} \mbox{ with } \mathbf{g}^{(k)}= (1/|\mathcal{C}|) \sum_{j \in \mathcal{C}} \big(h\big(\big(\mathbf{w}^{(k)}\big)^{T} \mathbf{x}^{(j)}\big) - y^{(j)}\big) \mathbf{x}^{(j)} $$ 
and
$$ b^{(k+1)} = b^{(k)} - \alpha v^{(k)} \mbox{ with } v^{(k)}= (1/|\mathcal{C}|) \sum_{j \in \mathcal{C}} \big(h\big(\big(\mathbf{w}^{(k)}\big)^{T} \mathbf{x}^{(j)}\big) - y^{(j)}\big)  $$. 

We will estimate the gradients $\mathbf{g}^{(k)}$ and $v^{(k)}$ using the averaging algorithm that we used in Project 2 for computing the average infection rates.

In [None]:
# define sigmoid function
# helps us computer the probability of an individual being infected
# with Covid 19

def sigmoid(X, theta):
    '''
    Computes the sigmoid of the linear combination of X and theta.

    Parameters
    ----------
    X : numpy array of shape (n, m)

    theta : numpy array of shape (m,)
        The parameters of the logistic regression model.

    Returns
    -------
    numpy array of shape (n,)
        The sigmoid of the linear combination of X and theta.
    
    Examples
    --------
    >>> X = np.array([[1, 2, 3], [4, 5, 6]])
    >>> theta = np.array([1, 2, 3])
    >>> sigmoid(X, theta)
    array([0.99987661, 1.        ])
    '''
    # compute the linear combination of x and theta
    z = np.dot(X, theta[1:]) + theta[0]

    # compute the sigmoid of z
    return 1 / (1 + np.exp(-z))


In [None]:
# maps each node to a dictionary of its neighbors
weights_tmp_dic=nx.get_node_attributes(G,'w')

# make zeros based on the number of nodes and features
weights_tmp = np.zeros((nrnodes,nrfeatures))

# maps each node to an intercept value
intercept_tmp_dic=nx.get_node_attributes(G,'b')

# maps zero based on the number of nodes
intercept_tmp = np.zeros(nrnodes)

# maps each node to a feature vector
features_tmp_dic=nx.get_node_attributes(G,'x')

# make zeros based on the number of nodes and features
features_tmp = np.zeros((nrnodes,nrfeatures))

# maps each node to a label
label_tmp_dic=nx.get_node_attributes(G,'y')

# maps zero based on the number of nodes
label_tmp = np.zeros(nrnodes)

# loop over all nodes
for iternode in range(nrnodes):
      weights_tmp[iternode,:] = weights_tmp_dic[iternode]
      intercept_tmp[iternode] = intercept_tmp_dic[iternode]
      features_tmp[iternode,:] = features_tmp_dic[iternode]
      label_tmp[iternode] = label_tmp_dic[iternode]

# set step-size
alpha = 1/10

# copy weights and intercept to new variables
weights_old = weights_tmp.copy() 
intercept_old = intercept_tmp.copy()
gradient_tmp = np.zeros((nrnodes,nrfeatures+1)) # each row hold the gradient for intercept and weights 
gradient_old = np.zeros((nrnodes,nrfeatures+1)) 

# 50 iterations
nriters=50

# create "Metropolis-Hastings" weights and store them in numpy array `W_MH`
W_MH = np.zeros((nrnodes,nrnodes)) # create array for MH weights and init to all zeroes
# iterate over all edges in the contact network G
for edge in G.edges(): 
    node_a = edge[0] # first node of edge
    node_b = edge[1] # second node of edge
    # set weights W[node_a,node_b] and W[node_b,node_a] to 1/(max(degree(node_a),degree(node_b))+1)
    W_MH[node_a,node_b] = 1/(np.max([G.degree(node_a),G.degree(node_b)])+1) 

    # set weights W[node_a,node_b] and W[node_b,node_a] to 1/(max(degree(node_a),degree(node_b))+1)
    W_MH[node_b,node_a] = 1/(np.max([G.degree(node_a),G.degree(node_b)])+1)

# loop over all nodes in the contact network G
for nodedmy in G.nodes(): 
# set weights W[nodedmy,nodedmy] to 1 - sum of weights for all neighbors of nodedmy
    W_MH[nodedmy,nodedmy] = 1-np.sum(W_MH[nodedmy,:])
    
# set number of iterations for gradient descent to default value 200
nrlogregiters = 10

# main loop for the federated learning algorithm 
# each iteration amounts to network averaging of all local gradients 

for iterlogreg in range(nrlogregiters):
# compute gradients at each node 
    for iternode in range(nrnodes):
# stack weights and intercept into theta
        theta = np.hstack((intercept_tmp[iternode],weights_tmp[iternode]))
        # compute sgmoid function of predictor value w^T x
        hx = sigmoid(features_tmp[iternode], theta)
        # calculate error
        error = hx - label_tmp[iternode]
        # compute gradient for local loss function and store in gradient_tmp
        gradient_tmp[iternode,:] = np.hstack((error,error*features_tmp[iternode]))
          
    
    
# average gradients using nriters consensus iterations
    for iterdmy in range(nriters):
        # read in current values of "Rate" attributes into numpy array `graphsigold`
        gradient_old = gradient_tmp 
        # update estimate "gradient_tmp" by applying W_MH to current estimate
        gradient_tmp = np.dot(W_MH, gradient_old)
    
    # do a gradient descent step for intercept_tmp using step size alpha
    intercept_tmp -= alpha*gradient_tmp[:,0]
    # do a gradient descent step for weights_tmp using step size alpha
    weights_tmp -= alpha*gradient_tmp[:,1:]

    


# loop over all nodes in the contact network G store the weights in "weights_tmp" in the node attribute "weights"
# store the incepts in "intercept_tmp" in the node attribute "intercep"

for node_i in G.nodes(data=False): 
    G.node[node_i]['w'] = weights_tmp[node_i]
    G.node[node_i]['b'] = intercept_tmp[node_i]
  
    print("weights node %d :"%node_i,weights_tmp[node_i])

# summary of the project

In this project, we will build a contact network from the location data of the individuals. Then we will use the contact network to learn personalized predictors for a Covid-19 infection. To this end you will the combine the gradient descent algorithm for logistic regression with a network averaging method for aggregating local gradients computed for each individual.