## Build a contact network

# Download dataset

In [None]:
#!wget https://lp-prod-resources.s3.amazonaws.com/628/66549/2021-06-25-19-30-14/PeopleLocations.csv

In [27]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn import preprocessing

In [2]:
# read in csv to pd.DataFrame
df = pd.read_csv("PeopleLocations.csv", sep = ";", 
                 parse_dates=['Date'], dtype={'id': np.str_, 'Lat': np.float32, 'Lon': np.float32})

In [3]:
# convert type to category
df["Covid19"] = df["Covid19"].astype("category")

In [4]:
# check the dtypes
df.dtypes

ID                 object
Lat               float32
Lon               float32
Date       datetime64[ns]
Time               object
Covid19          category
dtype: object

In [5]:
# rename the first column 
df.rename(columns={"ID": "IDcol"}, inplace=True)

In [18]:
# a bit of the data
df.head(10)

Unnamed: 0,IDcol,Lat,Lon,Date,Time,Covid19
0,Person1,60.18539,25.009689,2021-09-06,13:52:09,n
1,Person2,60.185387,25.009678,2021-09-06,13:52:09,n
2,Person3,60.18539,25.009695,2021-09-06,13:52:09,n
3,Person4,60.18539,25.009689,2021-09-06,13:52:09,y
4,Person5,60.185387,25.009672,2021-09-06,13:52:09,n
5,Person6,60.185394,25.009706,2021-09-06,13:52:09,n
6,Person7,60.185383,25.009668,2021-09-06,13:52:09,y
7,Person8,60.185387,25.009686,2021-09-06,13:52:09,n
8,Person9,60.185379,25.009634,2021-09-06,13:52:09,n
9,Person10,60.185387,25.009678,2021-09-06,13:52:09,n


In [26]:
# determine a list of different individuals for which there is at least one record in the csv file 
dups = df[df.IDcol.duplicated()]

# count the number of different individuals. this will be the number of nodes in the contace network 
no_of_nodes = df.shape[0]


f"They are no duplicates in the dataset. However, {no_of_nodes} is."

'They are no duplicates in the dataset. However, 300 is.'

In [28]:
# preprocessing the dataframe: Covid19 column
le = preprocessing.LabelEncoder()
le.fit_transform(df.Covid19)

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0])

In [7]:
# create a graph object
G = nx.Graph()

In [13]:
G.add_nodes_from({df['IDcol']})

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [12]:
G.nodes()

NodeView(('names',))