---
# Data modeling, importing, Indexing and Querying Crane datasets

Date: 31-01-2020 <br>
Concept version: 1.0 <br>
Author: Pieter Lems  <br>

© Copyright 2019 Ministerie van Defensie

This notebook will provide information on creating data models for MongoDB.<br>
To create the data models we are going to use Python and MongoEngine. The notebook also shows how to import the data into the MongoDB datastores.<br>


## Contents of notebook
- Importing the required modules 
- Reading the datasets
- Validating the datasets
- Connecting to the database
    - Create Docker MongoDB database (if needed)
    - Connect
- Creating the model
- Loading the data using the model
    - Creating the import functions
    - Load the data
- Querying the data (pre-indexing)
- Indexing the data
- Querying the data (post-indexing)
- Loading GeoJSON data (Not needed but implemented to show how it's done)

### The datasets used in this notebook can be found in the folder ("../Data/Crane_JSON/")

## Importing the required modules

In [1]:
import json
from datetime import datetime
from io import StringIO
from urllib.parse import quote_plus

import pandas as pd
from mongoengine import *
from pymongo import GEOSPHERE, MongoClient

##  Reading the datasets

In [2]:
# One JSON export per crane; each file is read into its own DataFrame.
# The file suffix (SW / GE / LT) matches the country code passed to
# load_data further down in this notebook.
# NOTE: the variable is spelled "Agentha" (the file is "Agnetha"); the
# spelling is kept because the load cell refers to it by this name.
Agentha = pd.read_json('../Data/Crane_JSON/Agnetha-SW.json')
Frida = pd.read_json('../Data/Crane_JSON/Frida-SW.json')
Cajsa = pd.read_json('../Data/Crane_JSON/Cajsa-SW.json')

Nena = pd.read_json('../Data/Crane_JSON/Nena-GE.json')
Lotta = pd.read_json('../Data/Crane_JSON/Lotta-GE.json')

Lita = pd.read_json('../Data/Crane_JSON/Lita-LT.json')

###  Connecting to the database

#### Create Docker container
Uncomment the next line if you don't have a MongoDB Docker container
and you want to import the data into a Docker container.

This command will download a MongoDB docker image and run the container on port 27017 (localhost:27017)

In [3]:
#!docker run -d -p 27017:27017 mongo:latest

#### Connect to a database called: "Crane_Database"

In [4]:
# Open the default mongoengine connection to the database "Crane_Database".
# No host is given; the output below shows the client pointing at localhost:27017.
connect('Crane_Database')

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

---

## Creating the model

In [5]:
# Tracker document: one record per crane, holding the dataset-level
# information that load_data derives from the imported DataFrame.
class Tracker(Document):
    
    # Name of the study (taken from the 'study-name' column by load_data).
    study_name = StringField()
    
    # Name of the bird, in Latin ('individual-taxon-canonical-name' column).
    individual_taxon_canonical_name = StringField()
    
    # Id of the crane ('individual-local-identifier' column).
    individual_local_identifier = IntField()
    
    # Start date of the study: load_data stores the timestamp of the first row.
    start_date = DateTimeField()
    
    # End date of the study: load_data stores the timestamp of the last row.
    end_date = DateTimeField()

    # Name of the crane (passed to load_data by the caller).
    name = StringField()
    
    # Number of transmissions related to the tracker (row count of the dataset).
    transmission_Count= IntField()
    
    
    
# Geometry document: embedded in Transmission, holds where the
# transmission was sent from.
class Geometry(EmbeddedDocument):
 
    # Coordinates of the transmission. load_data passes them as
    # [longitude, latitude] (longitude first).
    coord = PointField()
    
    # Altitude of the transmission. load_data fills it from
    # 'height-above-ellipsoid' for country "sw" and from 'height-above-msl' otherwise.
    alt = FloatField()

    
# Speed document: embedded in Transmission, holds movement-related values.
class Speed(EmbeddedDocument):
    
    # Speed of the crane ('ground-speed' column).
    ground_speed = FloatField()
    
    # Heading of the crane in degrees.
    # NOTE(review): load_data never sets this field, so it stays empty on import.
    heading = IntField()
    
# TrackerMetadata document: embedded in Transmission, holds the state of
# the tracker at the moment of the transmission.
class TrackerMetadata(EmbeddedDocument):
    
    # Is the tracker still visible or not? ('visible' column)
    visible = BooleanField()
    
    # Type of sensor used in the tracker ('sensor-type' column).
    sensor_type = StringField()
    
    # Voltage level of the tracker ('tag-voltage' column).
    tag_voltage = FloatField()
    
    
# Transmission document: one record per row of a crane dataset. It embeds
# the geometry, speed and metadata documents and references its Tracker.
class Transmission(Document):
    
    # Identifier of the transmission ('event-id' column).
    event_id = IntField()
    
    # Timestamp of when the transmission was sent.
    timestamp = DateTimeField()
    
    # Embedded geometry of the transmission (coordinates and altitude).
    geometry = EmbeddedDocumentField(Geometry)
    
    # Embedded speed-related data of the transmission.
    speed = EmbeddedDocumentField(Speed)
    
    # Embedded tracker metadata of the transmission.
    metadata = EmbeddedDocumentField(TrackerMetadata)
    
    # Reference to the tracker the transmission belongs to; the queries in
    # this notebook filter on it by tracker id.
    tracker = ReferenceField(Tracker)
    

---
## Loading the data using the model

###  Creating the import function

In [6]:
def load_data(df,name,country):
    """Import one crane dataset: save its Tracker, then bulk insert its transmissions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transmission. Columns read: 'timestamp', 'study-name',
        'individual-taxon-canonical-name', 'individual-local-identifier',
        'location-long', 'location-lat', 'visible', 'sensor-type',
        'tag-voltage', 'ground-speed', 'event-id' and the altitude column
        selected by ``country``.
    name : str
        Name of the crane; stored on the Tracker and used in progress messages.
    country : str
        "sw" reads the altitude from 'height-above-ellipsoid'; any other
        value reads it from 'height-above-msl'.

    Raises
    ------
    ValueError
        If ``df`` has no rows (there is nothing to build a Tracker from).
    """
    if df.empty:
        raise ValueError('Cannot load an empty dataset for: ' + str(name))

    # Metadata for the tracker. Positional access (iloc) is used so the frame
    # does not need a 0..n-1 index.
    # NOTE: first/last row are taken as start/end, which assumes the rows are
    # ordered by timestamp.
    timestamps = df['timestamp']
    start_Date = timestamps.iloc[0]
    end_Date = timestamps.iloc[-1]
    transmission_Count = len(df.index)

    # Create a new tracker, this is only done once per dataset.
    tracker = Tracker(
        study_name = df['study-name'].iloc[0],
        individual_taxon_canonical_name = df['individual-taxon-canonical-name'].iloc[0],
        individual_local_identifier = df['individual-local-identifier'].iloc[0],
        start_date = start_Date,
        end_date = end_Date,
        name = name,
        transmission_Count = transmission_Count)

    # Save the tracker to the database so the transmissions can reference it.
    tracker.save()

    # The Swedish sets report altitude above the ellipsoid, the remaining
    # sets above mean sea level; everything else is built identically.
    if country == "sw":
        altitude_column = 'height-above-ellipsoid'
    else:
        altitude_column = 'height-above-msl'

    # Transmissions are collected in a list that is handed to the MongoDB
    # bulk insert in one go.
    transmissions = []

    # Print when list appending process starts.
    print('Start appending transmissions to list from: ' + str(name) )

    for _, row in df.iterrows():

        # NOTE: To use geometry queries the longitude value has to come first.
        geometry = Geometry(coord = [row['location-long'],row['location-lat']],
                            alt = row[altitude_column])

        metadata = TrackerMetadata(visible = row['visible'],
                                   sensor_type = row['sensor-type'],
                                   tag_voltage = row['tag-voltage'])

        speed = Speed(ground_speed = row['ground-speed'])

        transmissions.append(Transmission(event_id = row['event-id'],
                                          timestamp = row['timestamp'],
                                          geometry = geometry,
                                          speed = speed,
                                          metadata = metadata,
                                          tracker = tracker))

    # Print when list appending is done.
    print('Bulk inserting: '+ str(transmission_Count) + ' transmissions from: ' + str(name) )

    # Bulk insert the populated transmissions list. load_bulk=False: the
    # inserted documents are not needed afterwards, so do not fetch them
    # all back from the database.
    Transmission.objects.insert(transmissions,load_bulk=False)

    # Print if insert is successful.
    print("Done inserting "+ str(len(df.index)) + " transmissions")

### Loading the data using the load_data function.

In [7]:
# Import every crane dataset in turn: (DataFrame, crane name, country code).
# Only 'sw' changes which altitude column load_data reads.
for crane_frame, crane_name, country_code in (
    (Lita, "Lita", 'lt'),
    (Nena, "Nena", 'ge'),
    (Lotta, "Lotta", 'ge'),
    (Agentha, "Agnetha", 'sw'),
    (Frida, "Frida", 'sw'),
    (Cajsa, "Cajsa", 'sw'),
):
    load_data(crane_frame, crane_name, country_code)

Start appending transmissions to list from: Lita
Bulk inserting: 254228 transmissions from: Lita
Done inserting 254228 transmissions
Start appending transmissions to list from: Nena
Bulk inserting: 11626 transmissions from: Nena
Done inserting 11626 transmissions
Start appending transmissions to list from: Lotta
Bulk inserting: 29934 transmissions from: Lotta
Done inserting 29934 transmissions
Start appending transmissions to list from: Agnetha
Bulk inserting: 44534 transmissions from: Agnetha
Done inserting 44534 transmissions
Start appending transmissions to list from: Frida
Bulk inserting: 123805 transmissions from: Frida
Done inserting 123805 transmissions
Start appending transmissions to list from: Cajsa
Bulk inserting: 67887 transmissions from: Cajsa
Done inserting 67887 transmissions


---
### Querying the data pre-index

First we will run a couple of queries before we create the indexes on the database. By doing this, we can compare the time it takes to return a certain amount of data with and without an indexed database. To find information related to the execution of the query add .explain() behind the query.

---

#### Query to find ID of crane Frida 

In [19]:
# Look up the tracker named 'Frida', fetching only its name and id, and
# return the result as a JSON string.
Tracker.objects(name = 'Frida').only('name','id').to_json()

'[{"_id": {"$oid": "5e1dbcae507872e91a9d5313"}, "name": "Frida"}]'

#### Query to check executing speed of transmissions related to Frida

In [None]:
# explain() asks for the execution details of the query. The id is the one
# returned for Frida by the lookup above; it is specific to the database
# this notebook was run against, so replace it with your own.
Transmission.objects(tracker='5e1dbcae507872e91a9d5313').explain()

#### Query to find ID of crane Lotta 

In [None]:
# Look up the tracker named 'Lotta', fetching only its name and id, and
# return the result as a JSON string.
Tracker.objects(name = 'Lotta').only('name','id').to_json()

#### Query to return all items related to Crane: Lotta

In [None]:
# All transmissions that reference one tracker.
# NOTE(review): presumably the id returned by the Lotta lookup above (its
# output is not shown); replace it with the id from your own database.
Transmission.objects(tracker='5e1dbd6e507872e91aa06d4c').explain()

###### It took 143 milliseconds to return 29934 results using a COLLSCAN (Collection scan)

#### Query to return all items related to Crane: Lotta, between 2018-06-01 and 2018-09-01

In [None]:
# Transmissions of one tracker inside a time window (both bounds inclusive).
# The tracker id is the same one used by the other Lotta queries in this
# notebook, so the pre- and post-index timings describe the same query.
# Replace it with the id from your own database.
Transmission.objects(Q(tracker='5e1dbd6e507872e91aa06d4c')&
                     Q(timestamp__gte=datetime(2018,6,1)) &
                     Q(timestamp__lte=datetime(2018,9,1))).explain()

##### It took 80 milliseconds to return 110 results using a COLLSCAN (Collection scan)

#### Query to return all items in a predefined bounding box (The Netherlands in this case)
Bounds of the box can be found using the following website: https://www.keene.edu/campus/maps/tool/

In [None]:
# Transmissions whose coordinates fall inside a box given by two corner
# points. Each point is written longitude first, matching how load_data
# stores the coordinates.
Transmission.objects(geometry__coord__geo_within_box=[
    (3.2299835,50.7920471),(7.4926788,53.5729383)]).explain()

##### It took 744 milliseconds to return 489 results using a COLLSCAN (Collection scan)

#### Query to return all items in a predefined polygon (The Netherlands in this case)
Bounds of the polygon can be found using the following website: https://www.keene.edu/campus/maps/tool/

In [None]:
# Transmissions whose coordinates fall inside a polygon. The polygon is a
# single ring of [longitude, latitude] points; the last point repeats the
# first one so the ring is closed.
Transmission.objects(geometry__coord__geo_within=[[
    [3.2409668,52.2395743],[3.8781738,51.1672889],
    [5.1443481,51.9950282],[3.2409668,52.2395743]]]).explain()

##### It took 720 milliseconds to return 131 results using a COLLSCAN (Collection scan)

### Indexing the database

##### There are 2 ways to create indexes on data. 
- Create an index when modeling the data.<br>
To create an index while creating the data model, we have to add a meta field to the document we want to create an index on. For example: if we want to create an index on the altitude field in the geometry document, we add the following meta field to our geometry document:

In [None]:
# Example only: the Geometry model from above with an index declared in its
# meta dictionary.
# NOTE(review): Geometry is an EmbeddedDocument and has no collection of its
# own; mongoengine presumably only honours 'collection' and 'indexes' on a
# top-level Document (e.g. an index on 'geometry.alt' declared in
# Transmission's meta). Confirm before relying on this example.
class Geometry(EmbeddedDocument):
 
    # Coordinates of the transmission.
    coord = PointField()
    
    # Altitude of the transmission.
    alt = FloatField()
    
    # 'indexes' lists the index definitions; here a single index on 'alt'.
    meta = {
        'collection': 'altitude',
        'indexes': [
          {'fields': ['alt']}
        ]
    }

- Create indexes after modeling the data <br>
  We can also create the indexes after we created the datamodel. We are going to use this way to create indexes below. For example: if we want to create an index on the altitude field after creating the data model we would run the following command: <br>
  Transmission.create_index(("geometry.alt"))
  
- Create indexes using pymongo
    add 2d index to coord field db.signals.ensureIndex({"geometry.coord.coordinates":"2d"});


  

##### We want to create 4 indexes 
- 2D Sphere index
  This index will be used to query the coordinates of the crane
  (This was automatically done when assigning PointField() to the coordinates entry, when creating the database model)
- 2D index
  We need this index to be able to find coordinates in a certain box 
- timestamp index 
  We need this index because we will query on the timestamp a lot of times
- tracker index (in the transmission collection)
  We need this index because we will query to find transmissions per tracker using the tracker id

#### Create an index on the tracker field in the transmission collection

In [8]:
# Index the tracker reference so transmissions can be found per tracker
# without scanning the whole collection. ("tracker") is just the string
# "tracker", so it is passed directly.
Transmission.create_index("tracker")

'tracker_1'

#### Create an index on the timestamp field in the transmission collection

In [9]:
# Index the timestamp field, which the date-range queries filter on.
# ("timestamp") is just the string "timestamp", so it is passed directly.
Transmission.create_index("timestamp")

'timestamp_1'

#### Create an 2D index on the coordinates field in the transmission collection

In [10]:
# Index the embedded coordinates field.
# NOTE(review): the returned name 'geometry.coord_1' suggests this creates a
# regular ascending index rather than a "2d" index as the heading says;
# confirm which index type the box query actually needs.
Transmission.create_index("geometry.coord")

'geometry.coord_1'

---
### Querying the data post-index

#### Query to check executing speed after indexing 

In [None]:
# Same per-tracker query as in the pre-index section, now answered with the
# tracker index. The id is the one the Frida lookup returned earlier in this
# notebook (the previous value did not match any id shown in the notebook).
# Replace it with the id from your own database.
Transmission.objects(tracker='5e1dbcae507872e91a9d5313').explain()

#### Query to return all items related to Crane: Lotta

In [None]:
# Same Lotta query as in the pre-index section, repeated after the tracker
# index was created so the two explain() results can be compared.
Transmission.objects(tracker='5e1dbd6e507872e91aa06d4c').explain()

###### It took 30 milliseconds to return 29934 results using an IXSCAN (Index scan)

#### Query to return all items in a predefined bounding box (The Netherlands in this case)
Bounds of the box can be found using the following website: https://www.keene.edu/campus/maps/tool/

In [None]:
# Same box query as in the pre-index section (two corner points, longitude
# first), repeated after indexing for comparison.
Transmission.objects(geometry__coord__geo_within_box=[
      (3.2299835,50.7920471),(7.4926788,53.5729383)]).explain()

##### It took 20 milliseconds to return 489 results using an IXSCAN (Index scan)

#### Query to return all items related to Crane: Lotta, between 2018-06-01 and 2018-09-01

In [None]:
# Transmissions of one tracker inside a time window (both bounds inclusive),
# repeated after the tracker and timestamp indexes were created.
Transmission.objects(Q(tracker='5e1dbd6e507872e91aa06d4c')&
                     Q(timestamp__gte=datetime(2018,6,1)) &
                     Q(timestamp__lte=datetime(2018,9,1))).explain()

##### It took 0 milliseconds to return 110 results using an IXSCAN (Index scan)

#### Query to return all items in a predefined polygon (The Netherlands in this case)
Bounds of the polygon can be found using the following website: https://www.keene.edu/campus/maps/tool/

In [None]:
# Same polygon query as in the pre-index section (one closed ring of
# [longitude, latitude] points), repeated after indexing for comparison.
Transmission.objects(geometry__coord__geo_within=[[
    [3.2409668,52.2395743],[3.8781738,51.1672889],
    [5.1443481,51.9950282],[3.2409668,52.2395743]]]).explain()

##### It took 3 milliseconds to return 131 results using an IXSCAN (Index scan)

---
## More Queries

In [None]:
# Select all trackers by study name
# Parameters:
# - study_name: text that must occur in the tracker's study name
#   (case-sensitive substring match)
# Returns: a pandas DataFrame with one row per matching tracker

def select_Tracker_by_name(study_name):
    result = Tracker.objects(study_name__contains=study_name).to_json()
    # Newer pandas versions deprecate passing a literal JSON string to
    # read_json, so the text is wrapped in a file-like object.
    return pd.read_json(StringIO(result))


In [None]:
# All transmissions between two date time groups (both bounds inclusive)
# Parameters: 
# - dtg_1: start of the period (date time group 1)
# - dtg_2: end of the period (date time group 2)
# Returns: a pandas DataFrame with one row per matching transmission

def transmissions_between_dtg(dtg_1,dtg_2):
    result = Transmission.objects(Q(timestamp__gte=dtg_1) & 
                                  Q(timestamp__lte=dtg_2)).to_json()
    # Newer pandas versions deprecate passing a literal JSON string to
    # read_json, so the text is wrapped in a file-like object.
    return pd.read_json(StringIO(result))


In [None]:
# Select all transmissions in a predefined sphere (circle on the globe)
# Parameters (note the order: latitude first, then longitude):
# - lat: latitude of the centre
# - lon: longitude of the centre
# - radius: radius of the circle; MongoDB's $centerSphere expects radians
#   (distance divided by the earth's radius)
# Returns: a pandas DataFrame with one row per matching transmission

def transmissions_in_sphere(lat,lon,radius):
    # The centre is handed over longitude first, matching the stored coordinates.
    result = Transmission.objects(geometry__coord__geo_within_sphere=[(lon,lat),radius]).to_json()
    # Newer pandas versions deprecate passing a literal JSON string to
    # read_json, so the text is wrapped in a file-like object.
    return pd.read_json(StringIO(result))

In [None]:
# Select all transmissions in a predefined polygon
# Use https://www.keene.edu/campus/maps/tool/ to find the desired polygon.
# Parameters (each point is [longitude, latitude]):
# - p1: point 1
# - p2: point 2
# - p3: point 3
# - p4: point 4
# A polygon ring has to end on the point it started with. When p4 repeats
# p1 the points are used as given (a triangle); otherwise p1 is appended so
# four distinct corners form a closed quadrilateral.
# Returns: a pandas DataFrame with one row per matching transmission
def select_transmissions_in_polygone(p1,p2,p3,p4):
    ring = [p1, p2, p3, p4]
    if list(p1) != list(p4):
        ring.append(p1)
    result = Transmission.objects(geometry__coord__geo_within=[ring]).to_json()
    # Newer pandas versions deprecate passing a literal JSON string to
    # read_json, so the text is wrapped in a file-like object.
    return pd.read_json(StringIO(result))

In [None]:
# Select all transmissions in a predefined box
# Use https://www.keene.edu/campus/maps/tool/ to find the desired box.
# Parameters (each point is (longitude, latitude)):
# - p1: <bottom left coordinates>
# - p2: <upper right coordinates>
# Returns: a pandas DataFrame with one row per matching transmission

def select_transmissions_in_box(p1,p2):
    result = Transmission.objects(geometry__coord__geo_within_box=[p1,p2]).to_json()
    # Newer pandas versions deprecate passing a literal JSON string to
    # read_json, so the text is wrapped in a file-like object.
    return pd.read_json(StringIO(result))

---
## Load GeoJSON Data

This is implemented just to show how it's done.

---

In [None]:
# Define the file which we want to load.
inputfile = "../Data/Crane_GeoJSON/20181003_Dataset_SV_GPS_Crane_9381_STAW_Crane_RRW-BuGBk_Frida.json"

# Define the database in which the data will be loaded.
to_database = 'GeoJSON_Database'

# Define the collection in which the data will be loaded.
to_collection =  'Transmissions'

# Define the server to which we will connect.
to_server = 'localhost'

# Define the port the server is running on (kept as text because it is
# concatenated into the connection string).
to_port = '27017'

# Create the MongoDB connection string
uri = 'mongodb://' + to_server + ':' + to_port +'/'

# Set user to False (if no user is needed)
# Set to the username if authentication is required
db_user = False

# If authentication is required, use the following code
if db_user:
    db_password = 'Your password'
    # User name and password are percent-escaped so characters such as
    # ':', '/' or '@' cannot break the connection string.
    uri = ('mongodb://' + quote_plus(db_user) + ':' + quote_plus(db_password)
           + '@' + to_server + ':' + to_port + '/' + to_database)

# Read the geojson file
with open(inputfile, 'r', encoding='utf-8') as f:
    geojson = json.load(f)

        
# Function for loading GeoJSON in MongoDB without a model
# Parameter 1 = GeoJSON to insert: path of the file, or the already parsed
#               GeoJSON dictionary
# Parameter 2 = Collection to insert to 
# Parameter 3 = Database to insert to 
# Parameter 4 = Server the database is running on (not used, see uri)
# Parameter 5 = Port server is running on (not used, see uri)
# Parameter 6 = MongoDB connection string 

def load_geojson(inputfile, to_collection, to_database,
                 to_server, to_port, uri):
    """Insert every feature of a GeoJSON document into a MongoDB collection.

    The connection is made with ``uri`` only; ``to_server`` and ``to_port``
    are accepted for compatibility with existing calls. A 2dsphere index is
    created on the ``geometry`` key before the features are inserted.
    Returns nothing; a message is printed when the function is done.
    """
    # Use the argument instead of a notebook-level variable: accept parsed
    # GeoJSON as-is, otherwise treat the argument as a path and read it.
    if isinstance(inputfile, dict):
        data = inputfile
    else:
        with open(inputfile, 'r', encoding='utf-8') as f:
            data = json.load(f)
    features = data['features']

    client = MongoClient(uri)
    try:
        collection = client[to_database][to_collection]

        # Create a MongoDB index on the geometry feature.
        # More info on indexing can be found in the cookbook:
        # "Data modeling in MongoDB"
        collection.create_index([("geometry", GEOSPHERE)])

        # Convert datetime to valid format if needed, e.g. per feature:
        #   feature['properties']['timestamp'] = datetime.strptime(
        #       feature['properties']['timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ')

        # Unordered bulk insert of all features. insert_many replaces the
        # initialize_unordered_bulk_op API that newer PyMongo versions no
        # longer provide; it refuses an empty list, hence the check.
        if features:
            collection.insert_many(features, ordered=False)
    finally:
        # Always release the connection, also when the insert fails.
        client.close()

    # Print when data is inserted
    print("Features successully inserted")
    

In [None]:
# Run the import with the settings defined in the setup cell above.
load_geojson(inputfile, to_collection, to_database, to_server, to_port, uri)