## Code Details
Author: Rory Angus<br>
Created: 19NOV18<br>
Version: 0.1<br>
***
This code tests writing data to a MongoDB database. <br>
This is a proof of concept and the data is real. However, it does not bring all of it into Mongo, only the key fields.
This uses data that was extracted after the SSO data model was implemented at LE on the platform.
The data now is in two parts. The groups and their members as well as the users linked to the results.
Please note that a single journey can have more than two sets of survey results.

# Package Importing + Variable Setting

In [1]:
import pandas as pd
import numpy as np

import datetime

# mongo stuff
import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId
import bson

# json stuff
import json

In [2]:
# Path of the JSON export to read. This needs to be manually updated for each new extract.
readLoc = "~/datasets/CLARA/190328_052400_LE_LivePlatform_IndividualResults.json"
# When True, the notebook prints a lot of diagnostic output that is helpful while writing the code but not when running it for real.
verbose = False
# On the first run the target collection is dropped, reloaded from scratch and indexed. Once delta updates have been implemented this needs adjusting.
first_run = True

# Set display options

In [3]:
# further details found by running:
# pd.describe_option('display')
# Show every column and every row, and never truncate the contents of a cell.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# None means "no limit" (pandas >= 1.0). The old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)  # or 199

# locals() # show all of the local environments

# Connect to Mongo DB

In [4]:
# Set up the MongoDB client.
# The server here is a local copy running on the dev machine; change the host
# and port below to point at a different server.
client = MongoClient(host='127.0.0.1', port=27017)

# select the database by name
db = client['CLARA']

# select the collection that receives the raw user results
raw_data_collection = db['raw_data_user_results']

Command to clean the database, if needed, when running this code

In [5]:
# Drop the raw_data_collection so that a first run starts from an empty collection - used for testing.
# This is destructive: it only happens when first_run is True.
if first_run:
    raw_data_collection.drop()

# Functions Definitions

In [6]:
# Look up a single stored document by its id, supplied as a string
# (for example when a web framework takes it from a URL).
def get(post_id):
    """Return the raw-data document whose ``_id`` matches *post_id*.

    Args:
        post_id: The document id as a string; it is converted to an
            ObjectId before querying.

    Returns:
        The matching document, or None when no document has that id.

    Raises:
        bson.errors.InvalidId: If *post_id* is not a valid ObjectId value.
    """
    # Query the collection this notebook loads. ``client.db.collection`` would
    # address a database literally named "db" and a collection named
    # "collection", and the looked-up document was never returned.
    return raw_data_collection.find_one({'_id': ObjectId(post_id)})

# Place CLARA Results from JSON File into Mongo

In [7]:
# Load the exported results file into a data frame.
# The file is expected to be a JSON list of records.
claraDf = pd.read_json(path_or_buf=readLoc, orient='records')

# show the loaded frame only when diagnostics are switched on
if verbose:
    display(claraDf)

In [8]:
# Diagnostic summary of the loaded data frame; printed only in verbose mode.
if verbose:
    row_total, column_total = claraDf.shape

    # size of the frame, columns first
    print("Number of columns are " + str(column_total))
    print("Number of rows are " + str(row_total))
    print()

    # the shape as a (rows, columns) tuple
    print("The shape of the data frame is " + str(claraDf.shape))
    print()

    # one column name per line
    print("The column names of the data frame are: ")
    print('\n'.join(str(column_name) for column_name in claraDf.columns))
    print()

    # the data type of every column
    print("The datatypes of the data frame are: ")
    print(claraDf.dtypes)
    print()

### To Do
The index needs to be replaced with the unique identifier for the student.

In [9]:
# Build one document per data-frame row.
# The resulting list is used for a single bulk insert into MongoDB.

# The integer ids (journeyId, claraId) are stored as strings because Mongo
# cannot handle the int64 datatype, and converting to int32 while loading the
# rows did not work either, so string is the fall back position.

# Columns copied into every document, in the order they are stored.
clara_fields = [
    "userId",
    "nameId",
    "primaryEmail",
    "journeyId",
    "journeyTitle",
    "journeyPurpose",
    "journeyGoal",
    "journeyCreatedAt",
    "claraId",
    "claraResultsJourneyStep",
    "claraResultsCreatedAt",
    "claraResultCompletedAt",
    "claraResult1",
    "claraResult2",
    "claraResult3",
    "claraResult4",
    "claraResult5",
    "claraResult6",
    "claraResult7",
    "claraResult8",
]
# Columns whose values are converted to strings before storing.
clara_string_fields = {"journeyId", "claraId"}

# Look each column up once instead of once per row.
clara_columns = {field: claraDf[field] for field in clara_fields}

# define the list to hold the data
clara_row = []

# Walk the rows by position and keep the index label separately: labels are
# only valid as positions (for .iloc and for the list) when the frame has the
# default 0..n-1 index.
for position, label in enumerate(claraDf.index):
    document = {"rowIndex": label}
    for field in clara_fields:
        value = clara_columns[field].iloc[position]
        # str() gives the same text as the numpy-only .astype('str') and also
        # works when the value is a plain Python object
        document[field] = str(value) if field in clara_string_fields else value
    # timezone-aware replacement for the deprecated datetime.utcnow()
    document["insertdate"] = datetime.datetime.now(datetime.timezone.utc)
    clara_row.append(document)

# an empty frame produces no documents, so there is no first one to show
if verbose and clara_row:
    print(clara_row[0])

In [10]:
# bulk insert the documents into the database

# insert_many returns an InsertManyResult. The generated ids are on that
# result; asking the collection for "inserted_ids" only yields a
# sub-collection object of that name.
insert_result = raw_data_collection.insert_many(clara_row)

if verbose:
    print(insert_result.inserted_ids)

## Create Index

In [11]:
# Only create the indexes on the first run through
if first_run:
    # Fields that each get their own ascending, non-unique index.
    # The names have to match the keys written to the documents, so
    # "rowIndex" and "claraResultsJourneyStep" are used here; "index" and
    # "claraResultsStep" are not keys of the stored documents.
    indexed_fields = [
        'rowIndex',
        'userId',
        'nameId',
        'primaryEmail',
        'journeyId',
        'journeyCreatedAt',
        'claraId',
        'claraResultsCreatedAt',
        'claraResultCompletedAt',
        'claraResultsJourneyStep',
        'insertdate',
    ]

    # put the result (the name of each created index) into a list so it can be
    # looked at later.
    result = [
        raw_data_collection.create_index([(field, pymongo.ASCENDING)],
                                         unique=False)
        for field in indexed_fields
    ]

    if verbose:
        print(result)