## Code Details
Author: Rory Angus<br>
Created: 19NOV18<br>
Version: 0.1<br>
***
This code is to test a writing of data to a MongoDB. <br>
This is a proof of concept and the data is real. However, it does not bring all of it into Mongo, only the key fields.
This uses data that was extracted after the SSO data model was implemented at LE on the platform.
The data now is in three parts. The groups and their members, the users linked to the results as well as coaching/coachee relationship. The groups and their members as well as the users linked to the results.

Please note that the results from doing the survey can be more than two per journey. 

# Package Importing + Variable Setting

In [1]:
import pandas as pd
import numpy as np

import datetime

# mongo stuff
import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId
import bson

In [2]:
readLoc = "~/datasets/CLARA/190328_052400_LE_LivePlatform_ClaraUsersGroups.json"
# if true the code outputs to the notebook a whole of diagnostic data that is helpful when writing but not so much when running it for real
verbose = False
# first run will truncate the target database and reload it from scratch. Once delta updates have been implmented this needs adjusting
first_run = True

# Set display options

In [3]:
# further details found by running:
# pd.describe_option('display')
# set the values to show all of the columns etc.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', -1)  # or 199

# locals() # show all of the local environments

# Connect to Mongo DB

In [4]:
# create the connection to MongoDB
# define the location of the Mongo DB Server
# in this instance it is a local copy running on the dev machine. This is configurable at this point.
client = MongoClient('127.0.0.1', 27017)

# define what the database is called.
db = client.CLARA

# define the collection
raw_data_collection = db.raw_data_group_user

Command to clean the databzse if needed when running this code

In [5]:
# Delete the raw_data_collection - used for testing
if first_run:
    raw_data_collection.drop()

# Functions Definitions

In [6]:
# The web framework gets post_id from the URL and passes it as a string
def get(post_id):
    # Convert from string to ObjectId:
    document = client.db.collection.find_one({'_id': ObjectId(post_id)})

# Place Group Memebership Data from CSV File into Mongo

In [7]:
#import the student file

claraDf = pd.read_json(readLoc, orient='records')

In [8]:
if verbose:

    # count columns and rows
    print("Number of columns are " + str(len(claraDf.columns)))
    print("Number of rows are " + str(len(claraDf.index)))
    print()

    # output the shape of the dataframe
    print("The shape of the data frame is " + str(claraDf.shape))
    print()

    # output the column names
    print("The column names of the data frame are: ")
    print(*claraDf, sep='\n')
    print()

    # output the column names and datatypes
    print("The datatypes of the data frame are: ")
    print(claraDf.dtypes)
    print()

Number of columns are 8
Number of rows are 463

The shape of the data frame is (463, 8)

The column names of the data frame are: 
addedAt
description
displayName
groupId
orgUserId
removedAt
type
userId

The datatypes of the data frame are: 
addedAt        object
description    object
displayName    object
groupId        int64 
orgUserId      int64 
removedAt      object
type           object
userId         object
dtype: object



In [9]:
# mongo is not able to store integers so convert them to strings

claraDf['orgUserId'] = claraDf['orgUserId'].astype(str)
claraDf['groupId'] = claraDf['groupId'].astype(str)

# output the column names and datatypes
if verbose:
    print("The datatypes of the data frame are: ")
    print(claraDf.dtypes)
    print()

The datatypes of the data frame are: 
addedAt        object
description    object
displayName    object
groupId        object
orgUserId      object
removedAt      object
type           object
userId         object
dtype: object



In [11]:
# Loop through the data frame and build a list
# the list will be used for a bulk update of MongoDB

# define the list to hold the data
clara_row = []

# loop through dataframe and create each item in the list
for index, row in claraDf.iterrows():
    clara_row.insert(
        index, {
            "userGroup_index": index,
            "groupId": claraDf['groupId'].iloc[index],
            "groupType": claraDf['type'].iloc[index],
            "groupName": claraDf['description'].iloc[index],
            "addedAt": claraDf['addedAt'].iloc[index],
            "removedAt": claraDf['removedAt'].iloc[index],
            "userDisplayName": claraDf['displayName'].iloc[index],
            "userId": claraDf['userId'].iloc[index],
            "orgUserId": claraDf['orgUserId'].iloc[index],
            "insertdate": datetime.datetime.utcnow()
        })

if verbose:
    print(clara_row[5])
    


{'userGroup_index': 5, 'groupId': '337', 'groupType': 'Role in school', 'groupName': 'UTS Students', 'addedAt': '2018-05-08 15:03:12', 'removedAt': None, 'userDisplayName': 'Ahmed Tawfik', 'userId': 'e69e9c49-d162-416c-a6fd-d2770583e025', 'orgUserId': '3', 'insertdate': datetime.datetime(2019, 3, 28, 6, 34, 19, 960689)}


In [12]:
# bulk update the mongo database

raw_data_collection.insert_many(clara_row)

if verbose:
    print(raw_data_collection.inserted_ids)

Collection(Database(MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True), 'CLARA'), 'raw_data_group_user.inserted_ids')


## Create Index

In [14]:
# Only create the indexes onthe first run through
if first_run:
    # put the restult into a list so it can be looked at later.
    result = []

    # Create some indexes
    result.append(
        raw_data_collection.create_index(
            [('userGroup_index', pymongo.ASCENDING)], unique=False))
    result.append(
        raw_data_collection.create_index([('groupId', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index([('groupType', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index([('groupName', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index(
            [('userDisplayName', pymongo.ASCENDING)], unique=False))
    result.append(
        raw_data_collection.create_index([('userId', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index([('orgUserId', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index([('insertdate', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index([('addedAt', pymongo.ASCENDING)],
                                         unique=False))
    result.append(
        raw_data_collection.create_index([('removedAt', pymongo.ASCENDING)],
                                         unique=False))

    if verbose:
        print(result)

['userGroup_index_1', 'groupId_1', 'groupType_1', 'groupName_1', 'userDisplayName_1', 'userId_1', 'orgUserId_1', 'insertdate_1', 'addedAt_1', 'removedAt_1']
