## Code Details
Author: Rory Angus<br>
Created: 19JAN19<br>
Version: 0.1<br>
***
This code loads the data from MongoDB and processes it to work out who has access to the different CLARA results, based on their group membership.<br>
At the moment this code is experimental and its functionality will be defined as I start to understand the data better. 

# Package Importing + Variable Setting

In [1]:
import matplotlib
#need to use this otherwise nothing appears in the notebook from the charting point of view
matplotlib.use('module://ipykernel.pylab.backend_inline')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from itertools import cycle, islice

import pandas as pd
import numpy as np
from math import pi
from math import ceil
from math import floor
import datetime

# mongo stuff
import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId

In [2]:
# packages for the widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [3]:
# When True, the notebook prints a lot of diagnostic output; this is helpful
# while writing the code but is mostly noise when running it for real.
verbose = False
# Intended meaning: the first run truncates the target database and reloads it
# from scratch. Once delta updates have been implemented this needs adjusting.
# NOTE(review): nothing in the visible code reads this flag - confirm it is used.
first_run = True

# Set display options

In [4]:
# Show every row and column and never truncate the contents of a cell.
# The full list of display options can be seen by running:
#   pd.describe_option('display')
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
try:
    # None means "no limit"; newer pandas releases reject the old -1 sentinel
    pd.set_option('display.max_colwidth', None)  # or 199
except ValueError:
    # older pandas releases only accept an integer here, where -1 means "no limit"
    pd.set_option('display.max_colwidth', -1)

# locals() # show all of the local environments

# Connect to Mongo DB

In [5]:
# Open the connection to MongoDB.
# The server is a local copy running on the dev machine; change the host and
# port here to point the notebook at a different server.
client = MongoClient(host='127.0.0.1', port=27017)

# the database that holds the CLARA data
db = client['CLARA']

# the collections that are read further down
coachDataCollection = db['raw_data_coach_coachee']
groupDataCollection = db['raw_data_group_user']
resultsDataCollection = db['raw_data_combined_user_results']
usersDataCollection = db['raw_data_claraUsers']

## Read Data 
This uses a variable that is defined above and puts it into a filter based on the student index. <br>
This needs to be replaced with the student ID.

In [6]:
# Ids to search for. This list feeds the '$in' filter of the queries further
# down; that filter is only applied when wildcardSearch is False.
searchCriteria = [
    "f2f3c425-d4bf-4b1a-8112-e5d23d48719b",
    "539693e6-bd4c-4c25-aeed-f62789032181"
]

### Read the coaching relationship file

In [7]:
# The find() call differs depending on whether a filtered search is wanted or
# every document should be returned, so this flag records which one to do.
# True = wildcard search: the search filter is ignored and everything is returned.

wildcardSearch = True


In [8]:
# Read the coach / coachee relationships from the database.
# queryField is the field the optional filter is applied to and sortField is
# the field the results are ordered by.
queryField = "userId"
sortField = "coachId"

# filter that keeps only the documents whose queryField is one of the requested ids
query = {queryField: {'$in': searchCriteria}}

# a wildcard search passes no filter at all; otherwise the filter above is used
if not wildcardSearch:
    cursor = coachDataCollection.find(query)
else:
    cursor = coachDataCollection.find()

# order the results by sortField, ascending
cursor = cursor.sort([(sortField, 1)])

# load the documents into a dataframe
dfCoach = pd.DataFrame(list(cursor))

if verbose:
    print(dfCoach.shape)
    display(dfCoach.head())

In [9]:
# Remove the bookkeeping columns that are not used further on; any of them
# that are not present are skipped silently.
dfCoach.drop(columns=['_id', 'insertdate', 'userGroup_index'],
             errors='ignore',
             inplace=True)

if verbose:
    display(dfCoach)

### Read the users file

In [10]:
# The find() call differs depending on whether a filtered search is wanted or
# every document should be returned, so this flag records which one to do.
# True = wildcard search: the search filter is ignored and everything is returned.

wildcardSearch = True


In [11]:
# Read the users from the database.
# queryField is the field the optional filter is applied to and sortField is
# the field the results are ordered by.
# NOTE(review): sorting the users by "coachId" looks like it was carried over
# from the coach query - confirm this is the intended sort field.
queryField = "userId"
sortField = "coachId"

# filter that keeps only the documents whose queryField is one of the requested ids
query = {queryField: {'$in': searchCriteria}}

# a wildcard search passes no filter at all; otherwise the filter above is used
if not wildcardSearch:
    cursor = usersDataCollection.find(query)
else:
    cursor = usersDataCollection.find()

# order the results by sortField, ascending
cursor = cursor.sort([(sortField, 1)])

# load the documents into a dataframe
dfUsers = pd.DataFrame(list(cursor))

if verbose:
    print(dfUsers.shape)
    display(dfUsers.head())

In [12]:
# Remove the user columns that are not needed; any of them that are not
# present are skipped silently.
# 'orgUserId' is deliberately kept because it is needed to link back to the results.
dfUsers.drop(columns=[
    '_id', 'insertdate', 'user_index', 'isSSO', 'additionalData',
    'learningPlatformUserId', 'coachId'
],
             errors='ignore',
             inplace=True)

if verbose:
    display(dfUsers.head())

### Merge the files together to bring in the coach's name and identifiers against the coachee's record

In [13]:
# Attach the coach's user record (when there is one) to every coach / coachee
# row: a left join of the users table onto the coach id.
dfCoach = dfCoach.merge(dfUsers,
                        how='left',
                        left_on='coachId',
                        right_on='orgUser_Id')

if verbose:
    display(dfCoach.head())

In [14]:
# Drop the columns brought in from the users table that are not needed for the
# coach details; any that are not present are skipped silently.
# 'orgUser_Id' goes because, after the join, it is a duplicate of the coachId.
# (It used to be listed twice in this list; once is enough.)
dfCoach.drop(columns=[
    'orgUser_Id', 'AvatarSupplied', 'clientUserId', 'declaraLinked',
    'languagePreference', 'primaryEmail', 'userDeletedAt', 'userStatus'
],
             inplace=True,
             errors='ignore')

if verbose:
    display(dfCoach.head())

In [15]:
# Rename the columns to indicate which ones hold the coach's display name and ids.
# NOTE(review): the names are assigned by position, so this relies on dfCoach
# having exactly these seven columns in exactly this order after the merge and
# drop steps - confirm against the data, or switch to a rename() mapping.
dfCoach.columns = [
    'coachId', 'dateFrom', 'dateTo', 'learnerId', 'coachDisplayName', 'coachNameId', 'coachOrgUserId'
]

if verbose:
    display(dfCoach.head())

In [16]:
# Add the coach / coachee details to the users table: a left join that matches
# each user's orgUser_Id against the learnerId of the coach table.
dfUsers = dfUsers.merge(dfCoach,
                        how='left',
                        left_on='orgUser_Id',
                        right_on='learnerId')

if verbose:
    display(dfUsers.head())

In [17]:
# diagnostic only: list the column names one per line, then show the whole table
if verbose:
    print(*dfUsers.columns, sep='\n')
    display(dfUsers)

In [18]:
# Rename the columns.
# The key renaming is orgUserId -> userId & orgUser_Id -> user_Id
# NOTE(review): the names are assigned by position, so this relies on dfUsers
# having exactly these eighteen columns in exactly this order after the merge
# above - confirm against the data, or switch to a rename() mapping.

dfUsers.columns = ['AvatarSupplied', 'clientUserId', 'declaraLinked', 'displayName', 'languagePreference', 'nameId', 'userId', 'user_Id', 'primaryEmail', 'userDeletedAt', 'userStatus', 'coachId', 'dateFrom', 
'dateTo', 'learnerId', 'coachDisplayName', 'coachNameId', 'coachOrgUserId']

if verbose:
    display(dfUsers.head())

In [19]:
# Build the list of coaches: the distinct, non-missing coach display names
# found in the users table. These are the values offered for selection later.
coachDisplayNames = dfUsers['coachDisplayName']
coachNames = coachDisplayNames[coachDisplayNames.notna()].unique()
del coachDisplayNames  # temporary only; keep the notebook namespace unchanged

if verbose:
    display(coachNames.shape)
    display(coachNames)

## Group Data Retrieval

### Take note of the setting here of the wild card
The code is left like this in case there is a need to use a search criteria at a later date, but for now check to see if it is using a wild card in the next cell

In [20]:
# The find() call differs depending on whether a filtered search is wanted or
# every document should be returned, so this flag records which one to do.
# True = wildcard search: the search filter is ignored and everything is returned.

wildcardSearch = True


In [21]:
# Read the group membership information from the database.
# queryField is the field the optional filter is applied to and sortField is
# the field the results are ordered by.
queryField = "userId"
sortField = "groupName"

# filter that keeps only the documents whose queryField is one of the requested ids
query = {queryField: {'$in': searchCriteria}}

# a wildcard search passes no filter at all; otherwise the filter above is used
if not wildcardSearch:
    cursor = groupDataCollection.find(query)
else:
    cursor = groupDataCollection.find()

# order the results by sortField, ascending
cursor = cursor.sort([(sortField, 1)])

# load the documents into a dataframe
dfGroup = pd.DataFrame(list(cursor))

if verbose:
    print(dfGroup.shape)
    display(dfGroup.head())

In [22]:
# diagnostic only: summarise the group data frame
if verbose:

    # how many columns and rows there are
    print("Number of columns are ", len(dfGroup.columns), sep="")
    print("Number of rows are ", len(dfGroup.index), sep="")
    print()

    # the (rows, columns) shape
    print("The shape of the data frame is ", dfGroup.shape, sep="")
    print()

    # the column names, one per line
    print("The column names of the data frame are: ")
    print(*dfGroup.columns, sep='\n')
    print()

    # each column together with its datatype
    print("The datatypes of the data frame are: ")
    print(dfGroup.dtypes)
    print()

# Group stuff
This needs work because the relationship used in this code is not the correct one. An intersection table needs to be used, and the dates on which the surveys were taken also need to be taken into account.

In [23]:
# The distinct group names found in the group data. These are the values
# offered for selection later.
groupId = pd.unique(dfGroup['groupName'])

if verbose:
    display(groupId.shape)
    display(groupId)

### CLARA Results Retrieval

In [24]:
# The find() call differs depending on whether a filtered search is wanted or
# every document should be returned; True means return everything.
wildcardSearch = True

# Read the CLARA results from the database.
# queryField is the field the optional filter is applied to and sortField is
# the field the results are ordered by.
# NOTE(review): sorting the results by "groupName" looks like it was carried
# over from the group query - confirm this is the intended sort field.
queryField = "userId"
sortField = "groupName"

# filter that keeps only the documents whose queryField is one of the requested ids
query = {queryField: {'$in': searchCriteria}}

# a wildcard search passes no filter at all; otherwise the filter above is used
if not wildcardSearch:
    cursor = resultsDataCollection.find(query)
else:
    cursor = resultsDataCollection.find()

# order the results by sortField, ascending
cursor = cursor.sort([(sortField, 1)])

# load the documents into a dataframe
dfResults = pd.DataFrame(list(cursor))

if verbose:
    print(dfResults.shape)
    display(dfResults.head())

In [25]:
# diagnostic only: summarise the results data that has been retrieved
if verbose:

    # how many columns and rows there are
    print("Number of columns are ", len(dfResults.columns), sep="")
    print("Number of rows are ", len(dfResults.index), sep="")
    print()

    # the (rows, columns) shape
    print("The shape of the data frame is ", dfResults.shape, sep="")
    print()

    # the column names, one per line
    print("The column names of the data frame are: ")
    print(*dfResults.columns, sep='\n')
    print()

    # each column together with its datatype
    print("The datatypes of the data frame are: ")
    print(dfResults.dtypes)
    print()

# Create additional columns
These will help in understanding the data and if the surveys are valid or not.
There are many test journeys and the like that we don't want to include in the analysis

## Second Survey
This makes a column that indicates if the student completed a second survey

In [26]:
# True when the creation date of the second (measure) survey is present,
# False when it is blank
dfResults["completedSecondSurvey"] = (
    dfResults["measure_ClaraResultsCreatedAt"].notnull())

## Survey Duration

In [27]:
# Duration of surveys

# Convert the four survey timestamp columns to datetime.
# Plain column assignment is used rather than dfResults.loc[:, col] = ...
# because newer pandas treats a full-column .loc assignment as an in-place
# write that tries to keep the existing dtype, which can leave the column
# un-converted (e.g. still dtype object).
for timestampColumn in ("diagnose_ClaraResultsCreatedAt",
                        "diagnose_ClaraResultCompletedAt",
                        "measure_ClaraResultsCreatedAt",
                        "measure_ClaraResultCompletedAt"):
    dfResults[timestampColumn] = pd.to_datetime(dfResults[timestampColumn])

In [28]:
# Time taken to complete each survey: completion time minus creation time
surveyOneStart = dfResults.loc[:, "diagnose_ClaraResultsCreatedAt"]
surveyOneEnd = dfResults.loc[:, "diagnose_ClaraResultCompletedAt"]
dfResults.loc[:, "surveyOneDuration"] = surveyOneEnd - surveyOneStart

surveyTwoStart = dfResults.loc[:, "measure_ClaraResultsCreatedAt"]
surveyTwoEnd = dfResults.loc[:, "measure_ClaraResultCompletedAt"]
dfResults.loc[:, "surveyTwoDuration"] = surveyTwoEnd - surveyTwoStart

# temporaries only; keep the notebook namespace unchanged
del surveyOneStart, surveyOneEnd, surveyTwoStart, surveyTwoEnd

In [29]:
# Gap between the two surveys: creation of the second (measure) survey minus
# completion of the first (diagnose) survey
dfResults.loc[:, "surveyBetweenDuration"] = (
    dfResults.loc[:, "measure_ClaraResultsCreatedAt"]
    - dfResults.loc[:, "diagnose_ClaraResultCompletedAt"])

# Interactive section
As the code for the widgets executes immediately, this needs to be nested inside a function otherwise it doesn't wait for the user to select their input.<br>
This function contains the code for a second button to seek user interaction to save the file.<br>

In [77]:
# this function is used to display the groups to the user and set the value for use in the rest of the munging
# it contains all of the remaining code otherwise it just skips past this button and executes it which is meaningless in this context


def GroupIdSelect(groupId, coachName):
    """Select the CLARA results for a group and/or coach and offer to save them.

    Looks up the user ids that belong to the chosen group (from ``dfGroup``)
    and to the chosen coach (from ``dfUsers``), combines them, prints a summary
    of the selection, displays a sample of the matching rows of ``dfResults``
    and finally shows a second widget that lets the user save the selection as
    a CSV file.

    All of the remaining processing lives inside this function because it is
    used as a widget callback; code placed after the widget would run straight
    away instead of waiting for the user's selection.

    Parameters
    ----------
    groupId : str
        Group name picked by the user; "" means no group was picked.
    coachName : str
        Coach display name picked by the user; "" means no coach was picked.

    Returns
    -------
    None
        The results are printed, displayed and (optionally) written to file.
    """

    # User ids of the members of the selected group. Duplicates are removed so
    # that every person is counted once in the totals printed below.
    groupUserId = dfGroup.loc[dfGroup['groupName'] == groupId,
                              'userId'].drop_duplicates().tolist()

    # User ids of the people coached by the selected coach, again without duplicates
    coachUserId = dfUsers.loc[dfUsers['coachDisplayName'] == coachName,
                              'userId'].drop_duplicates().tolist()

    # if any of the options are blank then don't include them
    if groupId == "" and coachName == "":
        combinedUserId = []
        # print out to the user their selection and their matches
        print("\n" + "*" * 45)
        print("\n       You did not select a group \n")
        print("\n       You did not select a coach \n")
        print("\nThis results in " + str(len(combinedUserId)) +
              " people selected")
    elif groupId == "":
        combinedUserId = coachUserId
        # print out to the user their selection and their matches
        print("\n" + "*" * 45)
        print("\n       You did not select a group \n")
        print("\n       The coach you picked is: \n")
        print("              " + coachName + "")
        print("\nThe number of members in the coach selection is: " +
              str(len(coachUserId)))
        print("\nThis results in " + str(len(combinedUserId)) +
              " people selected")
    elif coachName == "":
        combinedUserId = groupUserId
        # print out to the user their selection and their matches
        print("\n" + "*" * 45)
        print("\n         The group you picked is: \n")
        print("              " + groupId + "")
        print("\nThe number of members in the group is: " +
              str(len(groupUserId)))
        print("\n         You did not select a coach \n")
        print("\nThis results in " + str(len(combinedUserId)) +
              " people selected")
    else:
        # this is the intersection of the two selections
        # only the students that match both
        combinedUserId = list(set(groupUserId) & set(coachUserId))

        # print out to the user their selection and their matches
        print("\n" + "*" * 45)
        print("\n         The group you picked is: \n")
        print("              " + groupId + "")
        print("\n The number of members in the group is: " +
              str(len(groupUserId)))
        print("\n         The coach you picked is: \n")
        print("              " + coachName + "")
        print("\nThe number of members in the coach selection is: " +
              str(len(coachUserId)))
        print("\nThis results in " + str(len(combinedUserId)) +
              " people selected")
    # footer for display
    print("\n" + "*" * 45 + "\n")

    # use isin to select the result rows that belong to the selected user ids;
    # copy so that the column drop below does not touch dfResults
    selectedResults = dfResults.loc[dfResults['userId'].isin(
        combinedUserId)].copy()
    print()
    print("The " + str(len(combinedUserId)) + " people selected have " +
          str(len(selectedResults)) + " corresponding CLARA Journey results")
    print("\nA sample of the key columns and data is: \n")
    display(selectedResults[[
        'userPrimaryEmail', 'completedSecondSurvey', 'journeyTitle',
        'journeyGoal', 'diagnose_ClaraResultsCreatedAt',
        'measure_ClaraResultsCreatedAt'
    ]].head())

    # drop the unneeded columns from the results set; any that are not present
    # are skipped silently
    selectedResults.drop(columns=[
        '_id', 'claraResultsJourneyStep', 'diagnose_ClaraId', 'insertdate',
        'journeyId', 'measure_ClaraId', 'measure_ClaraResultsJourneyStep',
        'numTotalClaraJourneySurveys', 'numTotalClaraSurveys', 'rowIndex',
        'userDeclaraLinked', 'userDeletedAt', 'userAvatarSupplied',
        'userClientUserId', 'userExtraData', 'userId',
        'userLanguagePreference', 'userStatus', 'userName'
    ],
                         inplace=True,
                         errors='ignore')

    if verbose:
        print("\nThe column names: \n")
        print(selectedResults.columns)
        print("\nA sample of the all columns and data is: \n")
        display(selectedResults.head())

    ############# !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~! #############
    # The save-file section starts here.
    # add a few extra lines for separation from the output above
    print("\n" * 3)
    print("\n" + "*" * 80 + "\n")
    print("Specify the name of the file to be saved... ")
    print(
        "This will automatically place a comma serparated file onto the platform"
    )
    print(
        "at the following location - http://127.0.0.1:8888/tree/datasets/CLARA/UserSaved"
    )
    print(
        "Please note that this will overwrite any file with the same name, please check before saving the file."
    )
    print()
    print(
        "A good format could be YYMMDD_SubjectOrGroup_LecturerOrTutor - It depends on the the data that you have selected."
    )
    print()

    # # Save File
    # This section waits for the user to interact with the widget to set the
    # file name of the csv file.

    # Default text shown in the box. saveFile refuses to save while the box
    # still holds this placeholder value.
    filename = "Specify_filename"
    # folder that the csv files are written to
    saveFolder = "~/datasets/CLARA/UserSaved/"
    # define the widget controls
    wFilename = widgets.Text(
        value=filename,
        placeholder='Go on... type the file name?',
        description='Filename:',
        disabled=False)

    def saveFile(filename):
        """Write ``selectedResults`` to a csv file named by the user.

        Nothing is written when the name is blank or is still the default
        placeholder; a message explains why instead.

        Parameters
        ----------
        filename : str
            Text taken from the filename box; ".csv" is appended when missing.
        """

        # ignore accidental leading/trailing blanks so that a name made up of
        # spaces only is treated as blank rather than saved as "   .csv"
        filename = filename.strip()

        # Make sure the name ends in .csv. The tests below take this into
        # account: a blank name has become ".csv" by the time it is checked.
        if not filename.endswith(".csv"):
            filename = filename + ".csv"

        if filename == ".csv":  # the box was left blank
            print("\n" + "*" * 45 + "\n")
            print("     You need to enter a filename!")
            print("\n" + "*" * 45)

        elif filename != "Specify_filename.csv":  # the user typed their own name
            # don't write the index to the file as it is not required and it breaks when reading the file back in later on
            selectedResults.to_csv(saveFolder + filename, index=False)
            print("\n" + "##!" * 45 + "\n")
            print("Congrats! \n\nFile of " + str(len(selectedResults)) +
                  " records succesfully written to: " +
                  saveFolder + filename)
            print("\n" + "##!" * 45 + "\n")

        else:  # the box still holds the default placeholder name
            print("\n" + "*" * 45 + "\n")
            print("         File is NOT saved!")
            print("     You need to specify the file name in the box.")
            print("\n" + "*" * 45 + "\n")

        # nothing to return: the file has either been written or not
        return

    ## Set up the interaction component
    # set the name on the button
    interact_save = interact_manual.options(manual_name="Save File")

    # show the filename box and the button; saveFile does the actual saving
    interact_save(saveFile, filename=wFilename)

    # nothing to return: the data was selected and optionally written to file
    return

## Set up the interaction component

In [31]:
# Add a blank entry to the group names; it is used as the default value of the
# dropdown. The check keeps the list free of repeated blanks when this cell is
# run more than once.
if "" not in list(groupId):
    groupId = np.append(groupId, "")

# define the widget controls for the groups that are available
wGroupId = widgets.Dropdown(
    options=groupId, description='Select Group', value='', disabled=False)

if verbose:
    print(groupId)

In [32]:
# Add a blank entry to the coach names; it is used as the default value of the
# dropdown. The check keeps the list free of repeated blanks when this cell is
# run more than once.
if "" not in list(coachNames):
    coachNames = np.append(coachNames, "")

# define the widget controls for the coach names
wCoachNames = widgets.Dropdown(
    options=coachNames, description='Select Coach', value='', disabled=False)

if verbose:
    print(coachNames)

In [33]:
# Build an interact_manual variant whose button is labelled "Pick Group + Coach"
interactGroupId = interact_manual.options(manual_name="Pick Group + Coach")

In [78]:
# blank lines to separate the widgets from whatever was output before
print("\n" * 3)
# Show the group and coach dropdowns together with the button; GroupIdSelect
# receives the selected group and coach and does the rest of the processing.
interactGroupId(GroupIdSelect, groupId=wGroupId, coachName=wCoachNames)

print("\n" * 3)







interactive(children=(Dropdown(description='Select Group', index=27, options=('2019 Autumn', '2019 Summer', '2…





