# Graph Development CSPS
Using datasets pulled by Sam and co for the Data Explorer

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import os
import warnings
import networkx as nx

data_dir = "~/data/csps/"

warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


## Offerings
* Have course title
* offering ID
* Course code
* Delivery status
* Region
* City

**Recommendation: Combine with Product info and registrations**

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
df = pd.read_csv("Offerings.csv", sep=None, engine="python")

In [None]:
df.head()

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
deps = pd.read_csv("Departments.csv", sep=None, engine="python")

In [None]:
deps.head()

## Product Info
* Some good info here - communities, business line
* Missing title, but can use dictionary with course code and offerings
* **Can use entity extraction on course_description_en**
[link](https://github.com/ToferC/sqlalchemy_gctools/blob/master/Content%20Analysis/SQLAlchemy%20GCconnex%20Classifier%20-%20April%202017.ipynb)

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
prod = pd.read_csv("product_info.csv", sep=None, engine="python")

In [None]:
prod.head(5)

In [None]:
prod.loc[4]["course_description_en"]

## Ratings
* Tied to offering ID
* NOT tied to registration ID
* Separate survey ID
* Learner classification
* Learner department
* Satisfaction (1-5)
* **Can link to offering ID in registrations**

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
ratings = pd.read_csv("Ratings.csv", sep=None, engine="python")

In [None]:
ratings.head()

## Overall Satisfaction
* Survey ID to 10pt satisfaction scale
* Can get survey ID from ratings

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
satisf = pd.read_csv("Overall Satisfaction.csv", sep=None, engine="python")

In [None]:
satisf.head()

## Comments
* Separate row per comment (multiple possible on a survey ID)
* Should have datetime for comments
* Easy to aggregate for course

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
comments = pd.read_csv("Comments.csv", sep=None, engine="python")

In [None]:
# Group the comment rows by their survey_id
grouped_comments = comments.groupby(comments.survey_id)

In [None]:
grouped_comments.head()

## Registrations
* This is gold
* Core info on what our users are doing
* Need to pair with evaluation data
* Time series
* Departmental data
* Language
* Classification & Level (separate these)
* Big enough for ML model
* Need data on whether learner attended or completed learning
* Some strangeness in combining online and in-person learning -- consider separating them

In [None]:
# sep=None asks pandas to sniff the delimiter. Only the Python parsing engine
# supports that, so request it explicitly instead of relying on the fallback.
reg = pd.read_csv("Registrations.csv", sep=None, engine="python")

In [None]:
reg.head()

In [None]:
# Per course_code, count the non-null entries in each column
courses = reg.groupby(reg.course_code).count()

In [None]:
courses.head()

### Look at columns

In [None]:
# Print every column of `reg` next to its positional index
for i, c in enumerate(reg.columns):
    print(i, c)


Thinking here:

* learners are nodes {learner_id = 0, learner-classif, learner_language}
* departments are nodes {billing_dept_name_en}
* courses are nodes - display course name - data = course code, business type, learner_city_en
* edge == took {learner -> course } {start_date, end_date, reg_status, no_show}
* edge == belongs_to { learner -> department }
* We should have registration ID tied to the ratings & comments
* Missing data on results of learning - did they complete?
* Probably separate online & in-person learning


## Set up helper functions

This graph will separate all learner functions to mirror a graph database

* Potential here: track group, class, level by learner at every interaction with our system
* We should have a 6 month update questionnaire
    
We should be connecting:
    
* learner_id -> department (date)
* learner -> classification (date)
* learner -> level (date)
* learner -> registration (registration_id) (date, reg_status)
* learner -> city (date)
* learner -> province_en (date)
* registration_id -> offering (offering_id)
* offering -> course (course_code, course_name_en)
* province -> city

### Add Learner

In [None]:
# Create or update the node for a single learner
def learner_add_or_update(G, row):
    """Record one registration row on its learner node.

    Args:
        G: Graph being built; must provide ``has_node``, ``add_node`` and a
            ``nodes`` mapping of node id to attribute dict.
        row: Registration record exposing ``learner_id``,
            ``learner_classif``, ``learner_province_en``, ``no_show`` and
            ``billing_dept_name_en``.

    The first row seen for a learner creates the node, and its descriptive
    attributes are taken from that row only. Every later row for the same
    learner just increments ``count`` and adds its ``no_show`` value.
    """
    learner = row.learner_id

    if not G.has_node(learner):
        G.add_node(
            learner,
            # The "name" attribute carries the learner's classification.
            name=row.learner_classif,
            bipartite="learner",
            province=row.learner_province_en,
            no_show=row.no_show,
            department=row.billing_dept_name_en,
            count=1,
        )
        return

    attrs = G.nodes[learner]
    attrs["count"] += 1
    attrs["no_show"] += row.no_show

### Add Department

In [None]:
def department_add_or_update(G, row):
    """Record one registration row on its billing-department node.

    Args:
        G: Graph being built; must provide ``has_node``, ``add_node`` and a
            ``nodes`` mapping of node id to attribute dict.
        row: Registration record exposing ``billing_dept_name_en`` and
            ``no_show``.

    The department name is both the node id and its ``name`` attribute.
    A new department starts with ``count=1``; repeat rows increment
    ``count`` and accumulate ``no_show``.
    """
    dept = row.billing_dept_name_en

    if not G.has_node(dept):
        G.add_node(
            dept,
            name=dept,
            bipartite="department",
            count=1,
            no_show=row.no_show,
        )
        return

    attrs = G.nodes[dept]
    attrs["count"] += 1
    attrs["no_show"] += row.no_show

### Add Course

In [None]:
def course_add_or_update(G, row):
    """Record one registration row on its course node.

    Args:
        G: Graph being built; must provide ``has_node``, ``add_node`` and a
            ``nodes`` mapping of node id to attribute dict.
        row: Registration record exposing ``course_title_en``,
            ``business_type``, ``client`` and ``course_code``.

    Courses are keyed by their English title. The first row creates the
    node with ``count=1``; each further row for the same title increments
    ``count`` and leaves the other attributes as first recorded.
    """
    title = row.course_title_en

    if not G.has_node(title):
        G.add_node(
            title,
            business_type=row.business_type,
            client=row.client,
            course_code=row.course_code,
            bipartite="course",
            count=1,
        )
        return

    G.nodes[title]["count"] += 1

### Add Registration

In [None]:
def registration_add_or_update(G, row):
    """Record one registration row on its registration node.

    Args:
        G: Graph being built; must provide ``has_node``, ``add_node`` and a
            ``nodes`` mapping of node id to attribute dict.
        row: Registration record exposing ``reg_id``.

    The node is keyed by ``reg_id`` and tagged ``bipartite="registration"``.
    It starts at ``count=1`` and is incremented whenever the same
    ``reg_id`` appears again.
    """
    registration = row.reg_id

    if not G.has_node(registration):
        G.add_node(registration, bipartite="registration", count=1)
        return

    G.nodes[registration]["count"] += 1

### Add Offering

In [None]:
def offering_add_or_update(G, row):
    """Record one registration row on its offering node.

    Args:
        G: Graph being built; must provide ``has_node``, ``add_node`` and a
            ``nodes`` mapping of node id to attribute dict.
        row: Registration record exposing ``offering_id``,
            ``course_title_en``, ``course_code``, ``offering_city_en``,
            ``offering_province_en``, ``business_type``, ``start_date``,
            ``end_date`` and ``no_show``.

    Each distinct ``offering_id`` gets its own node, so one course may be
    represented by several offering nodes. The descriptive attributes are
    taken from the first row seen; later rows increment ``count`` and
    accumulate ``no_show``.
    """
    offering = row.offering_id

    if not G.has_node(offering):
        G.add_node(
            offering,
            name=row.course_title_en,
            course_code=row.course_code,
            city=row.offering_city_en,
            province=row.offering_province_en,
            business_type=row.business_type,
            start_date=row.start_date,
            end_date=row.end_date,
            bipartite="offering",
            no_show=row.no_show,
            count=1,
        )
        return

    attrs = G.nodes[offering]
    attrs["count"] += 1
    attrs["no_show"] += row.no_show

### Add Learner to Registration Edge

In [None]:
def add_learner_to_registration_edge(G, row):
    """Connect a learner to one of their registrations.

    Args:
        G: Graph being built; must provide ``has_edge``, ``add_edge`` and
            an ``edges`` mapping indexed by ``[source, target]``.
        row: Registration record exposing ``learner_id``, ``reg_id``,
            ``reg_status``, ``no_show``, ``start_date`` and ``end_date``.

    A missing edge is created with ``edge_type="registered_to"`` and the
    row's status, no-show value and dates. If the edge already exists,
    only its ``no_show`` total is increased.
    """
    source, target = row.learner_id, row.reg_id

    if G.has_edge(source, target):
        G.edges[source, target]["no_show"] += row.no_show
        return

    G.add_edge(
        source,
        target,
        edge_type="registered_to",
        status=row.reg_status,
        no_show=row.no_show,
        start_date=row.start_date,
        end_date=row.end_date,
    )

### Add Registration to Offering Edge

In [None]:
def add_registration_to_offering_edge(G, row):
    """Connect a registration to the offering it was made for.

    Args:
        G: Graph being built; must provide ``has_edge``, ``add_edge`` and
            an ``edges`` mapping indexed by ``[source, target]``.
        row: Registration record exposing ``reg_id``, ``offering_id``,
            ``reg_status``, ``no_show``, ``start_date`` and ``end_date``.

    A missing edge is created with ``edge_type="reg_to_offering"`` and the
    row's status, no-show value and dates. If the edge already exists,
    only its ``no_show`` total is increased.
    """
    source, target = row.reg_id, row.offering_id

    if G.has_edge(source, target):
        G.edges[source, target]["no_show"] += row.no_show
        return

    G.add_edge(
        source,
        target,
        edge_type="reg_to_offering",
        status=row.reg_status,
        no_show=row.no_show,
        start_date=row.start_date,
        end_date=row.end_date,
    )

### Add Offering to Course Edge

In [None]:
def add_offering_to_course_edge(G, row):
    """Connect an offering to the course it is an instance of.

    Args:
        G: Graph being built; must provide ``has_edge`` and ``add_edge``.
        row: Registration record exposing ``offering_id``,
            ``course_title_en``, ``offering_city_en`` and
            ``offering_province_en``.

    The edge is created once, with ``edge_type="offering_of"`` and the
    offering's city and province. Later rows for the same offering/course
    pair leave the existing edge untouched.
    """
    offering, course = row.offering_id, row.course_title_en

    if G.has_edge(offering, course):
        return

    G.add_edge(
        offering,
        course,
        city=row.offering_city_en,
        # Use the offering's own province so that city and province describe
        # the same place (previously the learner's province was stored here).
        province=row.offering_province_en,
        edge_type="offering_of",
    )

### Add Learner to Department Edge

In [None]:
def add_learner_to_department_edge(G, row):
    """Connect a learner to their billing department.

    Args:
        G: Graph being built; must provide ``has_edge`` and ``add_edge``.
        row: Registration record exposing ``learner_id``,
            ``billing_dept_name_en``, ``start_date`` and ``end_date``.

    The edge is created once, with ``edge_type="belongs_to"`` and the
    dates of the first row that produced it. Later rows for the same
    learner/department pair do not modify it.
    """
    learner, dept = row.learner_id, row.billing_dept_name_en

    if G.has_edge(learner, dept):
        return

    G.add_edge(
        learner,
        dept,
        edge_type="belongs_to",
        start_date=row.start_date,
        end_date=row.end_date,
    )

### Add additional data nodes & edges

In [None]:
def add_data_nodes(G, row):
    """Add city, province and classification nodes and link them up.

    Args:
        G: Graph being built; must provide ``has_node``, ``add_node`` and
            ``add_edge``.
        row: Registration record exposing ``learner_id``,
            ``learner_classif``, ``learner_province_en``,
            ``offering_city_en``, ``start_date`` and ``end_date``.

    Three edges are written on every call: learner -> classification,
    learner -> city, and city -> province. The first two carry the row's
    start and end dates.

    NOTE(review): the city comes from the offering while the province comes
    from the learner, so the ``city_in`` edge can pair an offering city with
    a learner's province -- confirm this is intended.
    """
    learner = row.learner_id
    # Classification labels are stored with their spaces stripped out.
    classif_key = row.learner_classif.replace(" ", "")
    province_key = row.learner_province_en
    city_key = row.offering_city_en

    # Create each lookup node the first time its key shows up.
    for key, kind in (
        (city_key, "city"),
        (province_key, "province"),
        (classif_key, "classification"),
    ):
        if not G.has_node(key):
            G.add_node(key, bipartite=kind)

    span = {"start_date": row.start_date, "end_date": row.end_date}
    G.add_edge(learner, classif_key, edge_type="of_classification", **span)
    G.add_edge(learner, city_key, edge_type="in_city", **span)
    G.add_edge(city_key, province_key, edge_type="city_in")


### Generate Full Graph
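Generates a graph containing every node type (learners, departments, courses, registrations, offerings, cities, provinces and classifications). Note that `generate_graph` currently processes only every 5th registration row.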

In [None]:
# Build a directed graph of learners, departments, courses, registrations
# and offerings from the registrations table and export it as GEXF.
# Call:
#
# generate_graph(reg, "test_file_1")

def generate_graph(reg, filename="", sample_every=5):
    """Build the registrations graph and write it to ``<filename>.gexf``.

    Args:
        reg: Registrations table; it is iterated with ``itertuples()`` and
            each row is handed to the node and edge helper functions.
        filename: Output path without the ``.gexf`` extension. When empty,
            ``full_graph-<yy-mm-dd>`` for the current date is used.
        sample_every: Keep one row out of every ``sample_every`` rows,
            starting with the first. The default of 5 matches the sampling
            that used to be hard-coded; pass 1 to include every row.

    Raises:
        ValueError: If ``sample_every`` is smaller than 1.
    """
    if sample_every < 1:
        raise ValueError("sample_every must be a positive integer")

    if not filename:
        filename = f"full_graph-{datetime.now().strftime('%y-%m-%d')}"

    G = nx.DiGraph()

    for i, row in enumerate(reg.itertuples()):
        # Down-sample the table to keep the exported graph manageable.
        if i % sample_every:
            continue

        # Nodes go in first so the edge helpers attach to nodes that
        # already carry their attributes.
        learner_add_or_update(G, row)
        department_add_or_update(G, row)
        course_add_or_update(G, row)
        registration_add_or_update(G, row)
        offering_add_or_update(G, row)

        # Edges between the nodes created above
        add_learner_to_registration_edge(G, row)
        add_learner_to_department_edge(G, row)
        add_registration_to_offering_edge(G, row)
        add_offering_to_course_edge(G, row)

        # City / province / classification nodes and their edges
        add_data_nodes(G, row)

    nx.write_gexf(G, filename + ".gexf")


## Full Registrations Graph
* Can do timeseries with each registration
* Useful to have department in graphs for community detection
* LOTS of nodes & edges

In [None]:
# Test full non-aggregated graph
generate_graph(reg)

## To Do
* Create graph for offerings
* Test graph for other options
* Remove large graph functions below
* Start adding in satisfaction and rating results

## Individual dynamic no depts
* Main difference from above is that no departments present & full dynamic setup

### Thinking
* Probably useful to start combining departments, classifications and add weights to edges to showcase use
* Also issues with what happens during registration - use the registration ID as the edge uID?

## Summary Graph
* Combines registrations by department * classification
* Dramatically reduces the number of nodes & edges
* Loses a lot of granularity
* Means that time series has low value
* Still a lot of nodes

## No department graph
* As summary graph, but removes departments as nodes
* Adds department to node attributes
* Cleans up graph and overall weighting
* Makes it harder to see which departments are part of which communities