In [1]:
import pandas

from data_algebra.data_ops import *  # https://github.com/WinVector/data_algebra
from data_algebra.data_pipe import Locum

# Toy survey data: two subjects, each scored on the same two survey
# categories, plus two filler columns that the pipeline never reads.
d_local = pandas.DataFrame(
    {
        'subjectID': [1, 1, 2, 2],
        'surveyCategory': ["withdrawal behavior", "positive re-framing"] * 2,
        'assessmentTotal': [5, 2, 3, 4],
        'irrelevantCol1': ['irrel1', 'irrel1', 'irrel1', 'irrel1'],
        'irrelevantCol2': ['irrel2', 'irrel2', 'irrel2', 'irrel2'],
    }
)

# Multiplier applied to assessmentTotal before exponentiation; referenced by
# name from inside the pipeline expression strings.
scale = 0.237

# Build the whole pipeline as one chained expression against a description
# of d_local (registered under the table name 'd').  The Env context is given
# locals() so that the name `scale` used inside the expression string can be
# looked up; the printed pipeline shows the literal 0.237 in its place.
with data_algebra.env.Env(locals()) as env:
    ops = (
        data_algebra.data_ops.describe_table(d_local, 'd')
        # un-normalized per-row score
        .extend({'probability': '(assessmentTotal * scale).exp()'})
        # per-subject total used as the normalizer
        .extend({'total': 'probability.sum()'}, partition_by='subjectID')
        .extend({'probability': 'probability/total'})
        # negate so that ordering by sort_key puts the largest probability first
        .extend({'sort_key': '-probability'})
        .extend(
            {'row_number': '_row_number()'},
            partition_by=['subjectID'],
            order_by=['sort_key'],
        )
        # keep only the first-ranked row of each subject
        .select_rows('row_number == 1')
        .select_columns(['subjectID', 'surveyCategory', 'probability'])
        # mapping is new_name: old_name (the result column is 'diagnosis')
        .rename_columns({'diagnosis': 'surveyCategory'})
    )

print(ops)

TableDescription(table_name='d', column_names=['subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1', 'irrelevantCol2']) .\
   extend({'probability': '(assessmentTotal * 0.237).exp()'}) .\
   extend({'total': 'probability.sum()'}, partition_by=['subjectID']) .\
   extend({'probability': 'probability / total'}) .\
   extend({'sort_key': '-probability'}) .\
   extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['sort_key']) .\
   select_rows('row_number == 1') .\
   select_columns(['subjectID', 'surveyCategory', 'probability']) .\
   rename_columns({'diagnosis': 'surveyCategory'})


In [2]:
# Reusable pipeline fragment, not bound to any table: score each row, total
# the scores within each subjectID, then divide to get a per-subject
# normalized probability.
prob_calculation = (
    Locum()
    .extend({'probability': '(assessmentTotal * 0.237).exp()'})
    .extend({'total': 'probability.sum()'}, partition_by='subjectID')
    .extend({'probability': 'probability/total'})
)

print(prob_calculation)

[
    Extend({'probability': '(assessmentTotal * 0.237).exp()'}, partition_by=None, order_by=None, reverse=None),
    Extend({'total': 'probability.sum()'}, partition_by='subjectID', order_by=None, reverse=None),
    Extend({'probability': 'probability/total'}, partition_by=None, order_by=None, reverse=None),
]


In [3]:
# Reusable pipeline fragment: number the rows within each subjectID, ordered
# by negated probability, and keep only the row numbered 1.
top_rank = (
    Locum()
    .extend({'sort_key': '-probability'})
    .extend(
        {'row_number': '_row_number()'},
        partition_by=['subjectID'],
        order_by=['sort_key'],
    )
    .select_rows('row_number == 1')
)

print(top_rank)

[
    Extend({'sort_key': '-probability'}, partition_by=None, order_by=None, reverse=None),
    Extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['sort_key'], reverse=None),
    SelectRows('row_number == 1'),
]


In [4]:
# Reusable pipeline fragment: keep only the three reporting columns, then
# rename surveyCategory to diagnosis (the mapping is new_name: old_name).
clean_up_columns = (
    Locum()
    .select_columns(['subjectID', 'surveyCategory', 'probability'])
    .rename_columns({'diagnosis': 'surveyCategory'})
)

print(clean_up_columns)

[
    SelectColumns(['subjectID', 'surveyCategory', 'probability']),
    RenameColumns({'diagnosis': 'surveyCategory'}),
]


In [5]:
# Attach the three table-free fragments, in order, to a description of
# d_local.  The printed result matches the pipeline built in a single chain
# in the first cell.
ops = (
    data_algebra.data_ops.describe_table(d_local, 'd')
    + prob_calculation
    + top_rank
    + clean_up_columns
)

print(ops)

TableDescription(table_name='d', column_names=['subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1', 'irrelevantCol2']) .\
   extend({'probability': '(assessmentTotal * 0.237).exp()'}) .\
   extend({'total': 'probability.sum()'}, partition_by=['subjectID']) .\
   extend({'probability': 'probability / total'}) .\
   extend({'sort_key': '-probability'}) .\
   extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['sort_key']) .\
   select_rows('row_number == 1') .\
   select_columns(['subjectID', 'surveyCategory', 'probability']) .\
   rename_columns({'diagnosis': 'surveyCategory'})


In [6]:
# Pipe the local data frame through the composed pipeline; being the last
# expression in the cell, the resulting frame is shown as the cell output
# (one row per subjectID with its 'diagnosis' and 'probability').
d_local >> ops


Unnamed: 0,subjectID,diagnosis,probability
0,1,withdrawal behavior,0.670622
1,2,positive re-framing,0.558974
