This is an [Apache `Spark`](https://spark.apache.org) variation of the [logistic scoring example](https://github.com/WinVector/data_algebra/blob/master/Examples/LogisticExample/ScoringExample.ipynb).

Not all of the Spark code paths are currently implemented or tested, but this example gives the general idea.


In [1]:
import pyspark
import pyspark.sql

import pandas

from data_algebra.data_ops import *
import data_algebra.SparkSQL

In [2]:
# Example survey responses: two subjects, each scored on the same two survey
# categories, plus two filler columns that the scoring pipeline never reads.
d_local = pandas.DataFrame(
    {
        'subjectID': [1, 1, 2, 2],
        'surveyCategory': ["withdrawal behavior", "positive re-framing"] * 2,
        'assessmentTotal': [5, 2, 3, 4],
        'irrelevantCol1': ['irrel1'] * 4,
        'irrelevantCol2': ['irrel2'] * 4,
    }
)

d_local

Unnamed: 0,subjectID,surveyCategory,assessmentTotal,irrelevantCol1,irrelevantCol2
0,1,withdrawal behavior,5,irrel1,irrel2
1,1,positive re-framing,2,irrel1,irrel2
2,2,withdrawal behavior,3,irrel1,irrel2
3,2,positive re-framing,4,irrel1,irrel2


In [3]:
# Open the example Spark database handle. Later cells use it to insert a table,
# generate SQL, run queries, and finally close it.
handle = data_algebra.SparkSQL.example_handle()


In [4]:
# Load the local pandas frame into the database as table 'd'.
# NOTE(review): allow_overwrite=True presumably lets this cell be re-run when
# 'd' already exists -- confirm against the data_algebra documentation.
handle.insert_table(d_local, table_name='d', allow_overwrite=True)

TableDescription(
 table_name='d',
 column_names=[
   'subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1',  
 'irrelevantCol2'])

In [5]:
# Read table 'd' back through the handle as a sanity check; the displayed
# result should match d_local.
local_copy = handle.read_query('SELECT * FROM d')

local_copy

Unnamed: 0,subjectID,surveyCategory,assessmentTotal,irrelevantCol1,irrelevantCol2
0,1,withdrawal behavior,5,irrel1,irrel2
1,1,positive re-framing,2,irrel1,irrel2
2,2,withdrawal behavior,3,irrel1,irrel2
3,2,positive re-framing,4,irrel1,irrel2


In [6]:
scale = 0.237

# Scoring pipeline, written as a single parenthesized method chain.
ops = (
    data_algebra.data_ops.describe_table(d_local, 'd')
    # un-normalized score: exp(assessmentTotal * scale)
    .extend({'probability': f'(assessmentTotal * {scale}).exp()'})
    # per-subject sum of the scores
    .extend({'total': 'probability.sum()'}, partition_by='subjectID')
    # divide by the per-subject total to get a probability
    .extend({'probability': 'probability/total'})
    # negate so that ordering by sort_key ranks the largest probability first
    .extend({'sort_key': '-probability'})
    .extend(
        {'row_number': '_row_number()'},
        partition_by=['subjectID'],
        order_by=['sort_key'],
    )
    # keep only the top-ranked row for each subject
    .select_rows('row_number == 1')
    .select_columns(['subjectID', 'surveyCategory', 'probability'])
    # mapping is new name -> old name: surveyCategory is reported as diagnosis
    .rename_columns({'diagnosis': 'surveyCategory'})
)

print(ops.to_python(pretty=True))

TableDescription(
    table_name="d",
    column_names=[
        "subjectID",
        "surveyCategory",
        "assessmentTotal",
        "irrelevantCol1",
        "irrelevantCol2",
    ],
).extend({"probability": "(assessmentTotal * 0.237).exp()"}).extend(
    {"total": "probability.sum()"}, partition_by=["subjectID"]
).extend(
    {"probability": "probability / total"}
).extend(
    {"sort_key": "-probability"}
).extend(
    {"row_number": "_row_number()"}, partition_by=["subjectID"], order_by=["sort_key"]
).select_rows(
    "row_number == 1"
).select_columns(
    ["subjectID", "surveyCategory", "probability"]
).rename_columns(
    {"diagnosis": "surveyCategory"}
)



In [7]:
# Apply the pipeline directly to the in-memory pandas frame, without going
# through the Spark handle.
ops.transform(d_local)


Unnamed: 0,subjectID,diagnosis,probability
0,1,withdrawal behavior,0.670622
1,2,positive re-framing,0.558974


In [8]:
# Translate the same pipeline into a SQL query string via the handle, and
# print it for inspection.
sql = handle.to_sql(ops, pretty=True)

print(sql)

SELECT `surveycategory` AS `diagnosis`,
       `probability`,
       `subjectid`
FROM
  (SELECT `probability`,
          `subjectid`,
          `surveycategory`
   FROM
     (SELECT `probability`,
             `subjectid`,
             `surveycategory`,
             ROW_NUMBER() OVER (PARTITION BY `subjectid`
                                ORDER BY `sort_key`) AS `row_number`
      FROM
        (SELECT `probability`,
                `subjectid`,
                `surveycategory`, -(`probability`) AS `sort_key`
         FROM
           (SELECT `probability` / `total` AS `probability`,
                   `subjectid`,
                   `surveycategory`
            FROM
              (SELECT `subjectid`,
                      `surveycategory`,
                      `probability`,
                      SUM(`probability`) OVER (PARTITION BY `subjectid`) AS `total`
               FROM
                 (SELECT `subjectid`,
                         `surveycategory`,
                         EX

In [9]:
# Run the generated SQL through the handle. The values should agree with the
# ops.transform(d_local) result; column order and column-name case may differ.
res = handle.read_query(sql)

res

Unnamed: 0,diagnosis,probability,subjectid
0,withdrawal behavior,0.670622,1
1,positive re-framing,0.558974,2


In [10]:
# Close the handle now that the example is finished.
handle.close()
