This is an [Apache `Spark`](https://spark.apache.org) variation of the [logistic scoring example](https://github.com/WinVector/data_algebra/blob/master/Examples/LogisticExample/ScoringExample.ipynb).

Not all of the Spark code path is currently implemented or tested, but this example gives the general idea.


In [1]:
import pyspark
import pyspark.sql

import pandas

from data_algebra.data_ops import *
import data_algebra.SparkSQL

In [2]:
# Example survey data, written one row per (subject, survey category) pair.
# The two "irrelevant" columns hold constant filler values.
d_local = pandas.DataFrame(
    data=[
        (1, "withdrawal behavior", 5, 'irrel1', 'irrel2'),
        (1, "positive re-framing", 2, 'irrel1', 'irrel2'),
        (2, "withdrawal behavior", 3, 'irrel1', 'irrel2'),
        (2, "positive re-framing", 4, 'irrel1', 'irrel2'),
    ],
    columns=[
        'subjectID',
        'surveyCategory',
        'assessmentTotal',
        'irrelevantCol1',
        'irrelevantCol2',
    ],
)

d_local

Unnamed: 0,subjectID,surveyCategory,assessmentTotal,irrelevantCol1,irrelevantCol2
0,1,withdrawal behavior,5,irrel1,irrel2
1,1,positive re-framing,2,irrel1,irrel2
2,2,withdrawal behavior,3,irrel1,irrel2
3,2,positive re-framing,4,irrel1,irrel2


In [3]:
# Obtain the example database handle from data_algebra's SparkSQL module.
# Later cells use this handle to insert a table, generate SQL, run queries,
# and finally close it.
handle = data_algebra.SparkSQL.example_handle()


In [4]:
# Copy the local pandas frame into the database as table 'd';
# allow_overwrite=True permits replacing an existing table of that name.
# The call's return value (a TableDescription, per the cell output) is displayed.
handle.insert_table(d_local, table_name='d', allow_overwrite=True)

TableDescription(
 table_name='d',
 column_names=[
   'subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1',  
 'irrelevantCol2'])

In [5]:
# Read the whole table back through the handle to check the round trip;
# the displayed result matches the rows of d_local shown earlier.
local_copy = handle.read_query('SELECT * FROM d')

local_copy

Unnamed: 0,subjectID,surveyCategory,assessmentTotal,irrelevantCol1,irrelevantCol2
0,1,withdrawal behavior,5,irrel1,irrel2
1,1,positive re-framing,2,irrel1,irrel2
2,2,withdrawal behavior,3,irrel1,irrel2
3,2,positive re-framing,4,irrel1,irrel2


In [6]:
# Multiplier applied to assessmentTotal before exponentiation.
scale = 0.237

# Operator pipeline over a description of table 'd' (columns taken from d_local).
ops = (
    data_algebra.data_ops.describe_table(d_local, 'd')
    # un-normalized score: exp(assessmentTotal * scale)
    .extend({'probability': f'(assessmentTotal * {scale}).exp()'})
    # per-subject sum of the un-normalized scores
    .extend({'total': 'probability.sum()'}, partition_by='subjectID')
    # normalize so each subject's probabilities sum to one
    .extend({'probability': 'probability/total'})
    # rank rows within each subject, ordered by probability with reverse set
    .extend(
        {'row_number': '_row_number()'},
        partition_by=['subjectID'],
        order_by=['probability'],
        reverse=['probability'],
    )
    # keep only the first-ranked row per subject
    .select_rows('row_number == 1')
    .select_columns(['subjectID', 'surveyCategory', 'probability'])
    # mapping is new name -> old name: surveyCategory is shown as diagnosis
    .rename_columns({'diagnosis': 'surveyCategory'})
)

# Show the pipeline as formatted Python source.
print(ops.to_python(pretty=True))

TableDescription(
    table_name="d",
    column_names=[
        "subjectID",
        "surveyCategory",
        "assessmentTotal",
        "irrelevantCol1",
        "irrelevantCol2",
    ],
).extend({"probability": "((assessmentTotal * 0.237)).exp()"}).extend(
    {"total": "probability.sum()"}, partition_by=["subjectID"]
).extend(
    {"probability": "probability / total"}
).extend(
    {"row_number": "_row_number()"},
    partition_by=["subjectID"],
    order_by=["probability"],
    reverse=["probability"],
).select_rows(
    "row_number == 1"
).select_columns(
    ["subjectID", "surveyCategory", "probability"]
).rename_columns(
    {"diagnosis": "surveyCategory"}
)



In [7]:
# Apply the operator pipeline directly to the in-memory pandas frame
# (the database handle is not referenced here); the result is displayed.
ops.transform(d_local)


Unnamed: 0,subjectID,diagnosis,probability
0,1,withdrawal behavior,0.670622
1,2,positive re-framing,0.558974


In [8]:
# Translate the same operator pipeline into SQL text via the handle.
# In the printed result, use_with=True corresponds to the WITH ... AS
# common-table-expression chain and annotate=True to the "-- extend(...)"
# comments naming each step; pretty=True presumably controls the
# multi-line formatting (confirm against the data_algebra docs).
sql = handle.to_sql(ops, pretty=True, use_with=True, annotate=True)

print(sql)

-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SparkSQLModel
--       string quote: "
--   identifier quote: `
WITH `table_reference_0` AS
  (SELECT `surveyCategory`,
          `subjectID`,
          `assessmentTotal`
   FROM `d`),
     `extend_1` AS
  (SELECT -- extend({ 'probability': '((assessmentTotal * 0.237)).exp()'})
 `surveyCategory`,
 `subjectID`,
 EXP((`assessmentTotal` * 0.237)) AS `probability`
   FROM `table_reference_0`),
     `extend_2` AS
  (SELECT -- extend({ 'total': 'probability.sum()'}, partition_by=['subjectID'])
 `surveyCategory`,
 `subjectID`,
 `probability`,
 SUM(`probability`) OVER (PARTITION BY `subjectID`) AS `total`
   FROM `extend_1`),
     `extend_3` AS
  (SELECT -- extend({ 'probability': 'probability / total'})
 `surveyCategory`,
 `subjectID`,
 `probability` / `total` AS `probability`
   FROM `extend_2`),
     `extend_4` AS
  (SELECT -- extend({ 'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['probability

In [9]:
# Execute the generated SQL through the handle and display the result.
# The values agree with the pandas result of ops.transform(d_local);
# only the column order differs in the displayed output.
res = handle.read_query(sql)

res

Unnamed: 0,diagnosis,subjectID,probability
0,withdrawal behavior,1,0.670622
1,positive re-framing,2,0.558974


In [10]:
# Close the handle now that the example is finished
# (presumably releases the underlying Spark session/connection -- confirm).
handle.close()
