In [1]:
import io
import os

import pandas
import psycopg2
import yaml

# Display configuration: show every column, never wrap a frame across several
# blocks, and never truncate cell text.
pandas.set_option('display.max_columns', None)
pandas.set_option('display.expand_frame_repr', False)
# `None` is the supported spelling of "no width limit"; the former `-1` was
# deprecated in pandas 1.0 and is rejected with ValueError by pandas 2.x.
pandas.set_option('display.max_colwidth', None)

# Small example survey table: two subjects, two survey categories each, plus
# two columns that the later pipeline does not use.
d = pandas.DataFrame({
    'subjectID': [1, 1, 2, 2],
    'surveyCategory': [
        "withdrawal behavior",
        "positive re-framing",
        "withdrawal behavior",
        "positive re-framing",
    ],
    'assessmentTotal': [5, 2, 3, 4],
    'irrelevantCol1': ['irrel1'] * 4,
    'irrelevantCol2': ['irrel2'] * 4,
})

print(d)




   subjectID       surveyCategory  assessmentTotal irrelevantCol1 irrelevantCol2
0  1          withdrawal behavior  5                irrel1         irrel2       
1  1          positive re-framing  2                irrel1         irrel2       
2  2          withdrawal behavior  3                irrel1         irrel2       
3  2          positive re-framing  4                irrel1         irrel2       


In [2]:
# Connection settings default to the original local development database, but
# each one can be overridden through the conventional libpq environment
# variables (PGDATABASE, PGUSER, PGHOST, PGPASSWORD) so the notebook can run on
# another machine without editing this cell or committing credentials.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "johnmount"),
    user=os.environ.get("PGUSER", "johnmount"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD", ""),
)
# Autocommit so each statement takes effect immediately, without explicit
# transaction management in the cells below.
conn.autocommit = True

# Notebook-level cursor on the same connection.
cur = conn.cursor()

In [3]:
def is_numeric(col):
    """Report whether ``col`` can be added to a float.

    The check is a duck-typing probe: ``0.0 + col`` is evaluated and the
    result discarded.

    :param col: value to probe (in this notebook, a DataFrame column).
    :return: ``True`` if the addition succeeds, ``False`` if it raises any
        ``Exception``.
    """
    try:
        0.0 + col
    except Exception:
        # Any failure of the probe addition is read as "not numeric".
        return False
    else:
        return True


def insert_table(conn, d, table_name):
    """(Re)create table ``table_name`` and bulk-load the rows of ``d`` into it.

    Any existing table of that name is dropped first. Column names are
    lower-cased; a column is typed ``double precision`` when ``is_numeric``
    accepts it and ``VARCHAR`` otherwise. Rows are streamed as tab-separated
    text through the cursor's ``copy_from``.

    :param conn: open database connection whose cursors provide ``copy_from``
        (psycopg2 in this notebook).
    :param d: pandas DataFrame to upload; column names must be strings.
    :param table_name: name of the target table. It is concatenated into the
        SQL text unescaped, so it must be a trusted, plain identifier.
    :return: ``None``.
    """
    # Lower-case once and reuse for both CREATE TABLE and COPY. The table is
    # created with unquoted (hence lower-case) identifiers, and psycopg2 >= 2.9
    # quotes the names passed to copy_from, so they must match exactly.
    col_names = [name.lower() for name in d.columns]
    col_defs = [
        lowered + " " + ("double precision" if is_numeric(d[name]) else "VARCHAR")
        for lowered, name in zip(col_names, d.columns)
    ]
    create_stmt = "CREATE TABLE " + table_name + " ( " + ', '.join(col_defs) + " )"
    # NOTE(review): missing values are written as empty fields, which COPY's
    # text format will not accept for numeric columns - confirm inputs have no NaNs.
    buf = io.StringIO(d.to_csv(index=False, header=False, sep='\t'))
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS " + table_name)
        conn.commit()
        cur.execute(create_stmt)
        conn.commit()
        # Load into the requested table rather than a hard-coded name.
        cur.copy_from(buf, table_name, columns=col_names)
        conn.commit()


def read_query(conn, q):
    """Run query ``q`` on ``conn`` and return every result row as a DataFrame.

    A fresh cursor is opened on ``conn`` for the call and closed afterwards, so
    the function depends only on its arguments and not on a notebook-level
    cursor.

    :param conn: open database connection (psycopg2 in this notebook).
    :param q: SQL text of a row-returning query.
    :return: pandas DataFrame whose columns are named from the cursor's
        ``description`` and whose rows are the fetched tuples.
    """
    with conn.cursor() as cur:
        cur.execute(q)
        rows = cur.fetchall()
        # The first field of each description entry is the column name.
        colnames = [desc[0] for desc in cur.description]
    return pandas.DataFrame(columns=colnames, data=rows)


def read_table(conn, table_name):
    """Return the full contents of ``table_name`` via ``read_query``.

    :param conn: connection handed through to ``read_query``.
    :param table_name: table to select from; appended to the SQL text as-is.
    :return: whatever ``read_query`` returns for ``SELECT * FROM <table_name>``.
    """
    select_all = "SELECT * FROM " + table_name
    return read_query(conn, select_all)
    

# Upload the example frame to the database as table 'd'.
insert_table(conn, d, table_name='d')


# Read the table back; the result below shows lower-cased column names and the
# numeric columns returned as floats.
read_table(conn, table_name='d')

Unnamed: 0,subjectid,surveycategory,assessmenttotal,irrelevantcol1,irrelevantcol2
0,1.0,withdrawal behavior,5.0,irrel1,irrel2
1,1.0,positive re-framing,2.0,irrel1,irrel2
2,2.0,withdrawal behavior,3.0,irrel1,irrel2
3,2.0,positive re-framing,4.0,irrel1,irrel2


Note: column-name case is a bit of a nightmare. It is best to lower-case column names everywhere.

In [4]:
from data_algebra.data_ops import *
import data_algebra.env
import data_algebra.yaml
import data_algebra.PostgreSQL




# Bind the placeholder names so they don't look unbound.
_, _1, _2, _get = None, None, None, (lambda x: x)
# NOTE(review): presumably adjusts how ordered dicts are rendered by yaml.dump - confirm.
data_algebra.yaml.fix_ordered_dict_yaml_rep()
# Expose this namespace to data_algebra; the printed pipeline below shows
# `scale` replaced by its value 0.237.
data_algebra.env.push_onto_namespace_stack(locals())

db_model = data_algebra.PostgreSQL.PostgreSQLModel()

scale = 0.237

ops = (
    TableDescription(
        'd',
        [
            'subjectID',
            'surveyCategory',
            'assessmentTotal',
            'irrelevantCol1',
            'irrelevantCol2',
        ],
    )
    # Unnormalized score per row.
    .extend({'probability': '(assessmentTotal * scale).exp()'})
    # Normalize so the scores of each subject sum to one.
    .extend({'probability': 'probability/probability.sum()'},
            partition_by='subjectID')
    # Rank rows within a subject: highest probability first, ties by category.
    .extend({'row_number': '_row_number()'},
            partition_by=['subjectID'],
            order_by=['probability', 'surveyCategory'],
            reverse=['probability'])
    # Keep only the top-ranked row of each subject.
    .select_rows('row_number==1')
    .select_columns(['subjectID', 'surveyCategory', 'probability'])
    # Mapping is new name -> old name: surveyCategory becomes diagnosis.
    .rename_columns({'diagnosis': 'surveyCategory'})
    .order_rows(['subjectID'])
)

print(ops.to_python(pretty=True))


TableDescription(
    table_name="d",
    column_names=[
        "subjectID",
        "surveyCategory",
        "assessmentTotal",
        "irrelevantCol1",
        "irrelevantCol2",
    ],
).extend({"probability": "(assessmentTotal * 0.237).exp()"}).extend(
    {"probability": "probability / probability.sum()"}, partition_by=["subjectID"]
).extend(
    {"row_number": "_row_number()"},
    partition_by=["subjectID"],
    order_by=["probability", "surveyCategory"],
    reverse=["probability"],
).select_rows(
    "row_number == 1"
).select_columns(
    ["subjectID", "surveyCategory", "probability"]
).rename_columns(
    {"diagnosis": "surveyCategory"}
).order_rows(
    ["subjectID"]
)



In [5]:
# Translate the pipeline into SQL text using the PostgreSQL model created
# above, then print it; the generated query refers to lower-cased column names.
sql = ops.to_sql(db_model, pretty=True)
print(sql)

SELECT "probability",
       "subjectid",
       "diagnosis"
FROM
  (SELECT "probability",
          "subjectid",
          "surveycategory" AS "diagnosis"
   FROM
     (SELECT "probability",
             "subjectid",
             "surveycategory"
      FROM
        (SELECT "probability",
                "subjectid",
                "surveycategory"
         FROM
           (SELECT "probability",
                   "subjectid",
                   "surveycategory",
                   ROW_NUMBER() OVER (PARTITION BY "subjectid"
                                      ORDER BY "probability" DESC, "surveycategory") AS "row_number"
            FROM
              (SELECT "subjectid",
                      "surveycategory",
                      "probability" / SUM("probability") OVER (PARTITION BY "subjectid") AS "probability"
               FROM
                 (SELECT "subjectid",
                         "surveycategory",
                         EXP(("assessmenttotal" * 0.237)) AS "probab

In [6]:
# Execute the generated SQL on the database and display the rows as a DataFrame.
read_query(conn, sql)

Unnamed: 0,probability,subjectid,diagnosis
0,0.670622,1.0,withdrawal behavior
1,0.558974,2.0,positive re-framing


In [7]:
# Serialize the pipeline: collect its step-by-step representation, dump it as
# YAML text, save that text to pipeline_yaml.txt, and also show it here.
p = ops.collect_representation()
dmp = yaml.dump(p)
with open("pipeline_yaml.txt", "w") as yaml_file:
    # Same bytes as print(dmp, file=...): the text followed by one newline.
    yaml_file.write(dmp + "\n")

print(dmp)

- op: TableDescription
  table_name: d
  qualifiers: {}
  column_names:
  - subjectID
  - surveyCategory
  - assessmentTotal
  - irrelevantCol1
  - irrelevantCol2
  key: d
- op: Extend
  ops:
    probability: (assessmentTotal * 0.237).exp()
  partition_by: []
  order_by: []
  reverse: []
- op: Extend
  ops:
    probability: probability / probability.sum()
  partition_by:
  - subjectID
  order_by: []
  reverse: []
- op: Extend
  ops:
    row_number: _row_number()
  partition_by:
  - subjectID
  order_by:
  - probability
  - surveyCategory
  reverse:
  - probability
- op: SelectRows
  expr: row_number == 1
- op: SelectColumns
  columns:
  - subjectID
  - surveyCategory
  - probability
- op: Rename
  column_remapping:
    diagnosis: surveyCategory
- op: Order
  order_columns:
  - subjectID
  reverse: []
  limit: null



In [8]:
# Round trip: parse the YAML text back into plain Python data, rebuild a
# pipeline from it, and print the result for comparison with the original.
loaded_rep = yaml.safe_load(dmp)
ops_back = data_algebra.yaml.to_pipeline(loaded_rep)
print(ops_back)

TableDescription(table_name='d', column_names=['subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1', 'irrelevantCol2']) .\
   extend({'probability': '(assessmentTotal * 0.237).exp()'}) .\
   extend({'probability': 'probability / probability.sum()'}, partition_by=['subjectID']) .\
   extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['probability', 'surveyCategory'], reverse=['probability']) .\
   select_rows('row_number == 1') .\
   select_columns(['subjectID', 'surveyCategory', 'probability']) .\
   rename_columns({'diagnosis': 'surveyCategory'}) .\
   order_rows(['subjectID'])


In [9]:
# Done with the database: close the connection. The cell after this one
# evaluates its pipeline with pandas only.
conn.close()

In [10]:
# Variant of the pipeline that keeps the per-subject total as its own column
# and is evaluated directly on the in-memory frame instead of in the database.
ops = (
    TableDescription(
        'd',
        [
            'subjectID',
            'surveyCategory',
            'assessmentTotal',
            'irrelevantCol1',
            'irrelevantCol2',
        ],
    )
    # Unnormalized score per row.
    .extend({'probability': '(assessmentTotal * scale).exp()'})
    # Sum of the scores within each subject.
    .extend({'total': 'probability.sum()'},
            partition_by='subjectID')
    # Normalize by the per-subject total.
    .extend({'probability': 'probability/total'})
    # Rank rows within a subject: highest probability first, ties by category.
    .extend({'row_number': '_row_number()'},
            partition_by=['subjectID'],
            order_by=['probability', 'surveyCategory'],
            reverse=['probability'])
)

print(ops)
# Last expression of the cell: the evaluated frame is displayed as its output.
ops.eval_pandas({'d': d})

TableDescription(table_name='d', column_names=['subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1', 'irrelevantCol2']) .\
   extend({'probability': '(assessmentTotal * 0.237).exp()'}) .\
   extend({'total': 'probability.sum()'}, partition_by=['subjectID']) .\
   extend({'probability': 'probability / total'}) .\
   extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['probability', 'surveyCategory'], reverse=['probability'])


Unnamed: 0,subjectID,surveyCategory,assessmentTotal,irrelevantCol1,irrelevantCol2,probability,total,row_number
0,1,withdrawal behavior,5,irrel1,irrel2,0.670622,4.877094,1
1,1,positive re-framing,2,irrel1,irrel2,0.329378,4.877094,2
2,2,withdrawal behavior,3,irrel1,irrel2,0.441026,4.61657,2
3,2,positive re-framing,4,irrel1,irrel2,0.558974,4.61657,1
