In [1]:
import io
import os

import pandas
import psycopg2

# Show every column on a single line so the printed frame is neither wrapped
# nor truncated.
pandas.set_option('display.max_columns', None)
pandas.set_option('display.expand_frame_repr', False)
# None means "no column-width limit". The older -1 sentinel was deprecated in
# pandas 1.0 and is rejected by current releases.
pandas.set_option('display.max_colwidth', None)

# Example data: two survey rows per subject, plus two columns that the
# later query does not use.
d = pandas.DataFrame({
    'subjectID': [1, 1, 2, 2],
    'surveyCategory': [
        "withdrawal behavior",
        "positive re-framing",
        "withdrawal behavior",
        "positive re-framing",
    ],
    'assessmentTotal': [5, 2, 3, 4],
    'irrelevantCol1': ['irrel1'] * 4,
    'irrelevantCol2': ['irrel2'] * 4,
})

print(d)




   subjectID       surveyCategory  assessmentTotal irrelevantCol1 irrelevantCol2
0  1          withdrawal behavior  5                irrel1         irrel2       
1  1          positive re-framing  2                irrel1         irrel2       
2  2          withdrawal behavior  3                irrel1         irrel2       
3  2          positive re-framing  4                irrel1         irrel2       


In [2]:
# Connection settings default to the original local development database but
# can be overridden with the standard libpq environment variables, so the
# notebook runs on other machines without editing this cell.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "johnmount"),
    user=os.environ.get("PGUSER", "johnmount"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD", ""),
)
# Each statement is committed as soon as it is executed.
conn.autocommit = True

# Notebook-level cursor shared by later cells.
cur = conn.cursor()

In [3]:
def is_numeric(col):
    """
    Report whether `col` can be added to a float.

    :param col: value to probe (a scalar or a pandas column).
    :return: True if `0.0 + col` evaluates without raising, False otherwise.
    """
    try:
        # Only the success or failure of the addition matters; the sum is discarded.
        0.0 + col
    except Exception:
        return False
    else:
        return True


def insert_table(conn, d, table_name):
    """
    Replace database table `table_name` with the contents of DataFrame `d`.

    Any existing table of that name is dropped. Columns for which
    `is_numeric` is true are created as "double precision", all others as
    VARCHAR. Column names are lower-cased, and the same lower-cased names are
    used for both the CREATE TABLE statement and the bulk load so the two
    always agree. Missing values are loaded as SQL NULL.

    The table and column names are interpolated into SQL unquoted, so they
    must be trusted, simple identifiers.

    :param conn: open psycopg2 connection.
    :param d: pandas.DataFrame to upload.
    :param table_name: name of the table to (re)create and fill.
    :return: None
    """
    # Lower-case once; reused for the DDL and for COPY.
    col_names = [c.lower() for c in d.columns]
    col_defs = [
        name + " " + ("double precision" if is_numeric(d[col]) else "VARCHAR")
        for name, col in zip(col_names, d.columns)
    ]
    create_stmt = "CREATE TABLE " + table_name + " ( " + ', '.join(col_defs) + " )"
    # Tab-separated text with \N for missing values is the format COPY reads
    # by default; an empty field would be rejected for a numeric column.
    buf = io.StringIO(
        d.to_csv(index=False, header=False, sep='\t', na_rep='\\N')
    )
    # The context manager closes the cursor even if a statement fails.
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS " + table_name)
        conn.commit()
        cur.execute(create_stmt)
        conn.commit()
        cur.copy_from(buf, table_name, sep='\t', null='\\N', columns=col_names)
        conn.commit()


def read_query(conn, q):
    """
    Run query `q` on `conn` and return the full result as a DataFrame.

    A fresh cursor is opened on the supplied connection and closed afterwards,
    so the function does not depend on any notebook-level cursor.

    :param conn: open database connection providing `cursor()`.
    :param q: SQL text of a query that returns rows.
    :return: pandas.DataFrame whose columns are named after the result columns.
    """
    with conn.cursor() as cur:
        cur.execute(q)
        rows = cur.fetchall()
        # The first entry of each description item is the column name.
        colnames = [desc[0] for desc in cur.description]
    return pandas.DataFrame(columns=colnames, data=rows)


def read_table(conn, table_name):
    """
    Fetch every row and column of `table_name` through `read_query`.

    :param conn: database connection handed on to `read_query`.
    :param table_name: name of the table to select from.
    :return: whatever `read_query` returns for the SELECT statement.
    """
    query = "SELECT * FROM " + table_name
    return read_query(conn=conn, q=query)
    

# Upload the example frame to the database as table 'd'.
insert_table(conn=conn, d=d, table_name='d')

# Read the table back; as the cell's last expression the result is displayed.
read_table(conn=conn, table_name='d')

Unnamed: 0,subjectid,surveycategory,assessmenttotal,irrelevantcol1,irrelevantcol2
0,1.0,withdrawal behavior,5.0,irrel1,irrel2
1,1.0,positive re-framing,2.0,irrel1,irrel2
2,2.0,withdrawal behavior,3.0,irrel1,irrel2
3,2.0,positive re-framing,4.0,irrel1,irrel2


Note: column-name case is a bit of a nightmare. PostgreSQL folds unquoted identifiers to lower case, so it is best to lower-case column names everywhere.

In [4]:
from data_algebra.data_ops import *
import data_algebra.env
import data_algebra.yaml
import data_algebra.PostgreSQL




# Bind the placeholder names so they don't look unbound.
_, _1, _2, _get = None, None, None, (lambda x: x)
data_algebra.yaml.fix_ordered_dict_yaml_rep()
# Hand the current namespace to data_algebra (presumably so names used inside
# expression strings, such as `scale`, can be resolved -- TODO confirm).
data_algebra.env.push_onto_namespace_stack(locals())

# SQL dialect model used later to render the pipeline as PostgreSQL.
db_model = data_algebra.PostgreSQL.PostgreSQLModel()

scale = 0.237

# Operator pipeline over table 'd': compute a per-row probability, normalise
# it within each subject, rank rows within each subject, keep the top-ranked
# row, and keep only the three reporting columns.
ops = (
    TableDescription(
        'd',
        [
            'subjectID',
            'surveyCategory',
            'assessmentTotal',
            'irrelevantCol1',
            'irrelevantCol2',
        ],
    )
    .extend({'probability': '(assessmentTotal * scale).exp()'})
    .extend(
        {'probability': 'probability/probability.sum()'},
        partition_by='subjectID',
    )
    .extend(
        {'row_number': '_row_number()'},
        partition_by=['subjectID'],
        order_by=['probability', 'surveyCategory'],
        reverse=['probability'],
    )
    .select_rows('row_number==1')
    .select_columns(['subjectID', 'surveyCategory', 'probability'])
)

print(ops.to_python(pretty=True))


TableDescription(
    table_name="d",
    column_names=[
        "subjectID",
        "surveyCategory",
        "assessmentTotal",
        "irrelevantCol1",
        "irrelevantCol2",
    ],
).extend({"probability": "(assessmentTotal * 0.237).exp()"}).extend(
    {"probability": "probability / probability.sum()"}, partition_by=["subjectID"]
).extend(
    {"row_number": "_row_number()"},
    partition_by=["subjectID"],
    order_by=["probability", "surveyCategory"],
    reverse=["probability"],
).select_rows(
    "row_number == 1"
).select_columns(
    ["subjectID", "surveyCategory", "probability"]
)



In [5]:
# Render the operator pipeline as SQL text using the PostgreSQL model.
sql = ops.to_sql(db_model, pretty=True)
# print() shows the multi-line query as formatted text.
print(sql)

SELECT "surveycategory",
       "subjectid",
       "probability"
FROM
  (SELECT "surveycategory",
          "subjectid",
          "probability"
   FROM
     (SELECT "surveycategory",
             "subjectid",
             "probability",
             ROW_NUMBER() OVER (PARTITION BY "subjectid"
                                ORDER BY "probability" DESC, "surveycategory") AS "row_number"
      FROM
        (SELECT "surveycategory",
                "subjectid",
                "probability" / SUM("probability") OVER (PARTITION BY "subjectid") AS "probability"
         FROM
           (SELECT "surveycategory",
                   "subjectid",
                   EXP(("assessmenttotal" * 0.237)) AS "probability"
            FROM
              (SELECT "assessmenttotal",
                      "surveycategory",
                      "subjectid"
               FROM "d") "sq_0") "sq_1") "sq_2") "sq_3"
   WHERE "row_number" = 1 ) "sq_4"


In [6]:


# Run the generated SQL in the database; the resulting DataFrame is the cell's
# last expression and is therefore displayed.
read_query(conn, sql)

Unnamed: 0,surveycategory,subjectid,probability
0,withdrawal behavior,1.0,0.670622
1,positive re-framing,2.0,0.558974


In [7]:
# Close the database connection opened earlier in the notebook.
conn.close()