# Downloading data from Quesmed

Connect directly to the Quesmed PostgreSQL database and download the latest question comments.

In [1]:
import psycopg2
from dotenv import load_dotenv
import os

# Load variables from a .env file (via python-dotenv) so os.getenv can see them.
load_dotenv()

# Each psycopg2 connection argument paired with the environment variable
# that supplies its value.
connection_settings = {
  argument: os.getenv(variable)
  for argument, variable in (
    ("dbname", "POSTGRES_DB"),
    ("user", "POSTGRES_USER"),
    ("password", "POSTGRES_PASSWORD"),
    ("port", "POSTGRES_PORT"),
    ("host", "POSTGRES_HOST"),
  )
}

# Shared connection used by the query cells below.
conn = psycopg2.connect(**connection_settings)

In [2]:
# Output column names, listed in the same order as the columns selected by the
# query in load_df(), which copies every fetched row into them by position.
keys = (
  'id', 'createdAt', 'userId',
  'userCreatedAt', 'classYear', 'universityId',
  'country', 'universityName',
  'parentId', 'questionId', 'comment', 'review',
)

# One independent, initially empty list per column.
data_map = dict((column, []) for column in keys)
data_map

{'id': [],
 'createdAt': [],
 'userId': [],
 'userCreatedAt': [],
 'classYear': [],
 'universityId': [],
 'country': [],
 'universityName': [],
 'parentId': [],
 'questionId': [],
 'comment': [],
 'review': []}

In [3]:
import pandas as pd

# Extra columns that load_df() adds to the comments frame, each filled with None.
update_keys = (
  'chapter_explanation_update', 'question_update',
  'question_explanation_update', 'qc_explanation_update',
)

# HDF5 file that load_df() reads when it exists and writes to otherwise.
file_path = "data/1comments.h5"

def load_df():
  """Return the question-comments DataFrame, using the HDF5 cache when present.

  If ``file_path`` already exists it is read back with ``pd.read_hdf`` and
  returned as-is, without touching the database. Otherwise every row of
  ``question_comments`` (inner-joined to its user and that user's university,
  ordered by comment creation time) is fetched over ``conn``, the
  ``update_keys`` columns are added filled with ``None``, and the frame is
  written to ``file_path`` before being returned.

  Side effect: on a database load the module-level ``data_map`` lists are
  emptied and then refilled with the fetched rows.

  Returns:
    pandas.DataFrame. On a database load it has one column per entry in
    ``keys`` followed by one per entry in ``update_keys``.
  """
  if os.path.isfile(file_path):
    return pd.read_hdf(file_path, key='df')

  # Start from empty accumulators: without this, calling load_df() again
  # (e.g. after a failed write or a deleted cache file) would append the
  # same rows a second time.
  for k in keys:
    data_map[k].clear()

  query = """
  SELECT
    com.id,
    com."createdAt",
    com."userId",
    u."createdAt" "userCreatedAt",
    u."classYear",
    u."universityId",
    uni.country,
    uni.name "universityName",
    com."parentId",
    com."questionId",
    com.comment,
    com.review
  FROM
    question_comments com
    INNER JOIN users u ON com."userId" = u.id
    INNER JOIN universities uni ON u."universityId" = uni.id
  ORDER BY
    com."createdAt" ASC
  """

  batch_size = 1000
  cur = conn.cursor()
  try:
    cur.execute(query)
    while True:
      batch = cur.fetchmany(batch_size)
      # The SELECT list is in the same order as `keys`, so rows are
      # unpacked into the column lists by position.
      for row in batch:
        for i, k in enumerate(keys):
          data_map[k].append(row[i])
      # A short (or empty) batch means the result set is exhausted.
      if len(batch) < batch_size:
        break
  finally:
    # Release the cursor even when the query or a fetch raises.
    cur.close()

  df = pd.DataFrame.from_dict(data_map)
  for k in update_keys:
    df[k] = None

  # Make sure the cache directory exists so the download is not lost to a
  # failing write.
  cache_dir = os.path.dirname(file_path)
  if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
  df.to_hdf(file_path, key='df', mode='w')
  return df

# Load the comments (read from the local cache file when it exists).
df = load_df()
print(df.shape)
# Fixed seed so the previewed rows are the same on every Restart & Run All.
df.sample(3, random_state=0)

(24912, 16)


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block4_values] [items->Index(['classYear', 'country', 'universityName', 'comment', 'review',
       'chapter_explanation_update', 'question_update',
       'question_explanation_update', 'qc_explanation_update'],
      dtype='object')]

  df.to_hdf(file_path, key='df', mode='w')


Unnamed: 0,id,createdAt,userId,userCreatedAt,classYear,universityId,country,universityName,parentId,questionId,comment,review,chapter_explanation_update,question_update,question_explanation_update,qc_explanation_update
5114,7534,2022-02-23 13:30:09.199000+00:00,13653,2021-10-28 17:22:07.001000+00:00,Year 3,2635,United Kingdom,University of Dundee,,5169,dentist,False,,,,
9881,13668,2022-10-09 21:31:59.674702+00:00,13308,2021-10-19 18:45:40.713000+00:00,Year 4,2641,United Kingdom,University of Exeter,,4903,Would IUS also be considered?,False,,,,
9078,12686,2022-06-29 21:24:40.085938+00:00,22627,2022-05-26 12:17:50.980110+00:00,Year 4,2620,United Kingdom,University College London (UCL),,5177,"surely with difficulty swallowing, taking an o...",False,,,,
