In [None]:
import os
import psycopg2
import dask.dataframe as dd
import pandas as pd
from io import StringIO

%load_ext dotenv
%dotenv

# Root of the data tree; the pretrained model directory is also resolved
# relative to this path further down in the notebook.
all_data_path = './../data/'

# Dataset variant to process; it selects the sub-directory used for the CSVs.
dataset_size = 'small'

# Directory holding the CSV files of the selected dataset variant.
data_path = f'./../data/{dataset_size}/'

In [None]:
from collections.abc import Sequence


class PapersList(Sequence):
    """Read-only, lazily parsed sequence of the data rows of a CSV file.

    The file is scanned once at construction time to record the byte offset
    at which every non-blank data line starts.  Indexing seeks straight to the
    requested line and parses only that line (together with the header) with
    pandas, so the whole file never has to be loaded into memory.

    Every column is read as text (``dtype=str``) so that identifiers are not
    altered by pandas' type inference.

    Limitation: the index is built per physical line, so a quoted field that
    contains an embedded newline is not supported.

    The underlying file stays open for the lifetime of the object; call
    ``close()`` or use the instance as a context manager when finished.
    """

    def __init__(self, csv_path):
        """Open ``csv_path`` in binary mode and index its data lines.

        Raises whatever ``open`` raises for an unreadable path, and
        ``UnicodeDecodeError`` if the header line is not valid UTF-8.
        """
        super().__init__()

        # Binary mode: seek() positions must be exact byte offsets.
        self.df_csv_file = open(csv_path, 'rb')
        self.header_line = ""
        self.line_offset = []

        try:
            self.init_offsets()
        except Exception:
            # Do not leak the handle if indexing fails half-way.
            self.df_csv_file.close()
            raise

    def init_offsets(self):
        """Read the header and record the byte offset of every data line.

        Safe to call again: the file is rewound and the index rebuilt.
        """
        self.df_csv_file.seek(0)

        header_bytes = self.df_csv_file.readline()
        self.header_line = header_bytes.decode('utf-8')

        # Offsets are byte positions, so measure the raw bytes of the header;
        # the decoded string is shorter whenever it contains non-ASCII text.
        offset = len(header_bytes)
        offsets = []
        for line in self.df_csv_file:
            # Skip blank lines: they hold no row, and parsing one would raise
            # IndexError, which silently terminates Sequence iteration.
            if line.strip(b'\r\n'):
                offsets.append(offset)
            offset += len(line)

        self.line_offset = offsets

    def generate_row(self, line):
        """Parse one decoded CSV ``line`` into a ``pandas.Series``.

        The Series is indexed by the column names of the header line and all
        values are kept as strings.
        """
        # The stored header still carries its line terminator; strip it so
        # exactly one newline separates the header from the data line.
        header = self.header_line.rstrip('\r\n')
        str_buffer = StringIO(f'{header}\n{line}')

        # dtype=str keeps the values exactly as written in the file.
        mini_df = pd.read_csv(str_buffer, dtype=str)

        return mini_df.iloc[0]

    def __getitem__(self, i):
        """Return row ``i`` as a Series, or a list of rows for a slice.

        Raises ``IndexError`` when ``i`` is out of range.
        """
        if isinstance(i, slice):
            return [self[j] for j in range(*i.indices(len(self)))]

        self.df_csv_file.seek(self.line_offset[i])
        line = self.df_csv_file.readline().decode('utf-8')

        return self.generate_row(line)

    def __len__(self):
        """Number of indexed data rows (the header is not counted)."""
        return len(self.line_offset)

    def close(self):
        """Close the underlying CSV file; rows can no longer be read."""
        self.df_csv_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

# Smoke test: index the papers CSV, show its first three rows and the
# total number of rows found.
papers_list = PapersList(f'{data_path}/df_covid.csv')

for row_index in range(3):
    print(papers_list[row_index])

print(len(papers_list))

In [None]:

# Module-level handles used by the insert cells further down.  They remain
# None when the connection attempt below fails.
conn = None
cur = None

try:
    # Connect to the database and begin a transaction.
    # The credentials are passed as keyword arguments instead of being
    # interpolated into a DSN string, so values containing spaces, quotes or
    # backslashes cannot break (or alter) the connection string.
    conn = psycopg2.connect(
        dbname=os.environ['DB_DATABASE_NAME'],
        user=os.environ['DB_USER'],
        password=os.environ['DB_PASSWORD'],
    )
    cur = conn.cursor()

    # Run a trivial query to prove the connection actually works.
    cur.execute("SELECT version();")
    record = cur.fetchone()
    print("You are connected to - ", record[0], "\n")

    print(conn.get_dsn_parameters())

except (Exception) as error:
    # Also reached when one of the DB_* environment variables is missing
    # (KeyError); the error is reported and the notebook keeps running.
    print("Error while connecting to PostgreSQL", error)


In [None]:

# Upsert every paper into the `papers` table.
#
# The statement does not change between rows, so it is built once.  On a
# primary-key conflict the existing row is overwritten; EXCLUDED refers to
# the row that was proposed for insertion, so each value is sent only once.
sql = """INSERT INTO papers
        (paper_id, title, abstract, body)
    VALUES
        (%s, %s, %s, %s)
    ON CONFLICT ON CONSTRAINT papers_pkey DO UPDATE SET
        title = EXCLUDED.title,
        abstract = EXCLUDED.abstract,
        body = EXCLUDED.body
    ;"""

for paper in papers_list:
    # NOTE(review): str() turns a missing CSV value (NaN) into the literal
    # text 'nan' -- confirm that this is what should be stored.
    paper_id = str(paper['paper_id'])
    title = str(paper['title'])
    abstract = str(paper['abstract'])
    body = str(paper['body_text'])

    try:
        cur.execute(sql, [paper_id, title, abstract, body])
        conn.commit()
    except (Exception) as error:
        print(f"Error while inserting paper {paper_id} into PostgreSQL", error)
        # A failed statement leaves the transaction aborted; without a
        # rollback every following INSERT would be rejected as well.
        if conn is not None:
            conn.rollback()

In [None]:
# Index the per-paragraph CSV, then show its first three rows and its size.
paragraph_list = PapersList(f'{data_path}/covid_sentences.csv')

for row_index in range(3):
    print(paragraph_list[row_index])

print(len(paragraph_list))

In [None]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py

import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from psycopg2.extensions import register_adapter, AsIs

# Let psycopg2 accept numpy float32 values as query parameters by adapting
# them with AsIs, taken from the public psycopg2.extensions module.
psycopg2.extensions.register_adapter(np.float32, psycopg2.extensions.AsIs)

# Sentence-embedding model loaded from the local data directory.
# Alternative: SentenceTransformer('distiluse-base-multilingual-cased')
embedder = SentenceTransformer(f'{all_data_path}/models/py-pretrained/')

def get_embedding(text, device=None):
    """Encode ``text`` and return its unit-length embedding and original norm.

    Args:
        text: Input handed to ``embedder.encode``.
        device: Device name forwarded to ``embedder.encode``.  When ``None``
            (the default) ``'cuda'`` is used if torch reports an available
            GPU, otherwise ``'cpu'``.

    Returns:
        A ``(normalized_embedding, magnitude)`` tuple: the embedding divided
        by its norm, converted to a plain Python list, and the norm itself as
        returned by ``np.linalg.norm``.  If the norm is zero the embedding is
        returned unnormalized, because dividing would yield NaN values.
    """
    if device is None:
        # Prefer the GPU but keep the notebook runnable on CPU-only machines.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    embedding = embedder.encode(text, device=device)

    magnitude = np.linalg.norm(embedding)
    if magnitude == 0:
        # 0 / 0 would fill the vector with NaN; hand back the zero vector.
        return (embedding.tolist(), magnitude)

    normalized_embedding = embedding / magnitude

    return (normalized_embedding.tolist(), magnitude)


In [None]:

# Embed every paragraph and store it together with its embedding.
#
# The statement does not change between rows, so it is built once.
# NOTE(review): this is a plain INSERT without an ON CONFLICT clause, so
# re-running the cell presumably inserts every paragraph again unless the
# table enforces uniqueness -- confirm against the schema.
sql = """INSERT INTO paragraphs
        (paper_id, paragraph, embedding, embedding_magnitude)
    VALUES (%s, %s, %s, %s)
    ;"""

for paper_paragraph in paragraph_list:
    paper_id = paper_paragraph['paper_id']
    paragraph_text = paper_paragraph['paragraph']

    # Unit-length embedding (as a list) plus the norm it was divided by.
    embedding, magnitude = get_embedding(paragraph_text)

    try:
        cur.execute(sql, [paper_id, paragraph_text, embedding, magnitude])
        conn.commit()
    except (Exception) as error:
        print(f"Error while inserting paragraph of paper {paper_id} into PostgreSQL", error)
        # A failed statement leaves the transaction aborted; without a
        # rollback every following INSERT would be rejected as well.
        if conn is not None:
            conn.rollback()

In [None]:
# https://www.enterprisedb.com/postgres-tutorials/indexing-documents-full-text-search-postgresql

# CREATE FUNCTION update_tsv() RETURNS trigger
#     LANGUAGE 'plpgsql' VOLATILE NOT LEAKPROOF
# AS $BODY$
# begin
#   new.tsv :=
#     setweight(to_tsvector('pg_catalog.english',
#       coalesce(new.title, '')), 'A') ||
#     setweight(to_tsvector('pg_catalog.english',
#       coalesce(new.abstract, '')), 'B') ||
#     setweight(to_tsvector('pg_catalog.english',
#       coalesce(new.body, '')), 'D');
#  return new;
# end
# $BODY$;
# CREATE TRIGGER update_tsv
#        BEFORE INSERT OR UPDATE ON papers
#        FOR EACH ROW EXECUTE PROCEDURE update_tsv();

# ( 'pregnant'::tsquery || to_tsquery('pregnancy') && ( to_tsquery('covid') || to_tsquery('Sars-Cov-2') ) && ( to_tsquery('trials') || to_tsquery('tests') || to_tsquery('experiment') ) )

# SELECT
#     ts_rank("tsv", to_tsquery('pregnant | covid | trials')) AS "rank",
#     paper_id,
#     title
# FROM
#     papers
# WHERE
#     tsv @@ to_tsquery('pregnant | covid | trials')
# ORDER BY rank DESC LIMIT 20
