## Building a Text Retrieval Database

### Step 0: Create a connection with your database

In [None]:
import getpass
# Prompt for the database password without echoing it, so the credential
# is typed at run time instead of being hard-coded in the notebook.
mypasswd = getpass.getpass()

In [None]:
# Connection settings -- replace the placeholders with your own values.
mysso = '<your pawprint>'  # login name, passed as `user` when connecting
dbname = 'dsa_student'  # database to connect to
schema = '<your pawprint>'  # schema interpolated into every SQL statement below

In [None]:
import psycopg2
import numpy as np
from psycopg2.extensions import adapt, register_adapter, AsIs

# Open the database connection that the rest of the notebook reuses.
connection = psycopg2.connect(
    database=dbname,
    user=mysso,
    host='pgsql.dsa.lan',
    password=mypasswd,
)

### Step 1: Create a data repository (i.e., a table) within the database.

```SQL
-------------------------
-- Basic Table 
-------------------------
CREATE TABLE sebcq5.BookLines(
        id SERIAL NOT NULL,
        name varchar(250) NOT NULL,
        line_no INT NOT NULL,
        line text NOT NULL
);

ALTER TABLE sebcq5.BookLines
ADD CONSTRAINT pk_BookLines PRIMARY KEY (id);
```

In [None]:
# DDL for the BookLines table, rendered with the schema chosen above.
# NOTE: the leading DROP TABLE makes this cell re-runnable, but it also
# discards any rows already stored in the table.
CREATE_TABLES = f"""
-------------------------
-- Basic Table 
-------------------------
DROP TABLE IF EXISTS {schema}.BookLines;
CREATE TABLE {schema}.BookLines(
        id SERIAL NOT NULL,
        name varchar(250) NOT NULL,
        line_no INT NOT NULL,
        line text NOT NULL
);

ALTER TABLE {schema}.BookLines
ADD CONSTRAINT pk_BookLines PRIMARY KEY (id);
"""
# Uncomment to inspect the rendered SQL before executing it.
# print(CREATE_TABLES)

In [None]:
# Run the table DDL as one transaction: psycopg2's connection context
# manager commits on success and rolls back if the statement raises.
with connection:
    with connection.cursor() as cursor:
        cursor.execute(CREATE_TABLES)

**Check whether the table was created by listing all the tables in the schema**

In [None]:
# Catalog query: list every table that information_schema reports for the schema.
SQL_QUERY = f"""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema = '{schema}';
"""

In [None]:
import pandas as pd

# Run the catalog query on the open connection and display the result.
df = pd.read_sql_query(sql=SQL_QUERY, con=connection)
df

### Step 2: Add a column that implements the vector model, then parse the data into it.

```SQL
-------------------------
-- Separate Ts_Vector column
-------------------------
-- TS_Vector for GIN INDEX
ALTER TABLE sebcq5.BookLines
  ADD COLUMN line_tsv_gin tsvector;

UPDATE sebcq5.BookLines
SET line_tsv_gin = to_tsvector('pg_catalog.english', line);
```

In [None]:
# Adds the tsvector column that will back the GIN index, then fills it for
# the rows already in the table.
# IF NOT EXISTS (PostgreSQL 9.6+) lets this cell be re-run without failing on
# "column already exists"; the UPDATE then just recomputes the same vectors.
UPDATE_TABLE = f"""
-------------------------
-- Separate Ts_Vector column
-------------------------
-- TS_Vector for GIN INDEX
ALTER TABLE {schema}.BookLines
  ADD COLUMN IF NOT EXISTS line_tsv_gin tsvector;

UPDATE {schema}.BookLines
SET line_tsv_gin = to_tsvector('pg_catalog.english', line);
"""
# Uncomment to inspect the rendered SQL before executing it.
# print(UPDATE_TABLE)

In [None]:
# Add and populate the GIN tsvector column in a single transaction
# (committed on success, rolled back on error).
with connection:
    with connection.cursor() as cursor:
        cursor.execute(UPDATE_TABLE)

**Check the table**

In [None]:
# List the columns of BookLines to confirm the tsvector column was added.
# The table name is matched in lower case because PostgreSQL folds unquoted
# identifiers to lower case when it stores them.
SQL_QUERY = f"""
SELECT table_schema, table_name, column_name, data_type
FROM information_schema.columns
WHERE table_schema = '{schema}' AND table_name = 'booklines';
"""

df = pd.read_sql_query(sql=SQL_QUERY, con=connection)
df

### Step 3: Add another column that implements the vector model, then parse the data into it.

```SQL
-- TS_Vector for GIST INDEX
ALTER TABLE sebcq5.BookLines
  ADD COLUMN line_tsv_gist tsvector;

UPDATE sebcq5.BookLines
SET line_tsv_gist = to_tsvector('pg_catalog.english', line);
```

In [None]:
# Adds a second tsvector column, this one to back the GiST index, and fills
# it for the rows already in the table.
# IF NOT EXISTS (PostgreSQL 9.6+) lets this cell be re-run without failing on
# "column already exists"; the UPDATE then just recomputes the same vectors.
UPDATE_TABLE = f"""
-- TS_Vector for GIST INDEX
ALTER TABLE {schema}.BookLines
  ADD COLUMN IF NOT EXISTS line_tsv_gist tsvector;

UPDATE {schema}.BookLines
SET line_tsv_gist = to_tsvector('pg_catalog.english', line);
"""
# Uncomment to inspect the rendered SQL before executing it.
# print(UPDATE_TABLE)

In [None]:
# Add and populate the GiST tsvector column in a single transaction
# (committed on success, rolled back on error).
with connection:
    with connection.cursor() as cursor:
        cursor.execute(UPDATE_TABLE)

**Check the table**

In [None]:
# Re-list the columns of BookLines; both tsvector columns should now appear.
SQL_QUERY = f"""
SELECT table_schema, table_name, column_name, data_type
FROM information_schema.columns
WHERE table_schema = '{schema}' AND table_name = 'booklines';
"""

df = pd.read_sql_query(sql=SQL_QUERY, con=connection)
df

### Step 4: Set up database triggers to parse all new content loaded into the vector models.

```SQL
--TRIGGER
CREATE TRIGGER tsv_gin_update 
	BEFORE INSERT OR UPDATE
	ON sebcq5.BookLines 
	FOR EACH ROW 
	EXECUTE PROCEDURE 
	tsvector_update_trigger(line_tsv_gin,'pg_catalog.english',line);

CREATE TRIGGER tsv_gist_update 
	BEFORE INSERT OR UPDATE
	ON sebcq5.BookLines 
	FOR EACH 
	ROW EXECUTE PROCEDURE
	tsvector_update_trigger(line_tsv_gist,'pg_catalog.english',line);

```

In [None]:
# Row-level triggers that recompute both tsvector columns from `line` on every
# INSERT or UPDATE, so newly loaded text is parsed automatically.
# Each trigger is dropped first (IF EXISTS) so the cell can be re-run without
# failing on "trigger already exists".
CREATE_TRIGGER = f"""
--TRIGGER
DROP TRIGGER IF EXISTS tsv_gin_update ON {schema}.BookLines;
CREATE TRIGGER tsv_gin_update
    BEFORE INSERT OR UPDATE
    ON {schema}.BookLines
    FOR EACH ROW
    EXECUTE PROCEDURE
    tsvector_update_trigger(line_tsv_gin,'pg_catalog.english',line);

DROP TRIGGER IF EXISTS tsv_gist_update ON {schema}.BookLines;
CREATE TRIGGER tsv_gist_update
    BEFORE INSERT OR UPDATE
    ON {schema}.BookLines
    FOR EACH ROW
    EXECUTE PROCEDURE
    tsvector_update_trigger(line_tsv_gist,'pg_catalog.english',line);

"""

In [None]:
# Install both tsvector-maintenance triggers in a single transaction
# (committed on success, rolled back on error).
with connection:
    with connection.cursor() as cursor:
        cursor.execute(CREATE_TRIGGER)

### Step 5: Add specialized indexes to the vector models.

```SQL
-------------------------
-- Create Indexes
-------------------------

-- Index on line (the pg_trgm extension is needed to build a GIN trigram index)
-- CREATE EXTENSION pg_trgm;  -- Done by DB Admin

CREATE INDEX BookLines_line
ON sebcq5.BookLines USING GIN(line gin_trgm_ops);

-- GIN INDEX on line_tsv_gin
CREATE INDEX BookLines_line_tsv_gin
ON sebcq5.BookLines USING GIN(line_tsv_gin);

-- GIST INDEX on line_tsv_gist
CREATE INDEX BookLines_line_tsv_gist
ON sebcq5.BookLines USING GIST(line_tsv_gist);
```

In [None]:
# Index DDL: a trigram GIN index on the raw text, a GIN index on line_tsv_gin
# and a GiST index on line_tsv_gist.
# IF NOT EXISTS (PostgreSQL 9.5+) lets this cell be re-run without failing on
# "relation already exists".
CREATE_INDEX = f"""
-------------------------
-- Create Indexes
-------------------------

-- Index on line (the pg_trgm extension is needed to build a GIN trigram index)
-- CREATE EXTENSION pg_trgm;  -- Done by DB Admin

CREATE INDEX IF NOT EXISTS BookLines_line
ON {schema}.BookLines USING GIN(line gin_trgm_ops);

-- GIN INDEX on line_tsv_gin
CREATE INDEX IF NOT EXISTS BookLines_line_tsv_gin
ON {schema}.BookLines USING GIN(line_tsv_gin);

-- GIST INDEX on line_tsv_gist
CREATE INDEX IF NOT EXISTS BookLines_line_tsv_gist
ON {schema}.BookLines USING GIST(line_tsv_gist);
"""

In [None]:
# Build all three indexes in a single transaction
# (committed on success, rolled back on error).
with connection:
    with connection.cursor() as cursor:
        cursor.execute(CREATE_INDEX)

### Grant permission to read your table
```SQL
GRANT USAGE ON SCHEMA <yourpawprint> TO dsa_ro_user;
GRANT SELECT ON <yourpawprint>.BookLines TO dsa_ro_user;
```

In [None]:
# Grant dsa_ro_user USAGE on the schema and SELECT on the table.
GRANT_ACCESS=f"""
GRANT USAGE ON SCHEMA {schema} TO dsa_ro_user;
GRANT SELECT ON {schema}.booklines TO dsa_ro_user;
"""

In [None]:
# Apply both grants in a single transaction
# (committed on success, rolled back on error).
with connection:
    with connection.cursor() as cursor:
        cursor.execute(GRANT_ACCESS)