In [1]:
from base64 import b64encode, b64decode
from db import psycopg_engine
from algorithm import extract_from_pdf, pre_process, encode
import os
import numpy as np

In [2]:
# First names of the seed workers. Entry i is paired with surnames[i]
# when the worker table is populated.
names = [
    'Nandagopal',
    'Adryja',
    'Ravi Ranjan',
    'Ashish',
    'name',
    'Divya Prakash',
    'Bijjula',
    'Karthik',
    'Prashant',
    'Vivek',
    'Avik',
    'Kumar',
]

In [3]:
# Surnames of the seed workers, aligned position-by-position with `names`.
# NOTE(review): the first entry is 'H ' with a trailing space - confirm it is intended.
surnames = [
    'H ',
    'Ghosh',
    'Kumar',
    'Thakur',
    'surname',
    'Singh',
    'Sahithi',
    'Raja',
    'Bhat',
    'Kumar',
    'Bhattacharya',
    'Rajput',
]

In [4]:
# Titles of the seed job offers. The 1-based position in this list is used
# as the offer id when the job_offer table is populated.
job_titles = [
    'Backend Developer',
    'Data Scientist',
    'Director of Engineering',
    'IT Project Manager',
    '(ITA) Data Scientist',
    '(ITA) Web Developer',
    'Lead Technical Program Manager',
    'Primary English Teacher',
    'Senior Product Manager',
    'Senior Software Developer',
    'Web Developer',
]

In [36]:
# Seed the worker table: one row per (name, surname) pair, with ids 1..N.
if len(names) != len(surnames):
    # zip() would silently drop the unmatched tail and leave workers unseeded.
    raise ValueError(
        "names and surnames must have the same length, got {} and {}".format(
            len(names), len(surnames)
        )
    )

with psycopg_engine() as conn:
    for worker_id, (name, surname) in enumerate(zip(names, surnames), start=1):
        conn.execute(
            """INSERT INTO worker VALUES (%(id)s, %(name)s, %(surname)s)""",
            params={"id": worker_id, "name": name, "surname": surname},
        )
    # Commit once, after every INSERT succeeded, so a failure part-way through
    # does not leave a half-seeded table behind (same pattern as the
    # applies_to cell, which also commits after its loop).
    conn.commit()

In [13]:
# Seed the job_offer table: one row per title, with ids 1..N.
with psycopg_engine() as conn:
    for offer_id, title in enumerate(job_titles, start=1):
        conn.execute(
            """INSERT INTO job_offer VALUES (%(id)s, %(title)s)""",
            params={"id": offer_id, "title": title},
        )
    # Commit once, after every INSERT succeeded, so a failure part-way through
    # does not leave a half-seeded table behind.
    conn.commit()

In [5]:
# Load the paths of all the files.
# os.listdir() returns entries in arbitrary order, but the cells below map the
# i-th path to row id i+1, so the listing is sorted to keep that mapping
# reproducible across machines and runs.
# NOTE(review): confirm that sorted file-name order matches the order of the
# `names` / `job_titles` lists.
cv_dir = '../docs/cvs/'
jobs_dir = '../docs/jobs/'
cvs_paths = [ os.path.join(cv_dir, cv_name) for cv_name in sorted(os.listdir(cv_dir)) ]
jobs_paths = [ os.path.join(jobs_dir, job_name) for job_name in sorted(os.listdir(jobs_dir)) ]

In [6]:
# Read every PDF and keep its content as base64-encoded bytes
def _read_file_b64(path):
    """Return the base64-encoded content of the file at `path`.

    The file is opened inside a `with` block so the handle is closed as soon
    as the bytes are read, instead of lingering until garbage collection.
    """
    with open(path, 'rb') as pdf_file:
        return b64encode(pdf_file.read())


cvs_bin = [ _read_file_b64(cv_path) for cv_path in cvs_paths ]
jobs_bin = [ _read_file_b64(job_path) for job_path in jobs_paths ]

In [39]:
# Save CV PDFs into the DB: the i-th encoded CV goes to the worker with id i+1
with psycopg_engine() as conn:
    for worker_id, cv in enumerate(cvs_bin, start=1):
        conn.execute("""UPDATE worker
                        SET curriculum = %(cv)s
                        WHERE worker_id = %(id)s""", params={"id": worker_id, "cv": cv})
    # Commit once after the loop so all curricula are stored together rather
    # than one transaction per row.
    conn.commit()

In [52]:
# Save jobs PDFs into the DB: the i-th encoded file goes to the offer with id i+1
with psycopg_engine() as conn:
    for offer_id, file in enumerate(jobs_bin, start=1):
        conn.execute("""UPDATE job_offer
                        SET file = %(file)s
                        WHERE offer_id = %(id)s""", params={"id": offer_id, "file": file})
    # Commit once after the loop so all files are stored together rather than
    # one transaction per row.
    conn.commit()

In [40]:
# Compute CV embeddings, one stage at a time over the whole collection
cvs = list(map(extract_from_pdf, cvs_paths))  # stage 1: extract content from each PDF path
cvs = list(map(pre_process, cvs))             # stage 2: pre-process each extraction
cvs = list(map(encode, cvs))                  # stage 3: encode each pre-processed item

In [41]:
# Store CV embeddings: the i-th embedding goes to the worker with id i+1
with psycopg_engine() as conn:
    for worker_id, cv in enumerate(cvs, start=1):
        # .tolist() turns the embedding into a plain Python list before binding it
        conn.execute("""UPDATE worker
                        SET embedding = %(embedding)s
                        WHERE worker_id = %(id)s""", params={"id": worker_id, "embedding": cv.tolist()})
    # Commit once after the loop so all embeddings are stored together rather
    # than one transaction per row.
    conn.commit()

In [35]:
# Compute jobs embeddings, one stage at a time over the whole collection
jobs = list(map(extract_from_pdf, jobs_paths))  # stage 1: extract content from each PDF path
jobs = list(map(pre_process, jobs))             # stage 2: pre-process each extraction
jobs = list(map(encode, jobs))                  # stage 3: encode each pre-processed item

In [38]:
# Store jobs embeddings: the i-th embedding goes to the offer with id i+1
with psycopg_engine() as conn:
    for offer_id, job in enumerate(jobs, start=1):
        # .tolist() turns the embedding into a plain Python list before binding it
        conn.execute("""UPDATE job_offer
                        SET embedding = %(embedding)s
                        WHERE offer_id = %(id)s""", params={"id": offer_id, "embedding": job.tolist()})
    # Commit once after the loop so all embeddings are stored together rather
    # than one transaction per row.
    conn.commit()

In [42]:
# Randomly populate the applies_to table
np.random.seed(1)  # fixed seed: the same applications are generated on every run
with psycopg_engine() as conn:
    for offer_id in range(1, len(jobs_paths) + 1):
        # One Bernoulli draw (P(success)=0.3) per CV; a 1 means that worker applies
        applications = np.random.binomial(n=1, p=0.3, size=len(cvs_paths))
        for worker_id, applied in enumerate(applications, start=1):
            if applied != 1:
                # Failure draw: this worker does not apply to this offer
                continue
            conn.execute(
                """INSERT INTO applies_to VALUES (%(offer_id)s, %(worker_id)s)""",
                params={"offer_id": offer_id, "worker_id": worker_id},
            )
    conn.commit()

In [49]:
# Read a PDF from the database
with psycopg_engine() as conn:
    cur = conn.execute("""SELECT curriculum from worker
                    WHERE worker_id = %(id)s""", params={"id": 9})
    row = cur.fetchone()

# DB-API cursors return None from fetchone() when the query matched no row, and
# the column itself may be NULL; fail with a clear message instead of an opaque
# TypeError from indexing or decoding None.
# NOTE(review): assumes the object returned by conn.execute follows DB-API
# fetchone() semantics - confirm against psycopg_engine.
if row is None or row[0] is None:
    raise LookupError("no curriculum stored for worker_id 9")

# The curriculum column holds base64 data (written by the CV-saving cell), so decode it
data = b64decode(row[0])

In [50]:
# Dump the decoded PDF bytes into a local file (created or overwritten)
with open('temp.pdf', 'wb') as pdf_out:
    pdf_out.write(data)