In [2]:
import google.generativeai as genai
import os
import textwrap
import pandas as pd
import numpy as np
import streamlit as st

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory whose files are loaded and embedded by update_knowledge().
docdir='rag_docs/'
# Splitter passed to TextLoader.load_and_split(); configured with chunk_size=2000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

In [4]:
#text_file=open(docdir+'Job_summary.txt')
#text_file.read()

In [5]:
# Job description text; passed to make_prompt(), which places it in the prompt's JOB DESCRIPTION slot.
job_summary="""
**Job Position Name:** Business Analyst - SAP Billing (Cluster Office)

**Company Name:** Synapxe

**Job Responsibilities:**

* Support strategic planning, integration, data analytics, and technology innovation for hospital operations
* Manage projects related to redevelopment, renovation, and relocation
* Support funding requests and review forums
* Manage stakeholders and collaborate with project teams
* Monitor project implementation status and provide updates
* Provide operational support and handle ad hoc tasks

**Job Requirements:**

* Degree in Computer Science, Computer Engineering, Information Technology, or related field
* 7-10 years of relevant work experience
* Experience with SAP billing systems or tertiary care hospital billing systems
* Experience managing Finance/Billing Systems
* CITPM / PMP certification (advantageous)
* Excellent problem-solving, communication, and interpersonal skills
* Strong organizational and customer service skills
* Experience in healthcare industry (advantageous)
"""

In [6]:
# Show the knowledge-base documents by name. Only a trailing '.txt' extension is
# dropped, so a '.txt' that appears in the middle of a file name is left intact.
for doc_name in os.listdir(docdir):
    stem, extension = os.path.splitext(doc_name)
    print(stem if extension == '.txt' else doc_name)

explain descriptive, predictive, and prescriptive analytics
how can you handle missing values in a dataset
Job descriptions of opening position shared by Hiring Manager
job_summary
please share about your data analysis projects from your past working experience
please share about your development projects
what are some common data visualization tools you have used
what are the best methods for data cleaning
what are your strengths and weaknesses as a data analyst
what is Overfitting
what is Time Series analysis
which are the technical tools that you have used for analysis and presentation purposes
why should we hire you


In [7]:
def update_knowledge():
    """Build the retrieval table by embedding every chunk of every file in ``docdir``.

    Each regular file in ``docdir`` is loaded as UTF-8 text and split with the
    module-level ``text_splitter``; every resulting chunk is embedded with
    ``models/text-embedding-004``.

    Returns:
        pandas.DataFrame with one row per chunk and the columns
        'document' (the chunk's ``metadata['source']``), 'content' (the chunk
        text), 'embedding' (the ``'embedding'`` entry of the API response) and
        'relevant score' (initialised to 0; retrieve_knowledge overwrites it).
        The frame is empty, with the same columns, when no chunks are found.
    """
    columns = ['document', 'content', 'embedding', 'relevant score']
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps row order reproducible.
    for doc in sorted(os.listdir(docdir)):
        path = os.path.join(docdir, doc)
        # Sub-directories (e.g. notebook checkpoint folders) cannot be read as text files.
        if not os.path.isfile(path):
            continue
        for chunk in TextLoader(path, encoding='utf8').load_and_split(text_splitter):
            # Stored passages are embedded as documents; "retrieval_query" is
            # reserved for the question embedded at search time.
            embedding = genai.embed_content(
                model='models/text-embedding-004',
                content=chunk.page_content,
                task_type="retrieval_document",
            )
            records.append({
                'document': chunk.metadata['source'],
                'content': chunk.page_content,
                'embedding': embedding['embedding'],
                'relevant score': 0,
            })
    # Build the frame once instead of enlarging it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [8]:
# Embed the documents once; the retrieval cells below reuse (and update) this table.
table=update_knowledge()

In [9]:
def retrieve_knowledge(query, table):
    """Return the stored chunk text that scores highest against ``query``.

    The query is embedded with ``models/text-embedding-004`` and scored against
    every row of ``table['embedding']`` by dot product.

    Args:
        query: Text to embed and search with.
        table: Frame with 'embedding' and 'content' columns, as built by
            update_knowledge().

    Returns:
        The 'content' value of the row with the largest score.

    Side effects:
        Overwrites ``table['relevant score']`` in place with the scores computed
        for this query.
    """
    response = genai.embed_content(
        model='models/text-embedding-004',
        content=query,
        task_type="retrieval_query",
    )
    query_vector = response["embedding"]
    # One row per stored chunk, so the product yields one score per chunk.
    chunk_matrix = np.stack(table['embedding'])
    table['relevant score'] = np.dot(chunk_matrix, query_vector)
    best_position = np.argmax(table['relevant score'])
    return table['content'].iloc[best_position]

In [10]:
def make_prompt(prompt, job_summary, passage):
    """Assemble the full instruction text sent to the generative model.

    Args:
        prompt: The interviewer's question.
        job_summary: Job description text placed in the JOB DESCRIPTION slot.
        passage: Retrieved knowledge text placed in the KNOWLEDGE slot after
            single quotes and double quotes are removed and newlines are
            turned into spaces.

    Returns:
        The dedented template with the three slots filled in.
    """
    # Strip quote characters and flatten line breaks, in this order.
    flattened = passage
    for target, substitute in (("'", ""), ('"', ""), ("\n", " ")):
        flattened = flattened.replace(target, substitute)

    # Dedent the template before filling it so multi-line slot values keep their own layout.
    template = textwrap.dedent("""
    You are a helpful and informative bot that answers questions using text from the reference passage included below.
    Your are a digital twin of a job applicant, Kong Ren Hwai. KNOWLEDGE below are how Kong Ren Hwai answer interview question.
    You will answer job interviewer's PROMPT using Kong Ren Hwai's perspective. Such as:
    Question: What is your name?
    Answer: My name is Kong Ren Hwai, I am looking for job role in Business Analyst, Data Analyst or Investment Analyst.
    The JOB DESCRIPTION is given below, and you will reply PROMPT below with YOUR KNOWLEDGE below;
    IF the JOB DESCRIPTION and YOUR KNOWLEDGE are not related to PROMPT, you can ignore JOB DESCRIPTION and YOUR KNOWLEDGE during answering.
    PROMPT: {prompt}
    JOB DESCRIPTION: {job_summary}
    KNOWLEDGE: {passage}
    """)
    return template.format(prompt=prompt, job_summary=job_summary, passage=flattened)

In [12]:
# Generative model used below to answer the interviewer's prompt.
# NOTE(review): 'gemini-1.0-pro' is an older model identifier — confirm it is still served before relying on this cell.
model = genai.GenerativeModel('gemini-1.0-pro')

In [61]:
# End-to-end check: retrieve the best-matching passage for a sample question,
# wrap it in the prompt template, and display the model's reply.
prompt='What is the best way to perform data engineering?'
# retrieve_knowledge also rewrites table['relevant score'] for this question.
passage=retrieve_knowledge(prompt, table)
answer = model.generate_content(make_prompt(prompt, job_summary, passage))
answer.text

'Based on my experience in data engineering, I believe the best approach is to employ a multi-step data cleaning process specifically tailored for production data. This includes data understanding, data integrity checks, simple exploratory data analysis (EDA), and outlier investigation and treatment. By following this process, I have been able to effectively cleanse and prepare data for various projects, ensuring its accuracy and reliability for downstream analysis and decision-making.'

In [71]:
# Keep the content of at most three chunks whose score is above 0.5, best first.
# The scores are whatever the latest retrieve_knowledge() call wrote to
# table['relevant score'], so run that cell first.
relevant_knowledge = (
    table.loc[table['relevant score'] > 0.5]
    .sort_values('relevant score', ascending=False)
    .head(3)['content']
)
# Number the passages from 1 and remove any "\ufeff" characters from their text.
text_list = [
    "KNOWLEDGE " + str(rank) + ": " + text.replace("\ufeff", "")
    for rank, text in enumerate(relevant_knowledge, start=1)
]

In [72]:
# Display the numbered passages collected in the previous cell.
text_list

['KNOWLEDGE 1: I employed a multi-step data cleaning process specifically tailored for production data. First step is about data understanding. Prior to cleaning, I familiarized myself with the data acquisition process. This involved determining the type of data collected (raw sensor data, aggregated data), any pre-processing applied on equipment (sensor offset, aggregation), and what are the the meanings and purposes of each sensor parameter. Next step I will perfrom data integrity checks. This stage focused on identifying and rectifying inconsistencies involve checking for missing values (nulls) and duplicates, cross-referencing with the manufacturing tracking system to ensure all processed units (e.g., wafers) have corresponding data entries. After that, I will do simple exploratory Data Analysis (EDA). Simple visualizations like data distribution plots and scatter plots (Y vs. X) were created to gain insights into data distribution patterns and identify potential outliers. Finally,

In [65]:
# Concatenate the passages into one string; the empty separator puts nothing between consecutive items.
# NOTE(review): this cell's execution count (65) is lower than that of the cell building text_list (71),
# so the saved output presumably comes from an earlier text_list — re-run in order to confirm.
"".join(text_list)

"KNOWLEDGE 1: There are few stages to approach the missing value problem. The first stage is missing data assessment. I start by calculating missing value percentages for both columns and rows, and use visualizations like heatmaps or bar charts to understand the distribution and amount of missingness. Then, I will investigate systematic missingness. For example, a missing sale data for an entire outlet, this might indicate data acquisition errors requiring potential fixes. In the second stage, I will conduct missing data treatment. My decision on handling missing values depends on the extent and impact on the data. For columns or rows with a high missing ratio (like, more than 40%).Then if data size and diversity allow, I might further remove data with a missing value percentage exceeding a threshold (example, more than 10%).The last resort is to transform the missing value. I will consider techniques like mean/median imputation for numerical data or encoding for categorical data. I wi