# RAG Text

In [2]:
import glob, sys, os
from elasticsearch import Elasticsearch
from requests.auth import HTTPBasicAuth
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.cross_encoder import CrossEncoder
import ast

In [3]:
# for PDF Download 
import tempfile
from langchain.document_loaders import PyPDFLoader

In [4]:
import requests
import pandas as pd
import itertools
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams

In [5]:
# Load settings from a local .env file into the process environment.
load_dotenv()
# os.environ[...] raises KeyError for a missing variable, so an incomplete
# configuration fails in this cell instead of further down the notebook.
project_id = os.environ["PROJECT_ID"]
ibm_cloud_url = os.environ["IBM_CLOUD_URL"]
api_key = os.environ["API_KEY"]
watsonx_discovery_username=os.environ["WATSONX_DISCOVERY_USERNAME"]
watsonx_discovery_password=os.environ["WATSONX_DISCOVERY_PASSWORD"]
watsonx_discovery_url=os.environ["WATSONX_DISCOVERY_URL"]
watsonx_discovery_port=os.environ["WATSONX_DISCOVERY_PORT"]
# "<url>:<port>" string handed to the Elasticsearch client.
watsonx_discovery_endpoint = watsonx_discovery_url+':'+watsonx_discovery_port

In [5]:
def read_pdf(filepath):
    """Load the PDF at ``filepath`` and return its text as a single string.

    The pages are read with ``PyPDFLoader`` and then flattened by
    ``format_pdf_reader``.
    """
    pages = PyPDFLoader(filepath).load()
    return format_pdf_reader(pages)

def split_text_with_overlap(text, chunk_size, overlap_size):
    """Split ``text`` into windows of ``chunk_size`` that overlap by ``overlap_size``.

    Consecutive windows start ``chunk_size - overlap_size`` positions apart, so
    the last windows may be shorter than ``chunk_size``.

    Args:
        text: Sequence to slice (a string in this notebook).
        chunk_size: Length of each window.
        overlap_size: Number of positions shared by consecutive windows.

    Returns:
        List of slices of ``text``; empty when ``text`` is empty.

    Raises:
        ValueError: If ``overlap_size >= chunk_size``. The start index would
            then never advance and the loop would not terminate.
    """
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError(
            "overlap_size must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, overlap_size={overlap_size})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def import_text_splitter(chunk_size, chunk_overlap):
    """Create a ``RecursiveCharacterTextSplitter`` measured in characters.

    Args:
        chunk_size: Maximum chunk length, counted with ``len``.
        chunk_overlap: Overlap between consecutive chunks, counted with ``len``.

    Returns:
        The configured ``RecursiveCharacterTextSplitter`` instance.
    """
    # The notebook's import cells never bring this class into scope, so the
    # function raised NameError when called; import it where it is used.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

def format_pdf_reader(raw_data):
    """Join the ``page_content`` of every item in ``raw_data`` into one string.

    Each item's text is followed by a newline, including the last one.
    """
    return "".join(page.page_content + "\n" for page in raw_data)

In [6]:
# Source PDF, given as a relative path.
filepath='คู่มือการให้เงินกู้ยืมแบบมีรถเป็นสินทรัพย์ค้ำประกัน.pdf'
# `docs` is the whole PDF flattened into one string.
docs = read_pdf(filepath)
# 1000-character windows; consecutive windows share 300 characters.
chunks = split_text_with_overlap(docs, 1000, 300)

In [7]:
# Show the first chunk to eyeball the extracted text.
# NOTE(review): the saved output contains mangled Thai marks (e.g. "ทีC",
# "ขั>นตอน"), so the PDF extraction looks lossy — confirm before relying on retrieval quality.
print(chunks[0])

คู่มือการให้เงินกู้ยืมแบบมีรถเป็นสินทรัพย์คํ>าประกัน บทนํา ข้อกําหนดและเงืCอนไข ข้อกําหนดของผู้กู้ยืม • อายุ 20-60 ปี • มีรายได้ประจํา • มีรถยนต์ทีCมีค่ามากกว่า 100,000 บาท • ไม่มีประวัติเครดิตทีCไม่ดี ขั>นตอนการให้เงินกู้ยืม ขั>นตอนทีC 1: การสมัคร • ผู้กู้ยืมจะต้องกรอกแบบฟอร์มการสมัครและแนบเอกสารทีCจําเป็น เช่น ใบขับขีC, ใบอนุญาตจดทะเบียนรถยนต์, และเอกสารแสดงรายได้ • ผู้ให้กู้ยืมจะต้องตรวจสอบเอกสารและประเมินสินทรัพย์ของผู้กู้ยืม ขั>นตอนทีC 2: การประเมินสินทรัพย์ • ผู้ให้กู้ยืมจะต้องประเมินมูลค่าของรถยนต์ทีCใช้เป็นสินทรัพย์คํ>าประกัน • ผู้ให้กู้ยืมจะต้องตรวจสอบสภาพรถยนต์และเอกสารทีCเกีCยวข้อง ขั>นตอนทีC 3: การอนุมัติ • ผู้ให้กู้ยืมจะต้องอนุมัติการให้เงินกู้ยืมหลังจากประเมินสินทรัพย์และตรวจสอบเอกสาร • ผู้ให้กู้ยืมจะต้องแจ้งให้ผู้กู้ยืมทราบเกีCยวกับผลการอนุมัติ ขั>นตอนทีC 4: การจ่ายเงินกู้ยืม • ผู้ให้กู้ยืมจะต้องจ่ายเงินกู้ยืมให้กับผู้กู้ยืมหลังจากได้รับการอนุมัติ • ผู้กู้ยืมจะต้องชําระคืนเงินกู้ยืมพร้อมดอกเบี>ยภายในระยะเวลาทีCกําหนด  กรณีทีCเงินทีCต้องการกู้มากกว่ามูลค่าของรถ การมีคนคํ>

In [8]:
# Identifier of the embedding model used for the RAG index.
model_id_emb="kornwtp/simcse-model-phayathaibert"

def get_model(model_name='airesearch/wangchanberta-base-att-spm-uncased', max_seq_length=768, condition=True):
    """Build a SentenceTransformer that uses the [CLS] token as the sentence vector.

    Args:
        model_name: Name of the transformer checkpoint to load.
        max_seq_length: Maximum sequence length passed to the transformer module.
        condition: Must be truthy. The CLS-pooled model is the only construction
            path this function implements.

    Returns:
        A ``SentenceTransformer`` made of the transformer followed by a CLS
        pooling layer.

    Raises:
        ValueError: If ``condition`` is falsy. The function used to fall through
            to ``return model`` with ``model`` unbound and fail with an
            UnboundLocalError.
    """
    if not condition:
        raise ValueError(
            "get_model only supports condition=True (CLS-pooled SentenceTransformer)"
        )
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    # Pool with the [CLS] token rather than averaging token embeddings.
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode='cls',
    )
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [9]:
# Use the embedding model id defined in the previous cell instead of repeating
# the literal, so the model is configured in exactly one place.
embedder_model = get_model(model_name=model_id_emb, max_seq_length=768)
# Cross-encoder loaded for reranking.
reranker_model = CrossEncoder("Pongsasit/mod-th-cross-encoder-minilm")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
# Connect to the Elasticsearch endpoint with username/password credentials.
# NOTE(review): verify_certs=False presumably disables TLS certificate
# verification — acceptable for a demo, confirm before any non-demo use.
# NOTE(review): the saved cell output shows a warning pointing at this call;
# newer elasticsearch-py releases may prefer `basic_auth` over `http_auth` —
# confirm against the installed client version.
es = Elasticsearch(
    [watsonx_discovery_endpoint],
    http_auth=(watsonx_discovery_username, watsonx_discovery_password),
    verify_certs=False
)

# Report whether the cluster answered the ping.
if es.ping():
    print("Connection to Elasticsearch successful")
else:
    print("Connection to Elasticsearch failed")

  es = Elasticsearch(


Connection to Elasticsearch successful


In [28]:
# Print the cluster metadata returned by the server.
# NOTE(review): the saved output includes the cluster host name and UUID —
# consider clearing it before sharing the notebook.
print(es.info())



{'name': 'm-1.85327704-a7e4-4102-b352-906f6cb1d39d.4f85e5cc8cc641b0a940e985e7b06ede.br37s45d0p54n73ffbr0.databases.appdomain.cloud', 'cluster_name': '85327704-a7e4-4102-b352-906f6cb1d39d', 'cluster_uuid': 'A1ded31mSKuR5ihPH2jxDw', 'version': {'number': '8.12.1', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '6185ba65d27469afabc9bc951cded6c17c21e3f3', 'build_date': '2024-02-01T13:07:13.727175297Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [29]:
# Index that the loan-policy chunks are written to by the indexing loop below.
index_name1 = 'pongsasit_cash_loan_policy'

In [31]:
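# One-time index setup, left commented out: creates the index with two text
# fields and a 768-dimension dense_vector "embedding" field (cosine similarity).
# Uncomment and run once if the index does not exist yet.
# pongsasit_cash_loan_policy_dictionary = {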
#   "mappings": {
#     "properties": {
#         "text_as_reference": {"type": "text"},
#         "page_reference": {"type": "text"},
#         "embedding": {
#             "type": "dense_vector",
#             "dims": 768,
#             "index": True,
#             "similarity": "cosine"
#         }
#     }
#   }
# }

# es.indices.create(index=index_name1, body= pongsasit_cash_loan_policy_dictionary)

In [None]:
# Dump the full extracted document text (the entire PDF as one string).
print(docs)

In [32]:
# One embedding per chunk; each vector is converted to a plain Python list.
embeds = [list(embed) for embed in embedder_model.encode(chunks)]
# The whole document is embedded as one extra entry.
# NOTE(review): the embedder was built with max_seq_length=768, so a long
# document is presumably truncated before encoding — confirm.
embeds_i = [list(embed) for embed in embedder_model.encode([docs])]

In [33]:
# Vectors to index: the per-chunk embeddings followed by the whole-document one.
embed_final = [*embeds, *embeds_i]

In [34]:
# Every indexed entry is attributed to page "1".
page_reference_list = ["1"] * len(embed_final)

In [35]:
# Texts to index, in the same order as embed_final: the chunks, then the full document.
text_as_reference_list = [*chunks, docs]

In [36]:
# Sanity check: the three parallel lists zipped together below must have equal length.
print(len(embed_final)==len(page_reference_list)==len(text_as_reference_list))

True


In [37]:
# Index one document per (text, page, embedding) triple.
for ref_text, ref_page, ref_vector in zip(text_as_reference_list, page_reference_list, embed_final):
    es.index(
        index=index_name1,
        body={
            "text_as_reference": ref_text,
            "page_reference": ref_page,
            "embedding": ref_vector,
        },
    )



# Image Model

In [2]:
from dotenv import load_dotenv
import base64
import os
import http.client
import json
import requests

# Load settings from a local .env file into the process environment.
load_dotenv()

# os.getenv(..., None) yields None for a missing variable instead of raising,
# so a missing setting only surfaces when the value is first used
# (e.g. the string concatenation with watsonx_api_key in the token request).
watsonx_api_key = os.getenv("WATSONX_APIKEY", None)
ibm_cloud_url = os.getenv("IBM_CLOUD_URL", None)
project_id = os.getenv("PROJECT_ID", None)
ibm_cloud_iam_url = os.getenv("IAM_IBM_CLOUD_URL", None)
chat_url = os.getenv("IBM_WATSONX_AI_INFERENCE_URL", None)
### Encode image as base 64

# Read the image inside a context manager so the file handle is closed
# deterministically (the bare open(...).read() left it to the garbage collector).
with open("catfish.jpeg", "rb") as image_file:
    pic = image_file.read()
# Base64-encode the bytes and decode to str so the image can be embedded in a
# JSON request body.
pic_base64 = base64.b64encode(pic)
pic_string = pic_base64.decode("utf-8")

In [3]:
# Exchange the API key for an IAM access token.
conn_ibm_cloud_iam = http.client.HTTPSConnection(ibm_cloud_iam_url)
payload = "grant_type=urn%3Aibm%3Aparams%3Aoauth%3Agrant-type%3Aapikey&apikey="+watsonx_api_key
headers = { 'Content-Type': "application/x-www-form-urlencoded" }
try:
    conn_ibm_cloud_iam.request("POST", "/identity/token", payload, headers)
    res = conn_ibm_cloud_iam.getresponse()
    data = res.read()
finally:
    # Always release the socket, including when the request itself fails.
    conn_ibm_cloud_iam.close()

# Fail with the server's message instead of a KeyError on "access_token";
# same error style as the chat request cell.
if res.status != 200:
    raise Exception("Non-200 response: " + data.decode("utf-8", errors="replace"))

decoded_json=json.loads(data.decode("utf-8"))
access_token=decoded_json["access_token"]

In [5]:
# System prompt. The shell-escape artefact '\'' (which Python renders as three
# apostrophes, "don'''t") is replaced by a plain apostrophe.
system_content = """You always answer the questions with markdown formatting using GitHub syntax. The markdown formatting you support: headings, bold, italic, links, tables, lists, code blocks, and blockquotes. You must omit that you answer the questions with markdown.\n\nAny HTML tags must be wrapped in block quotes, for example ```<html>```. You will be penalized for not rendering code in block quotes.\n\nWhen returning code blocks, specify language.\n\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. \nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
# User prompt. This is a plain string, not an f-string, so the JSON example uses
# single braces; doubled braces would be sent to the model literally.
user_message = """Please 1. Classify what object is this 2. Give a scale of 1-10 how damaged is the car 3. Do we need to change the parts or can fix. Please provide some descriptions\nAnswer in JSON with format {'object': 'object_name', 'damage_scale': num, 'description': 'some_description'}"""
# Chat request: a system message plus one user message that carries both the
# text prompt and the image as a base64 data URL.
body = {
   "messages": [
      {
         "role": "system",
         "content": system_content
      },
      {
         "role": "user",
         "content": [
            {
               "type": "text",
               "text": user_message,
            },
            {
               "type": "image_url",
               "image_url": {
                  "url": f"data:image/jpeg;base64, {pic_string}"
               }
            }
         ]
      }
   ],
   "project_id": project_id,
   "model_id": "meta-llama/llama-3-2-90b-vision-instruct",
   "decoding_method": "greedy",
   "repetition_penalty": 1.1,
   "max_tokens": 900
}

# Bearer token obtained from the IAM token request cell.
headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {access_token}"
}



response = requests.post(
    chat_url,
    headers=headers,
    json=body
)

# Surface the server's error text for any non-200 status.
if response.status_code != 200:
    raise Exception("Non-200 response: " + str(response.text))

data = response.json()

# Print only the text of the first returned choice.
print('---- RESPONSE ----')
print(data['choices'][0]['message']['content'])


---- RESPONSE ----
I cannot provide a description for an object that is not a car. The image shows a fish, not a car. Therefore, I cannot provide a damage scale or suggest whether to change or fix the parts because there are no car parts in the image.


# AutoAI Part

In [1]:
# Scoring payload template: the model's input column names and no rows yet.
payload = {
    "input_data": [
        {
            "fields": [
                "Make", "Model", "Year",
                "Engine Fuel Type", "Engine HP", "Engine Cylinders",
                "Transmission Type", "Driven_Wheels", "Number of Doors",
                "Vehicle Size", "Vehicle Style",
                "highway MPG", "city mpg",
                "Popularity", "Years Of Manufacture",
            ],
            "values": [],
        }
    ]
}

In [7]:
import requests

# The API key comes from the environment via `api_key` (loaded in the RAG
# configuration cell); see
# https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/ml-authentication.html
API_KEY = api_key
token_response = requests.post(
    'https://iam.cloud.ibm.com/identity/token',
    data={"apikey": API_KEY, "grant_type": 'urn:ibm:params:oauth:grant-type:apikey'},
)
# Fail with the server's message instead of a KeyError on "access_token".
if token_response.status_code != 200:
    raise Exception("Non-200 response: " + str(token_response.text))
mltoken = token_response.json()["access_token"]

header = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + mltoken}

# One sample row to score; the values follow the order of "fields".
payload_scoring = {"input_data": [{"fields": [
                                "Make",
                                "Model",
                                "Year",
                                "Engine Fuel Type",
                                "Engine HP",
                                "Engine Cylinders",
                                "Transmission Type",
                                "Driven_Wheels",
                                "Number of Doors",
                                "Vehicle Size",
                                "Vehicle Style",
                                "highway MPG",
                                "city mpg",
                                "Popularity",
                                "Years Of Manufacture"
                        ], "values": [[
                                "BMW",
                                "1 Series M",
                                "2011",
                                "premium unleaded (required)",
                                "335.0",
                                "6.0",
                                "MANUAL",
                                "rear wheel drive",
                                "2.0",
                                "Compact",
                                "Coupe",
                                "26",
                                "19",
                                "3916",
                                "10"
                        ] ]}]}

# Send the request with the prepared `header` dict (it was built but unused).
response_scoring = requests.post(
    'https://us-south.ml.cloud.ibm.com/ml/v4/deployments/car_price_prediction/predictions?version=2021-05-01',
    json=payload_scoring,
    headers=header,
)
if response_scoring.status_code != 200:
    raise Exception("Non-200 response: " + str(response_scoring.text))
print("Scoring response")
print(response_scoring.json())

Scoring response
{'predictions': [{'fields': ['prediction'], 'values': [[75800.0]]}]}


# Develop functions

In [None]:
def final_scoring_function(price_predict, front_result, back_result, left_result, right_result, multiplier=35):
    """Combine a predicted price with four per-side results into one amount.

    The price is split into four equal parts, each part is scaled by the result
    for one side of the car, the four products are summed and the sum is
    multiplied by ``multiplier``.

    Args:
        price_predict: Predicted price.
        front_result: Factor applied to the front quarter.
        back_result: Factor applied to the back quarter.
        left_result: Factor applied to the left quarter.
        right_result: Factor applied to the right quarter.
        multiplier: Final scaling factor. The default 35 is the value that was
            previously hard-coded; its meaning is not documented in the
            notebook — TODO confirm (currency conversion?).

    Returns:
        ``(price_predict / 4) * (sum of the four results) * multiplier``,
        evaluated side by side in the order front, back, left, right.
    """
    divided_price = price_predict / 4
    sum_price = (
        divided_price * front_result
        + divided_price * back_result
        + divided_price * left_result
        + divided_price * right_result
    )
    return sum_price * multiplier

In [8]:
def image_scoring_prompt(side, pic_string, chat_url, project_id, access_token):
    """Ask the vision model to score the condition of one side of a car.

    Args:
        side: Name of the car side shown in the picture; inserted into the prompt.
        pic_string: Base64-encoded JPEG, embedded in the request as a data URL.
        chat_url: Chat inference endpoint the request is posted to.
        project_id: Project id sent in the request body.
        access_token: Bearer token for the ``Authorization`` header.

    Returns:
        Tuple ``(data, content)``: the decoded JSON response and the message
        content of its first choice.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
    """
    # The shell-escape artefact '\'' (three apostrophes once parsed by Python,
    # i.e. "don'''t") is replaced by a plain apostrophe.
    system_content = """You always answer the questions with json formatting using with 2 keys, score and reason. \n\nAny JSON tags must be wrapped in block quotes, for example ```{'score': '99', 'reason': 'all good'}```. You will be penalized for not rendering code in block quotes.\n\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. \nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
    # NOTE(review): the prompt asks for a score of "1-100" while stating that 0
    # is perfect; left unchanged — confirm the intended scale.
    user_message = f"""The side of the car is {side} side, Please 1. Classify what object is this 2. Give a score of 1-100 for the condition of the part of car when 0 is perfect 3. Do we need to change the parts, can fix, or it is all good. Please provide some descriptions\nAnswer in JSON with format {{'score': float, 'reason': str}}"""

    # One user turn carrying both the text prompt and the image.
    user_content = [
        {"type": "text", "text": user_message},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64, {pic_string}"},
        },
    ]
    body = {
        "messages": [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
        ],
        "project_id": project_id,
        "model_id": "meta-llama/llama-3-2-90b-vision-instruct",
        "decoding_method": "greedy",
        "repetition_penalty": 1.1,
        "max_tokens": 900,
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}",
    }

    response = requests.post(chat_url, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))

    data = response.json()
    return data, data['choices'][0]['message']['content']

In [9]:
def auto_ai_price_prediction(api_key, make, model, year, engine_fuel_type, engine_hp, engine_cylinder,
                            transmission_type, driven_wheels, number_of_doors, vehicle_size,
                            vehicle_style, highway_mpg, city_mpg, popularity, age):
    """Score one car record against the deployed price-prediction model.

    An IAM token is requested with ``api_key``, then a single row is posted to
    the ``car_price_prediction`` deployment. Every feature is sent as ``str``.

    Args:
        api_key: IBM Cloud API key used to obtain the bearer token.
        make, model, year, engine_fuel_type, engine_hp, engine_cylinder,
        transmission_type, driven_wheels, number_of_doors, vehicle_size,
        vehicle_style, highway_mpg, city_mpg, popularity: Feature values, in
            the order of the deployment's input fields.
        age: Value sent for the "Years Of Manufacture" field.

    Returns:
        Tuple ``(data, value)``: the decoded JSON response and the first value
        of the first prediction.

    Raises:
        Exception: If the token request or the scoring request answers with a
            status other than 200 (previously this surfaced as a KeyError on
            the missing response key).
    """
    token_response = requests.post(
        'https://iam.cloud.ibm.com/identity/token',
        data={"apikey": api_key, "grant_type": 'urn:ibm:params:oauth:grant-type:apikey'},
    )
    if token_response.status_code != 200:
        raise Exception("Non-200 response: " + str(token_response.text))
    mltoken = token_response.json()["access_token"]
    header = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + mltoken}

    fields = [
        "Make",
        "Model",
        "Year",
        "Engine Fuel Type",
        "Engine HP",
        "Engine Cylinders",
        "Transmission Type",
        "Driven_Wheels",
        "Number of Doors",
        "Vehicle Size",
        "Vehicle Style",
        "highway MPG",
        "city mpg",
        "Popularity",
        "Years Of Manufacture",
    ]
    # Same order as `fields`; every feature is stringified before sending.
    row = [
        str(feature)
        for feature in (
            make, model, year, engine_fuel_type, engine_hp, engine_cylinder,
            transmission_type, driven_wheels, number_of_doors, vehicle_size,
            vehicle_style, highway_mpg, city_mpg, popularity, age,
        )
    ]
    payload_scoring = {"input_data": [{"fields": fields, "values": [row]}]}

    # Send the prepared `header` dict (it was built but not used before).
    response_scoring = requests.post(
        'https://us-south.ml.cloud.ibm.com/ml/v4/deployments/car_price_prediction/predictions?version=2021-05-01',
        json=payload_scoring,
        headers=header,
    )
    if response_scoring.status_code != 200:
        raise Exception("Non-200 response: " + str(response_scoring.text))
    data = response_scoring.json()
    value = data['predictions'][0]['values'][0][0]
    return data, value
    

In [11]:
# -------- Build the final prompt string that is sent to the model
def generate_prompt_rag_th(question, context):
    """Build the Thai RAG prompt in Llama 3 chat format.

    The system turn holds the instructions, the retrieved loan policy
    (``context``) and the question; it is followed by a fixed greeting
    exchange, the user's ``question`` again, and an open assistant header.

    Args:
        question: User question; inserted in the system turn and the last user turn.
        context: Policy text the model must answer from.

    Returns:
        The complete prompt string.
    """
    # The opening header used to be wrapped in Markdown artefacts ("**`...**...`"),
    # which corrupted the special tokens; it is now the plain token sequence.
    # NOTE(review): the Thai text spells "ข้อมูล" as "ฃ้อมูล" in two places;
    # left unchanged here — confirm with a Thai speaker.
    output = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

คุณเป็นผู้ช่วยที่ใจดี โปรดตอบคำถามอย่างใจดีและมีประโยชน์ที่สุดเสมอ พร้อมกับรักษาความปลอดภัย คำตอบของคุณไม่ควรมีเนื้อหาที่เป็นอันตราย ไม่ธรรมดา แบ่งแยกทางเชื้อชาติ ลำเอียงทางเพศ มีพิษ อันตราย หรือผิดกฎหมาย โปรดให้แน่ใจว่าคำตอบของคุณไม่มีอคติทางสังคมและเป็นบวกในธรรมชาติ ถ้าคำถามไม่มีเหตุผล หรือไม่สอดคล้องกับความเป็นจริง โปรดอธิบายเหตุผลแทนที่จะตอบคำถามที่ไม่ถูกต้อง ถ้าคุณไม่ทราบคำตอบของคำถาม โปรดอย่าแชร์ข้อมูลที่ผิด 

คุณจะได้รับนโยบายการให้กู้เงิน ที่เป็นแหล่งฃ้อมูลในการตอบคำถาม ที่ถูกถามจากผู้ใช้ จงตอบคำถามเป็นภาษาไทย

รายละเอียดนโยบายการกู้เงิน:
{context}

คำถาม: {question}
ตอบคำถามโดยใช้ฃ้อมูลจาก "รายละเอียดนโยบายการกู้เงิน" อธิบายเหตุผลของคุณ
หากคำถามไม่เกี่ยวข้องกับข้อมูลอ้างอิง โปรดตอบว่า “ฉันไม่ทราบคำตอบ, มันไม่ใช่ส่วนหนึ่งของนโยบายการกู้เงินที่ได้รับ”
<|eot_id|><|start_header_id|>user<|end_header_id|>
สวัสดี<|eot_id|><|start_header_id|>assistant<|end_header_id|>
สวัสดีครับผมคือผู้ช่วยของเงินกู้แบบมีสินทรัพย์ค้ำประกัน ครับ กรุณาพิมพ์คำถามของคุณข้างล่างได้เลยครับ<|eot_id|><|start_header_id|>user<|end_header_id|>
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """
    return output