In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np
import os

sys.path.append('../../system/')
# from parser import run_parser, convert_pdf_to_jpg     #for image preprocess
from get_similarity.utils.preprocess import preprocess
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_chroma import Chroma
from configs import JD_PATH, COLLECTION, DB_PATH

from insert_chunks import *
from tqdm import tqdm
from collections import Counter, defaultdict
from uuid import uuid4
from dotenv import load_dotenv


#data download
# Login using e.g. `huggingface-cli login` to access this dataset
# df = pd.read_csv("hf://datasets/AzharAli05/Resume-Screening-Dataset/dataset.csv")

# Load

In [2]:
# Load variables from a .env file, then read the Pinecone key from the
# environment (empty string when the variable is not set).
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

In [4]:
from pinecone import Pinecone, ServerlessSpec

# Client handle used by the Pinecone calls in the following cells; it is
# authenticated with the PINECONE_API_KEY value read from the environment above.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Insert data

In [3]:
index_name = "quickstart"

# create_index raises if an index with this name already exists, so only create
# it when it is missing. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,  # must equal the embedding length of the model used for this index
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Display the index description whether it was just created or already present.
pc.describe_index(index_name)

{
    "name": "quickstart",
    "metric": "cosine",
    "host": "quickstart-7ynlq0x.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1024,
    "deletion_protection": "disabled",
    "tags": null
}

In [4]:
# Toy corpus: six sentences mixing the fruit and the company senses of "Apple".
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."},
]

# Embed all texts in a single inference request, as "passage" inputs.
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[record["text"] for record in data],
    parameters=dict(input_type="passage", truncate="END"),
)
print(embeddings[0])

{'vector_type': dense, 'values': [0.04931640625, -0.01328277587890625, ..., -0.0196380615234375, -0.010955810546875]}


In [5]:
# `time` is not among the notebook's top-of-file imports, so import it here;
# without it the polling loop below raises NameError on its first sleep.
import time

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# Pair each source record with its embedding; the original text is kept as
# metadata so query results can show it.
vectors = [
    {
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    }
    for d, e in zip(data, embeddings)
]

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 6}

In [7]:
# Sanity check after the upsert: print the index statistics, which include the
# vector count per namespace.
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'ns1': {'vector_count': 6}},
 'total_vector_count': 6,
 'vector_type': 'dense'}


# Query

In [8]:
query = "Tell me about the tech company known as Apple."

# Embed the question with the same model as the corpus, using the "query" input type.
embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={"input_type": "query"},
)

In [9]:
# Fetch the three nearest vectors in namespace "ns1"; return metadata (the
# stored text) but not the raw vector values.
results = index.query(
    vector=embedding[0].values,
    top_k=3,
    namespace="ns1",
    include_metadata=True,
    include_values=False,
)

print(results)

{'matches': [{'id': 'vec2',
              'metadata': {'text': 'The tech company Apple is known for its '
                                   'innovative products like the iPhone.'},
              'score': 0.872760534,
              'values': []},
             {'id': 'vec4',
              'metadata': {'text': 'Apple Inc. has revolutionized the tech '
                                   'industry with its sleek designs and '
                                   'user-friendly interfaces.'},
              'score': 0.852285385,
              'values': []},
             {'id': 'vec6',
              'metadata': {'text': 'Apple Computer Company was founded on '
                                   'April 1, 1976, by Steve Jobs, Steve '
                                   'Wozniak, and Ronald Wayne as a '
                                   'partnership.'},
              'score': 0.850212216,
              'values': []}],
 'namespace': 'ns1',
 'usage': {'read_units': 1}}


# Now use our dataset

In [13]:
# NOTE(review): this rebinds JD_PATH, shadowing the value imported from configs
# in the first cell.
JD_PATH = "../../data/jd_origin"
jd_folder = JD_PATH
# The loop body only rebinds full_path, so afterwards it holds the path of
# whichever entry os.listdir returned last (and stays undefined if the folder
# is empty).
for jd_path in os.listdir(jd_folder):
    full_path = os.path.join(jd_folder, jd_path)
# For now, let's take apart how the chunks are built.

In [14]:
# load_emb_model, get_chunks and set_splitter are not defined in this notebook;
# presumably they come from `from insert_chunks import *` (TODO confirm).
emb_model = load_emb_model()
preprocessed_doc = preprocess(full_path)    # DataFrame, per the original author's note
# Split the preprocessed document into chunks using a splitter built from emb_model.
total_chunks = get_chunks(preprocessed_doc, set_splitter(emb_model))

In [19]:
# Length of one embedding produced by emb_model; the index created for these
# chunks uses the same number as its dimension.
len(emb_model.embed_query("test"))

1536

In [11]:
from pinecone import Pinecone

# Read the key from the environment instead of embedding it in the notebook.
# os.environ[...] raises KeyError right here if the variable is missing, rather
# than failing later with an authentication error.
# SECURITY: a literal API key was previously stored in this cell. Treat that key
# as leaked and revoke/rotate it in the Pinecone console.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# index = pc.Index("jd-dataset")

In [20]:
# create_index raises if the index already exists, so create it only when it is
# missing. This keeps the cell safe to re-run.
if "jd-dataset" not in pc.list_indexes().names():
    pc.create_index(
        name="jd-dataset",
        dimension=1536,  # length of emb_model.embed_query(...) as measured in the cell above
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [30]:
# Handle to the job-description index, plus one freshly generated random UUID
# string per chunk to use as vector ids.
index = pc.Index("jd-dataset")
uuids = [str(uuid4()) for _ in total_chunks]

In [31]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index handle in a LangChain vector store that uses
# emb_model as its embedding function.
vector_store = PineconeVectorStore(index=index, embedding=emb_model)

In [32]:
# Upload every chunk under its pre-generated UUID; the call returns the list of
# ids, which the notebook displays as the cell output.
# NOTE(review): ids are random per run, so re-running presumably inserts the
# same chunks again under new ids. Confirm before re-executing.
vector_store.add_documents(documents=total_chunks, ids=uuids)

['42429ae2-d2a2-4b6b-a408-d749ed2ace0d',
 '98f405d8-4bfd-44fb-83d1-93303e23a6ee',
 'e7d5087e-fbe2-4321-9b61-9648a205a9ca',
 'a408b223-07b2-4af4-93ee-8c43558159f8',
 'fbf6f85a-812f-41ad-9134-87240e7f9e60',
 '5efe7067-65c7-40a7-bfd6-940fc26c2859',
 '62ccb903-abe4-4c55-ace0-dbaea0f56d54',
 'a39ef5ea-3cee-4989-aea6-b32662df0393',
 '4a2cd8be-c3e5-4da1-84e6-7a8047f3b9d4',
 'dffdeb54-1d16-42a9-8328-3f3812beba75',
 '9a7e2755-ec24-437c-9a4e-c5fbda7befb5',
 '4b5ef61d-fed4-482e-98ab-3d8ace16210d',
 'ed9097b4-8267-4069-bb55-71ae6ff8bec1',
 '76b95158-3ee1-4729-b48d-06e790f939a1',
 'a97ad777-0f9f-449d-a505-6aae99f91b22',
 'd3df2ed4-b041-4b48-9ec9-a3435d49b8f1',
 '85f78357-0dd9-4828-852d-a944d651cf7d',
 '6153b126-163d-4c33-8034-8d10587bb1e6',
 '950f549c-007f-4abd-82ff-580035ea7b4c',
 '5ed88665-1e66-4f73-941c-f79bfaf7e7cd',
 '6bd3cf3c-52a8-4edc-8305-326590f2bedb',
 '92168abb-17fe-428d-912d-72962d98f3fa',
 '4087622e-2f87-4def-9382-f39888085f15',
 '7bac60ff-270c-4811-b6e6-54a2e56cdf97',
 '378434f5-05c7-

In [41]:
# Retrieve the two chunks most similar to a free-text request and print each
# chunk's text followed by its metadata dict.
# NOTE(review): `results` was bound to a Pinecone query response in an earlier
# cell; here it is rebound to the value returned by similarity_search.
results = vector_store.similarity_search(
    "I want Data Scientist position",
    k=2,
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* in Computer Science or a related field is required. 8+ years’ experience in applied AI/ML to solve data challenges. Deep understanding of statistical modeling, machine learning, and deep
learning, with a track record of solving problems with these methods. Experience in solving technical problems in data privacy and conversational
systems is preferred. Strong programming skills with extensive experience in Java or Python. Proven track record of innovation and sharing insights through publications
and patents. Excellent problem-solving, analytical and communication skills. Adaptable to evolving priorities, accepting challenges outside one's comfort
zone and learning new technologies. Ability to think through solutions from a short term and long-term lens in an
iterative development cycle. Our compensation reflects the cost of labor across several U.S. geographic
markets, and we pay differently based on those defined markets. [{'company': 'Adobe', 'company_url': 'https://www.indeed.com

## Query via a retriever

In [44]:
# Build a retriever that returns at most one document (k=1) and applies a
# similarity score threshold of 0.2.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.2},
)
# NOTE(review): this query is unrelated to job descriptions, yet the output
# below still contains a document, so 0.2 does not filter it out. Consider a
# higher threshold if off-topic queries should return nothing.
retriever.invoke("Stealing from the bank is a crime")

[Document(id='b747e126-eeba-42d7-aa46-b58c1e1486a5', metadata={'company': 'Johnny Clean', 'company_url': 'https://www.indeed.com/cmp/Johnny-Clean-Car-Wash-1', 'date_posted': '2024-11-11', 'description': "**Position Summary**\n\nJohnny Clean Car Wash is seeking an **Operational Data Scientist** with proven\nexperience in **subscription-based businesses** to join our team at the\nDeerfield Beach Support Center in Florida. This role is tailored for a data\nscientist who not only excels in technical analysis but also deeply\nunderstands the unique dynamics of recurring revenue models and membership-\nbased customer experiences. At Johnny Clean, we've built a strong foundation\nwith our data warehouse and PowerBI dashboards; now, we need a data\nprofessional who knows the ins and outs of subscription models to help us\nscale effectively, improve member satisfaction, and identify operational\nopportunities. This role involves a balance of office-based data work and in-\nfield insights gather