In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

  from tqdm.autonotebook import tqdm


In [22]:
pc.list_indexes()


{'indexes': [{'dimension': 1536,
              'host': 'canopy--document-uploader-dnmsse2.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'canopy--document-uploader',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'metadata_config': {'indexed': ['document_id']},
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1,
                               'source_collection': ''}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [24]:
from openai import OpenAI
client = OpenAI(
                    api_key=os.getenv("OPENAI_API_KEY"),
                    base_url=os.getenv("OPENAI_API_BASE"),
            )

response = client.embeddings.create(
    input="Abstract of paper 1: Back pain is one of the most common illnesses in Western civilizations. Office work and lack of motion can lead to deterioration over time. Many people already use seat cushions to improve their posture during work or leisure. In this work, we present an E-Textile cushion. This seat cushion is equipped with capacitive proximity sensors that track the proximity and motion of the sitting user and distinguish up to 7 postures. Giving a user immediate feedback on the posture can facilitate more healthy behavior. We evaluated a number of different electrode setups, materials, and classification methods, leading to a maximum accuracy of 97.1%.",
    model="text-embedding-3-small"
)

print(response.data[0].embedding)

[0.014693201519548893, 0.02302601933479309, 0.028128620237112045, 0.05751502513885498, 0.02669931948184967, -0.00036156817805022, 0.01879529282450676, 0.04162121191620827, 0.04530880227684975, -0.04533739015460014, 0.005685039795935154, -0.03150176629424095, 0.0047452752478420734, -0.005792237352579832, 0.04673810303211212, 0.031673282384872437, 0.06500455737113953, -0.0584297738969326, 0.0024405294097959995, 0.017237355932593346, 0.007532409857958555, 0.018266450613737106, 0.03416026383638382, -0.008397135883569717, 0.028828976675868034, -0.018009178340435028, 0.017123011872172356, -0.02316894941031933, -0.018366502597928047, -0.035475220531225204, -0.021611012518405914, -0.028843270614743233, -0.015536488965153694, -0.0457947663962841, -0.051168933510780334, 0.0412210077047348, -0.05705764889717102, 0.06506172567605972, -0.0012488507200032473, -0.024498198181390762, 0.02072484791278839, 0.0366758331656456, 0.037419069558382034, -0.006063804496079683, -0.048053059726953506, -0.0085829

In [25]:
response

CreateEmbeddingResponse(data=[Embedding(embedding=[0.014693201519548893, 0.02302601933479309, 0.028128620237112045, 0.05751502513885498, 0.02669931948184967, -0.00036156817805022, 0.01879529282450676, 0.04162121191620827, 0.04530880227684975, -0.04533739015460014, 0.005685039795935154, -0.03150176629424095, 0.0047452752478420734, -0.005792237352579832, 0.04673810303211212, 0.031673282384872437, 0.06500455737113953, -0.0584297738969326, 0.0024405294097959995, 0.017237355932593346, 0.007532409857958555, 0.018266450613737106, 0.03416026383638382, -0.008397135883569717, 0.028828976675868034, -0.018009178340435028, 0.017123011872172356, -0.02316894941031933, -0.018366502597928047, -0.035475220531225204, -0.021611012518405914, -0.028843270614743233, -0.015536488965153694, -0.0457947663962841, -0.051168933510780334, 0.0412210077047348, -0.05705764889717102, 0.06506172567605972, -0.0012488507200032473, -0.024498198181390762, 0.02072484791278839, 0.0366758331656456, 0.037419069558382034, -0.006

In [31]:
# read json, get abstracts, get embeddings, write to pinecone with metadata (doi, title, authors, journal, year, abstract)
import os
import json
import pinecone
from openai import OpenAI

# initialize OpenAI
openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE")
)

# initialize Pinecone
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index("textiles-hci-01") # pinecone index name

# read paper data from json file
file_path = './data/paper_data_final.json'  
with open(file_path, 'r') as file:
    papers = json.load(file)

for paper in papers:
    # use OpenAI API text-embedding-3-small to get embeddings
    response = openai_client.embeddings.create(
        input=paper['abstract_full'],
        model="text-embedding-3-small"
    )
    vector = response.data[0].embedding  # get the embedding vector
    
    # prepare metadata
    metadata = {
        "title": paper['title'],
        "doi": paper['doi'],
        "authors": paper['authors'],
        "publication": paper['conference'],
        "year": paper['year'],
        "abstract": paper['abstract_full']
    }
    
    # insert into Pinecone index
    paper_id = paper['doi']  # use doi as the unique identifier
    index.upsert(vectors=[(paper_id, vector, metadata)])
    print(f"Inserted paper: {paper_id}")

print("finished!")


Inserted paper: https://doi.org/10.1145/3316782.3316785
Inserted paper: https://doi.org/10.1145/3123514.3123565
Inserted paper: https://doi.org/10.1145/3084863.3084868
Inserted paper: https://doi.org/10.1145/2556288.2557299
Inserted paper: https://doi.org/10.1145/3170427.3188623
Inserted paper: https://doi.org/10.1145/3305367.3327995
Inserted paper: https://doi.org/10.1145/1858171.1858257
Inserted paper: https://doi.org/10.1145/989863.989874
Inserted paper: https://doi.org/10.1145/3306306.3338856
Inserted paper: https://doi.org/10.1145/3406499.3418770
Inserted paper: https://doi.org/10.1145/1178823.1178880
Inserted paper: https://doi.org/10.1145/2641248.2666717
Inserted paper: https://doi.org/10.1145/572020.572039
Inserted paper: https://doi.org/10.1145/3334480.3382788
Inserted paper: https://doi.org/10.1145/2893499
Inserted paper: https://doi.org/10.1145/2370216.2370348
Inserted paper: https://doi.org/10.1145/1709886.1709972
Inserted paper: https://doi.org/10.1145/3027063.3052972
Inse

In [70]:
from openai import OpenAI
from pinecone import Pinecone
import os

openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE")
)

user_question = "what is possible with embroidery in HCI textiles research?"

# use OpenAI API text-embedding-3-small to get embeddings for a question
response = openai_client.embeddings.create(
    input=user_question,
    model="text-embedding-3-small" 
)
question_vector = response.data[0].embedding

# use Pinecone to search for the most relevant papers
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index_name = "textiles-hci-01" 
index = pc.Index(index_name)

# search for the most relevant papers
# index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')
query_results = index.query(
    top_k=5, # top 5 most relevant papers
    vector=question_vector, 
    includeValues=True,
    includeMetadata=True,
    namespace="pdf"
    # filter: { genre: { '$eq': 'action' }}
    )  

# get the metadata of the most relevant papers
# print(query_results)
# contexts = [item['metadata']['abstract'] for item in query_results['matches']]
contexts = [item['metadata']['text'] for item in query_results['matches']]

# combine the contexts and the question
context_combined = "\n".join(contexts)
chat_input = f"Context: {context_combined}\nQuestion: {user_question}"

response = openai_client.chat.completions.create(
  model="gpt-3.5-turbo", 
  messages=[
    {"role": "system", "content": "You are a helpful research assistant and an expert in textile research and HCI research."},
    {"role": "user", "content": chat_input}
  ],
#   prompt=chat_input,
  max_tokens=1000 # limit the response to 1000 tokens
)

# get the answer
answer = response.choices[0].message
print(answer)

ChatCompletionMessage(content='Embroidery offers a wide range of possibilities in the field of Human-Computer Interaction (HCI) textiles research. Some of the key possibilities include:\n\n1. **Integration of Technology**: Embroidery can be used to embed technology into textiles without compromising the functionality of the technology or the integrity of the textile. This allows for the seamless integration of interactive elements into everyday textiles, providing users with familiar and comfortable tactile experiences.\n\n2. **Sensor Development**: In HCI textiles research, embroidery can be utilized for creating conductive electrodes for capacitive sensing. Researchers have used embroidery to develop multi-layered capacitive sensing structures, connections between textile wiring and electronic substrates, electromyography electrodes integration into clothing, and fabricating textile coils for sensors. By experimenting with design and manufacturing parameters, researchers can optimize

In [39]:
len(papers)

220

In [40]:
papers[219]

{'title': 'Interactivity to enhance perception: does increased interactivity in mobile visual presentation tools facilitate more accurate rating of textile properties?',
 'abstract': "As part of the EPSRC funded 'Digital Sensoria' project a set of digital tools were utilised to better demonstrate the tactile qualities of textiles via the internet. Shoogleit [8], an online utility for the creation of interactive video was one such ...",
 'citation_num': 1,
 'download_num': 206,
 'conference': "MobileHCI '11",
 'authors': ['Pawel M. Orzechowski',
  'Douglas Atkinson',
  'Stefano Padilla',
  'Thomas S. Methven',
  'Sharon Baurley',
  'Mike Chantler'],
 'contentType': 'poster',
 'doi': 'https://doi.org/10.1145/2037373.2037472',
 'date': 'August 2011',
 'year': '2011',
 'abstract_full': "As part of the EPSRC funded 'Digital Sensoria' project a set of digital tools were utilised to better demonstrate the tactile qualities of textiles via the internet. Shoogleit [8], an online utility for the

In [42]:
# index.upsert(vectors=[('id-1', vector)],
#              namespace='pdf')

{'upserted_count': 1}

In [51]:
import fitz
doc = fitz.open("./pdf/1052380.1052381.pdf")
text = ""
for page in doc:
    text += page.get_text()

# save text as a txt file
with open('output.txt', 'w') as file:
    file.write(text)

In [65]:
# chunk the text into paragraphs

import fitz  # PyMuPDF
from openai import OpenAI
from pinecone import Pinecone
import json
import os

def read_pdf_chunk_by_paragraphs(pdf_path):
    doc = fitz.open(pdf_path)
    paragraphs = []

    for page in doc:
        blocks = page.get_text("blocks")
        blocks.sort(key=lambda block: (block[1], block[0]))  # first sort by y (up to down), then by x (left to right)

        current_paragraph = ""
        last_block_bottom = None

        for block in blocks:
            block_text = block[4].strip()
            block_top = block[1]
            if last_block_bottom is not None and (block_top - last_block_bottom) > 5:  # assume new paragraph if distance between blocks is larger than 5
                # save the last paragraph and start a new one
                paragraphs.append(current_paragraph)
                current_paragraph = block_text
            else:
                # else, just add the text to the current paragraph
                if current_paragraph:
                    current_paragraph += " "  # add space between blocks
                current_paragraph += block_text
            last_block_bottom = block[3]  # update last block bottom

        # make sure to save the last paragraph
        if current_paragraph:
            paragraphs.append(current_paragraph)

    return paragraphs

# use OpenAI API text-embedding-3-small to get embeddings
def vectorize_text(text):
    response = openai_client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

# upload chunks to Pinecone
def upload_to_pinecone(chunks, doi, pinecone_index):
    for i, chunk in enumerate(chunks):
        vector = vectorize_text(chunk)
        metadata = {"text": chunk, "doi": doi}
        pinecone_index.upsert(vectors=[(f"{doi}_{i}", vector, metadata)], namespace="pdf")
    print(f"Uploaded {len(chunks)} chunks for paper: {doi}")


# initialize OpenAI
openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE")
)
# initialize Pinecone
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index("textiles-hci-01")

json_file_path = './data/paper_data_final.json' 
with open(json_file_path, 'r') as json_file:
    papers = json.load(json_file)

pdf_folder = "./pdf/" 

for paper in papers[194:]:
    # extract DOI suffix and PDF filename
    doi_suffix = paper['doi'].split('/')[-1]
    pdf_filename = f"{doi_suffix}.pdf"
    pdf_path = os.path.join(pdf_folder, pdf_filename)
    
    # check if PDF file exists
    if os.path.exists(pdf_path):
        # read PDF and chunk into paragraphs
        chunks = read_pdf_chunk_by_paragraphs(pdf_path)  
        # upload chunks to Pinecone
        upload_to_pinecone(chunks, doi_suffix, index) 
    else:
        print(f"PDF文件不存在: {pdf_filename}")

Uploaded 25 chunks for paper: 3410531.3414305
Uploaded 32 chunks for paper: 3325480.3329176
Uploaded 37 chunks for paper: 3328243.3328267
PDF文件不存在: 3328778.3372712.pdf
Uploaded 27 chunks for paper: 2677199.2680560
Uploaded 35 chunks for paper: 1810543.1810567
Uploaded 57 chunks for paper: 2757226.2757231
Uploaded 23 chunks for paper: 3341162.3343775
Uploaded 12 chunks for paper: 1810543.1810574
Uploaded 17 chunks for paper: 2148131.2148233
Uploaded 35 chunks for paper: 1979742.1979707
Uploaded 201 chunks for paper: 3580883
Uploaded 53 chunks for paper: 3123021.3123068
Uploaded 69 chunks for paper: 3242587.3242664
Uploaded 75 chunks for paper: 2512349.2512813
Uploaded 24 chunks for paper: 3197391.3205382
Uploaded 41 chunks for paper: 2971763.2971784
Uploaded 49 chunks for paper: 2800835.2807931
Uploaded 37 chunks for paper: 3290607.3312857
Uploaded 10 chunks for paper: 2700648.2811355
Uploaded 23 chunks for paper: 2347504.2347542
Uploaded 0 chunks for paper: 3610927
Uploaded 20 chunks f