In [148]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import json
import os

In [165]:
url_search = "https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&AfterYear=2000&BeforeYear=2024&AllField=Title%3A%28%22Textile%22+OR+%22Fabric%22+AND+NOT+%22Network%22+AND+NOT+%22software+engineering%22+AND+NOT+%22decentralized%22%29+AND+Abstract%3A%28NOT+%22Network%22+AND+NOT+%22server%22+AND+NOT+%22render%22+AND+NOT+%22distributed%22+AND+NOT+%22software+engineering%22+AND+NOT+%22decentralized%22+AND+NOT+%22blockchain%22+AND+NOT+%22life+science%22%29&startPage=7&pageSize=50"

file_path = './data/papers_data.json'

# check if file exists
if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        papers_data = json.load(file)
else:
    papers_data = []

res = requests.get(url_search)
soup = BeautifulSoup(res.text, "html.parser")
selector_paper = "div.issue-item.issue-item--search.clearfix"
papers = soup.select(selector_paper)

for paper in papers:
    # collapsed abstract
    abstract = paper.select("div.issue-item__abstract.truncate-text > p")
    abstract = [a.text for a in abstract]
    if len(abstract) > 0:
        abstract = abstract[0]
    else:
        # if no abstract, skip
        continue
    # title
    title = paper.select("h5 > span > a")
    title = [t.text for t in title]
    title = title[0]
    # authors
    authors = []
    author_tags = paper.select('ul[aria-label="authors"] > li > a')
    for tag in author_tags:
        authors.append(tag.text)
    # date
    date = paper.select("div.bookPubDate")
    date = [d.text for d in date]
    date = date[0]
    # year
    year = date.split(" ")[-1]
    # citations
    citations = paper.select("div.citation > span")
    citations = [c.text for c in citations]
    if len(citations) > 0:
        citations = int(citations[0].replace(",", ""))
    else:
        citations = 0
    # downloads
    downloads = paper.select("div.metric > span")
    downloads = [d.text for d in downloads]
    if len(downloads) > 0:
        downloads = int(downloads[0].replace(",", ""))
    else:
        downloads = 0
    # content type
    content_type = paper.select("div.issue-heading")
    content_type = [c.text for c in content_type]
    if len(content_type) > 0:
        content_type = content_type[0]
    else:
        content_type = ""
    # doi
    doi = paper.select("a.issue-item__doi")
    doi = [d.text for d in doi]
    if len(doi) > 0:
        doi = doi[0]
    else:
        continue
    # publication
    publication = paper.select("span.epub-section__title")
    publication = [p.text for p in publication]
    if len(publication) > 0:
        publication = publication[0].split(":")[0]
    else:
        continue

    new_paper = {
        "title": title,
        "abstract": abstract,
        "citation_num": citations,
        "download_num": downloads,
        "conference": publication, 
        "authors": authors,
        "contentType": content_type,
        "doi": doi,
        "date": date,
        "year": year
    }
    
    papers_data.append(new_paper)

# save to file
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(papers_data, file, indent=4)

In [168]:
def convert_doi_url(doi_url):
    # Split the original URL to get the DOI part
    doi_part = doi_url.split("https://doi.org/")[-1]
    
    # Construct the new URL using the ACM format
    new_url = f"https://dl.acm.org/doi/{doi_part}"
    
    return new_url

In [173]:
# get full abstract
file_path = './data/papers_data.json'
full_abstract_file_path = './data/papers_data_with_full_abstract.json'

if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        papers_data = json.load(file)
    for paper in papers_data:
        print(convert_doi_url(paper["doi"]))
        res = requests.get(convert_doi_url(paper["doi"]))
        soup = BeautifulSoup(res.text, "html.parser")
        selector_full_abstract = "div.abstractSection.abstractInFull > p"
        full_abstract_elements = soup.select(selector_full_abstract)
        full_abstract_text = " ".join([element.text for element in full_abstract_elements])
        
        # update the paper data with the full abstract
        paper["abstract_full"] = full_abstract_text

        # check if file exists
        if not os.path.exists(full_abstract_file_path):
            existing_data = []
        else:
            with open(full_abstract_file_path, 'r', encoding='utf-8') as file:
                existing_data = json.load(file)
        
        # add the new paper to the existing data
        existing_data.append(paper)
        
        # write the updated data back to the file
        with open(full_abstract_file_path, 'w', encoding='utf-8') as file:
            json.dump(existing_data, file, indent=4)
        
        print(full_abstract_text)
        time.sleep(10) 

https://dl.acm.org/doi/10.1145/3316782.3316785
Back pain is one of the most common illnesses in Western civilizations. Office work and lack of motion can lead to deterioration over time. Many people already use seat cushions to improve their posture during work or leisure. In this work, we present an E-Textile cushion. This seat cushion is equipped with capacitive proximity sensors that track the proximity and motion of the sitting user and distinguish up to 7 postures. Giving a user immediate feedback on the posture can facilitate more healthy behavior. We evaluated a number of different electrode setups, materials, and classification methods, leading to a maximum accuracy of 97.1%.
https://dl.acm.org/doi/10.1145/3123514.3123565
The Aural Fabric is an interactive textile sonic map created to promote engagement in acoustic awareness towards the built environment. It fosters discussions on the aural environment of our cities by allowing users to experience binaural recordings captured d

In [182]:
def convert_doi_to_acm_pdf_url(doi_url):
    """
    convert doi url to acm pdf url
    
    params:
    doi_url (str): original doi url, e.g. https://doi.org/10.1145/3316782.3316785
    
    returns:
    str: converted pdf url, e.g. https://dl.acm.org/doi/pdf/10.1145/3593856.3595907
    """
    acm_pdf_base_url = "https://dl.acm.org/doi/pdf/"
    doi_part = doi_url.split("https://doi.org/")[-1]
    acm_pdf_url = acm_pdf_base_url + doi_part
    
    return acm_pdf_url

In [185]:
# download pdf
full_abstract_file_path = './data/papers_data_with_full_abstract.json'


if os.path.exists(full_abstract_file_path):
    with open(full_abstract_file_path, 'r', encoding='utf-8') as file:
        papers_data = json.load(file)
    
    for paper in papers_data:
        pdflink = convert_doi_to_acm_pdf_url(paper["doi"])
        pdf_filename = paper["doi"].split('/')[-1] + '.pdf'
        save_path = os.path.join('pdf', pdf_filename)

        # check if the pdf file already exists
        if os.path.exists(save_path):
            print(f"{pdf_filename} already exists.")
            continue 

        print(f"Downloading {pdflink}...")

        # download pdf
        response = requests.get(pdflink)

        print(f"Status code: {response.status_code}")
        
        # make sure the response status code is 200
        if response.status_code == 200:
            # save the pdf file
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {pdf_filename} successfully.")
        else:
            print(f"Failed to download {pdf_filename}. Status code: {response.status_code}")
        
        time.sleep(10)


3316782.3316785.pdf already exists.
3123514.3123565.pdf already exists.
3084863.3084868.pdf already exists.
2556288.2557299.pdf already exists.
3170427.3188623.pdf already exists.
3305367.3327995.pdf already exists.
1858171.1858257.pdf already exists.
989863.989874.pdf already exists.
Downloading https://dl.acm.org/doi/pdf/10.1145/3306306.3338856...
Status code: 200
Downloaded 3306306.3338856.pdf successfully.
3406499.3418770.pdf already exists.
1178823.1178880.pdf already exists.
Downloading https://dl.acm.org/doi/pdf/10.1145/2641248.2666717...
Status code: 200
Downloaded 2641248.2666717.pdf successfully.
Downloading https://dl.acm.org/doi/pdf/10.1145/572020.572039...
Status code: 200
Downloaded 572020.572039.pdf successfully.
Downloading https://dl.acm.org/doi/pdf/10.1145/3334480.3382788...
Status code: 200
Downloaded 3334480.3382788.pdf successfully.
Downloading https://dl.acm.org/doi/pdf/10.1145/2893499...
Status code: 200
Downloaded 2893499.pdf successfully.
Downloading https://dl

In [197]:
# fetch image from pdf
import fitz  # PyMuPDF

def extract_first_image_from_pdf(pdf_path, image_save_path):
    """
    Extract the first image from a PDF file that meets size criteria and save it to a file.
    If the image is smaller than 20KB or exactly 51KB, it skips to the next available image.
    
    Params:
    pdf_path (str): Path to the PDF file.
    image_save_path (str): Path to save the extracted image.
    """
    doc = fitz.open(pdf_path)
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)
        
        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Check the size of the image
            if len(image_bytes) < 61200:  # Image is smaller than 20KB or exactly 51KB
                continue  # Skip to the next image
            else:
                # Save the image that meets the criteria
                with open(image_save_path, "wb") as img_file:
                    img_file.write(image_bytes)
                print(f"Image extracted and saved to {image_save_path}")
                return  # Image saved, no need to continue with this PDF
                
    doc.close()
    print(f"No suitable image found in {pdf_path}")


full_abstract_file_path = './data/papers_data_with_full_abstract.json'

no_image_list = []

if os.path.exists(full_abstract_file_path):
    with open(full_abstract_file_path, 'r', encoding='utf-8') as file:
        papers_data = json.load(file)
    
    for paper in papers_data:
        try: 
            pdf_path = "./pdf/"+paper["doi"].split('/')[-1] + '.pdf'
            image_save_path = "./image/"+paper["doi"].split('/')[-1]+".png"
            extract_first_image_from_pdf(pdf_path, image_save_path)
        except:
            no_image_list.append(paper["doi"])
            print(f"No image found in {pdf_path}")
            continue

Image extracted and saved to ./image/3316782.3316785.png
Image extracted and saved to ./image/3123514.3123565.png
Image extracted and saved to ./image/3084863.3084868.png
Image extracted and saved to ./image/2556288.2557299.png
Image extracted and saved to ./image/3170427.3188623.png
Image extracted and saved to ./image/3305367.3327995.png
Image extracted and saved to ./image/1858171.1858257.png
No suitable image found in ./pdf/989863.989874.pdf
No image found in ./pdf/3306306.3338856.pdf
Image extracted and saved to ./image/3406499.3418770.png
No suitable image found in ./pdf/1178823.1178880.pdf
Image extracted and saved to ./image/2641248.2666717.png
Image extracted and saved to ./image/572020.572039.png
Image extracted and saved to ./image/3334480.3382788.png
Image extracted and saved to ./image/2893499.png
No suitable image found in ./pdf/2370216.2370348.pdf
Image extracted and saved to ./image/1709886.1709972.png
Image extracted and saved to ./image/3027063.3052972.png
Image extra

In [228]:
"""
At the command line, only need to run once to install the package via pip:

$ pip install google-generativeai
"""

import google.generativeai as genai

genai.configure(api_key="AIzaSyD9pgMYx4gTkWZrTWxaWK4LMqTh6P6F0Z0")

# Set up the model
generation_config = {
  "temperature": 0.9,
  "top_p": 1,
  "top_k": 1,
  "max_output_tokens": 2048,
}

safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
]

model = genai.GenerativeModel(model_name="gemini-pro",
                              generation_config=generation_config,
                              safety_settings=safety_settings)

In [299]:
prompt_start = """Based on the abstract below, extract keywords of the paper."""

prompt_end = """Notice, I will extract a lot of similar papers all about textiles, so I need to extract the most specific keywords possible. For example, I am interested in how they made the smart textiles (embroidered) and what they did with the smart textiles (sensing, actuating, and connecting components, etc.), not something like “textiles” or “smart textile”. The ones he extracted are meaningless because all the articles I collected are about textiles. I want more specific keywords with a specific purpose.
    dimensions include: 
    1. method that they used to fabricate textiles (e.g. coating, In-situ polymerization, etc.)
    2. tools used (e.g. embroidery machine, knitting machine, punch needle, etc.) 
    3. materials used (conductive thread, carbon ink, PPy, etc.) 
    4. what they did with the textiles (use textiles to do what? sensing? displaying? output, actuating?, applications, etc.) 
    5. if they conduct any workshop or user study (if yes, return "user study") 
    6. special techniques used (e.g. self-powered, TENG, piezoresistivity, etc.)
    for each dimensions, if the outlines are not applicable, e.g. in the abstract they did not describe how they made the textiles, return NA for that dimension.
    Organize the output keywords as a python list, e.g. [“sensing”, “displaying”, “embroidery machine”, “conductive thread”, “user study”] 
    In the output, remove NA, and you can have multiple keywords (but no more than 3) for each dimension.
    """
# types of smart textiles (sensing, output, actuating, displaying, etc.)

abstract1 = """
    "Interactive interiors which are customizable to the design preferences and functional purposes of individual users can help create flexible interiors within fixed spaces. They are particularly relevant to densely populated cities where most people live in compact spaces. This proposed research explores interactive interior textile surfaces as an adaptive media which can transform interiors via the change of colors, luminescence and surface design. This is done by investigating the integration of textile base material via embellishment techniques, such as embroidery and felting, with polymeric photonic fibers (POF) which enhance tactile quality without compromising on the technological functionality. Their flexible application methods mean that, it is adaptable to the rigid nature of polymeric photonic fibers. In addition, embroidery has the unique ability to arrange and combine threads together with various materials in non-rigid formations and multiple directions to create stable two-dimensional and three-dimensional forms [1]. Different materials can be combined to explore aesthetically pleasing surface designs, textures and achieve positive tactile quality.",
    """

response = model.generate_content(prompt_start+abstract1+prompt_end)
print(response.text)
# keyword = json.loads(response.text)

["embroidery", "felting", "polymeric photonic fibers (POF)"]


In [223]:
# generate keywords
full_abstract_file_path = './data/papers_data_with_full_abstract.json'
keywords_file_path = './data/papers_data_with_keywords.json'

if os.path.exists(full_abstract_file_path):
    with open(full_abstract_file_path, 'r', encoding='utf-8') as file:
        papers_data = json.load(file)
    
    # generate keywords for each paper
    for paper in papers_data:
        abstract = paper.get('abstract_full', '')
        
        try:
            response = model.generate_content(prompt_start+abstract+prompt_end)
            print(response.text)
            keywords = json.loads(response.text.replace("```python\n", "").replace("\n```", ""))
        except Exception as e:
            try:
                print(f"Error generating keywords for {paper['doi']}: {e}")
                keywords = response.text
            except:
                keywords = ""

        paper['keywords'] = keywords  # add the keywords to the paper data
        time.sleep(1)
        
    # save the updated data to a new file
    with open(keywords_file_path, 'w', encoding='utf-8') as new_file:
        json.dump(papers_data, new_file, indent=4)


["capacitive proximity sensors", "embroidery machine", "posture tracking", "user feedback", "user study"]
["sensing", "embroidery machine", "conductive thread"]
["touch-sensitive", "XY coordinate position", "pressure", "flexible", "lightweight", "low cost", "cuff-based user interface", "jacket"]
, 4. 冢.
Error generating keywords for https://doi.org/10.1145/2556288.2557299: Expecting value: line 1 column 1 (char 0)
["persuasive game", "textile production", "environmental impact", "societal concerns", "labour rights", "sustainable practices", "Textile Manager", "expert interviews", "pre-post exposure study", "persuasive effect", "voluntary information quests", "visualization of consequences"]
[“shape and structure change”, “heat”, “sizing customization”, “aesthetic patterning”, “embroidery machine”]
Error generating keywords for https://doi.org/10.1145/3305367.3327995: Expecting value: line 1 column 2 (char 1)
["handcrafting", "sensor", "circuitry", "electronic textile materials", "user 

In [302]:
# write to firestore database
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import json

# initialize Firebase Admin SDK
# cred = credentials.Certificate('./serviceAccountKey.json')  # 替换为你的Firebase Admin SDK密钥文件路径
# firebase_admin.initialize_app(cred)

db = firestore.client()

# read the paper data from the JSON file
with open('./data/paper_data_final.json', 'r', encoding='utf-8') as file:
    papers_data = json.load(file)

# write the paper data to Firestore
for paper in papers_data:
    # create a new document in the "paper" collection
    doc_ref = db.collection('paper').document()  # automatically generate a unique ID
    doc_ref.set(paper)
    print(f"Document for {paper['title']} added.")

print("All papers have been added to Firestore.")

Document for E-textile capacitive electrodes: Fabric or thread: designing an E-textile cushion for sitting posture detection added.
Document for Aural Fabric: an interactive textile sonic map added.
Document for Textile++: low cost textile interface using the principle of resistive touch sensing added.
Document for Social fabric fitness: the design and evaluation of wearable E-textile displays to support group running added.
Document for Textile Manager: Design and Development of a Persuasive Game about Sustainable Textile Production added.
Document for Active textile tailoring added.
Document for Handcrafting textile mice added.
Document for Scalable Fabric: flexible task management added.
Document for 3D printing on fabric added.
Document for Fabric(a): Co-crafting Textiles with Robots added.
Document for Mosaic textile: wearable ambient display with non-emissive color-changing modules added.
Document for Designing an interface between the textile and electronics using e-textile comp

In [3]:
import json

# Read the JSON file
file_path = './data/paper_data_final.json'
with open(file_path, 'r') as file:
    papers = json.load(file)

# Generate the output in a TXT file
output_file_path = './data/papers_summary.txt'
with open(output_file_path, 'w') as output_file:
    for i, paper in enumerate(papers, start=1):
        title = paper.get('title', 'N/A')
        abstract_full = paper.get('abstract_full', 'N/A')
        doi = paper.get('doi', 'N/A')
        
        output_file.write(f'Title of paper {i}: {title}\n\n')
        output_file.write(f'Abstract of paper {i}: {abstract_full}\n\n')
        output_file.write(f'DOI of paper {i}: {doi}\n\n\n\n')

output_file_path

'./data/papers_summary.txt'

In [3]:
# write to firestore database
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import json

# initialize Firebase Admin SDK
cred = credentials.Certificate('./serviceAccountKey.json')  # 替换为你的Firebase Admin SDK密钥文件路径
firebase_admin.initialize_app(cred)

db = firestore.client()

In [2]:
# change year to integer
# Reference to the collection
papers_ref = db.collection('paper')

# Fetch all documents
docs = papers_ref.stream()

for doc in docs:
    doc_dict = doc.to_dict()
    year_str = doc_dict.get('year')
    if year_str and year_str.isdigit():
        # Convert year to integer and update document
        papers_ref.document(doc.id).update({'year': int(year_str)})

print('Year fields updated successfully.')

Year fields updated successfully.


In [12]:
from firebase_admin import firestore

# Assuming Firebase Admin SDK is already initialized

db = firestore.client()

# Reference to the 'paper' collection
papers_ref = db.collection('paper')

# Fetch all documents
docs = papers_ref.stream()

for doc in docs:
    # Add document ID to its fields
    papers_ref.document(doc.id).update({'docId': doc.id})

print('Document IDs added successfully.')


Document IDs added successfully.


In [11]:

# Reference to the collection
papers_collection = db.collection('paper')

# Query for the document
query = papers_collection.where('title', '==', 'MoCapaci: Posture and gesture detection in loose garments using textile cables as capacitive antennas')
documents = query.stream()

for document in documents:
    print(f'Document ID: {document.id}')

Document ID: fNSVpbfiDuak84OCMCGf
Document ID: qpFbXij7i6TVeKue2byT
