In [9]:
import os
import requests
import json
from dotenv import load_dotenv

load_dotenv()

# Load the API key from an environment variable
CORE_API_KEY = os.getenv('CORE_API_KEY')
BASE_URL = 'https://api.core.ac.uk/v3'

def fetch_data(endpoint, params=None):
    """
    Fetches data from the specified CORE API endpoint.
    
    Args:
        endpoint (str): The endpoint of the API to call.
        params (dict): Optional dictionary of parameters to include in the request.
        
    Returns:
        dict: The JSON response from the API as a dictionary.
    """
    headers = {
        'Authorization': f'Bearer {CORE_API_KEY}'
    }
    response = requests.get(f'{BASE_URL}/{endpoint}', headers=headers, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        return response.json()
    else:
        # Raise an error if the request was not successful
        response.raise_for_status()

def get_single_research(query):
    """
    Retrieves a single research article with selected metadata fields.
    
    Args:
        query (str): The search query (e.g., "machine learning").
        
    Returns:
        dict: Selected metadata fields of the first research article that matches the search query.
    """
    search_params = {
        'q': query,
        'page': 1,
        'pageSize': 1
    }
    
    # Search for articles
    search_results = fetch_data('search/works', params=search_params)
    
    # Extract the first result's ID
    if 'results' in search_results and len(search_results['results']) > 0:
        article_id = search_results['results'][0]['id']
        
        # Fetch full metadata and text for the article
        article_data = fetch_data(f'works/{article_id}')
        
        # Extract specific fields
        title = article_data.get('title', 'No title available')
        authors = [author.get('name', 'Unknown') for author in article_data.get('authors', [])]
        year = article_data.get('year', 'No year available')
        abstract = article_data.get('abstract', 'No abstract available')
        full_text = article_data.get('fullText', 'No full text available')
        full_text_link = article_data.get('downloadUrl', 'No link available')
        
        # Additional metadata fields
        keywords = article_data.get('keywords', 'No keywords available')
        publisher = article_data.get('publisher', 'No publisher available')
        publication_date = article_data.get('publicationDate', 'No publication date available')
        journal = article_data.get('journal', {}).get('name', 'No journal available')
        citation_count = article_data.get('citationCount', 'No citation count available')
        
        # Format and return selected metadata fields
        return {
            "Title": title,
            "Authors": ", ".join(authors),
            "Year": year,
            "Abstract": abstract,
            "Keywords": keywords,
            "Publisher": publisher,
            "Publication Date": publication_date,
            "Journal": journal,
            "Citation Count": citation_count,
            "Full Text": full_text,  # Full text included without truncation
            "Link": full_text_link
        }
    else:
        print("No results found for the query.")
        return None

def save_metadata_to_file(data, filename):
    """
    Saves the article metadata to a JSON file for RAG model use.
    
    Args:
        data (dict): The article metadata.
        filename (str): The name of the file to save the data.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {filename}")

# Example usage of the function
if __name__ == "__main__":
    query = "Large Language Model"  # Change the query as needed
    article = get_single_research(query)
    
    if article:
        filename = "article_metadata.json"
        save_metadata_to_file(article, filename)

Data saved to article_metadata.json


In [10]:
from langchain_openai import OpenAIEmbeddings

import os
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY
    # dimensions=1024  # Uncomment if you need to specify dimensions
)

In [13]:
with open("article_metadata.json", "r", encoding="utf-8") as file:
    article_metadata = json.load(file)

text_content = (
    f"Title: {article_metadata.get('Title', 'No title available')}\n"
    f"Authors: {article_metadata.get('Authors', 'Unknown')}\n"
    f"Year: {article_metadata.get('Year', 'No year available')}\n"
    f"Abstract: {article_metadata.get('Abstract', 'No abstract available')}\n"
    f"Full Text: {article_metadata.get('Full Text', 'No full text available')}\n"
    f"Link: {article_metadata.get('Link', 'No link available')}\n"
)

# Get the embedded vector
embedded_vector = embeddings.embed_query(text_content)

# Display the embedded vector
print("Embedded Vector:", embedded_vector)

Embedded Vector: [-0.02200848608183178, 0.0003857299816024685, -0.024215726167220723, 0.0006734850586554599, 0.009508709112686843, 0.006370294439283696, -0.016889071934464848, 0.03525963777411313, -0.029729412741997807, 0.0073675260866850716, 0.02881254801682592, 0.021667683130318725, 0.015357217042600095, -0.04485952073438473, -0.0007569873591096098, 0.0018007677975715498, 0.018699980677313887, 0.008299593952130424, 0.010574124519514972, -0.0403402720655928, -0.023309274268151166, -0.01569995377148128, -0.029419996062541004, 0.03754552643407949, -0.016315125796812633, 0.02285010088766422, 0.003874050846852914, 0.002870292981927491, -0.04847169242993567, 0.012589435745767201, 0.019251076095781104, -0.0030749897656981097, 0.003790291452154976, -0.008974711195832367, -0.027280045918705243, -0.03095022064740923, 0.08408284165263424, 0.028961080350007412, -0.02625412570441883, 0.05674945312149602, 0.03003176817436058, 0.018570211030405164, -0.008648666175550856, -0.02789014629320605, 0.014

In [15]:
from scipy.spatial.distance import cosine

# Assuming `embedded_vector` is the vector of the saved document in the database
# And `input_text` is the new text you want to compare

# Step 1: Embed the input text
input_text = "Large Language Model"
input_vector = embeddings.embed_query(input_text)

# Step 2: Calculate cosine similarity
# Cosine similarity is 1 - cosine distance
similarity_score = 1 - cosine(input_vector, embedded_vector)

print("Similarity Score:", similarity_score)


Similarity Score: 0.479142613130639


In [None]:
import lancedb
import pandas as pd
import pyarrow as pa

uri = "data/sample-lancedb"
db = lancedb.connect(uri)

# LanceDb offers both a synchronous and an asynchronous client.  There are still a
# few operations that are only supported by the synchronous client (e.g. embedding
# functions, full text search) but both APIs should soon be equivalent

# In this guide we will give examples of both clients.  In other guides we will
# typically only provide examples with one client or the other.
uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)

ImportError: cannot import name 'Embedding' from 'lancedb.embeddings' (c:\Hackathon\HackUMass-XII\.venv\Lib\site-packages\lancedb\embeddings\__init__.py)