In [39]:
import os
import requests
import json

load_dotenv()

# Load the API key from an environment variable
CORE_API_KEY = os.getenv('CORE_API_KEY')
BASE_URL = 'https://api.core.ac.uk/v3'

def fetch_data(endpoint, params=None):
    """
    Fetches data from the specified CORE API endpoint.
    
    Args:
        endpoint (str): The endpoint of the API to call.
        params (dict): Optional dictionary of parameters to include in the request.
        
    Returns:
        dict: The JSON response from the API as a dictionary.
    """
    headers = {
        'Authorization': f'Bearer {CORE_API_KEY}'
    }
    response = requests.get(f'{BASE_URL}/{endpoint}', headers=headers, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        return response.json()
    else:
        # Raise an error if the request was not successful
        response.raise_for_status()

def get_single_research(query):
    """
    Retrieves a single research article with selected metadata fields.
    
    Args:
        query (str): The search query (e.g., "machine learning").
        
    Returns:
        dict: Selected metadata fields of the first research article that matches the search query.
    """
    search_params = {
        'q': query,
        'page': 1,
        'pageSize': 1
    }
    
    # Search for articles
    search_results = fetch_data('search/works', params=search_params)
    
    # Extract the first result's ID
    if 'results' in search_results and len(search_results['results']) > 0:
        article_id = search_results['results'][0]['id']
        
        # Fetch full metadata and text for the article
        article_data = fetch_data(f'works/{article_id}')
        
        # Extract specific fields
        title = article_data.get('title', 'No title available')
        authors = [author.get('name', 'Unknown') for author in article_data.get('authors', [])]
        year = article_data.get('year', 'No year available')
        abstract = article_data.get('abstract', 'No abstract available')
        full_text = article_data.get('fullText', 'No full text available')
        full_text_link = article_data.get('downloadUrl', 'No link available')
        
        # Additional metadata fields
        keywords = article_data.get('keywords', 'No keywords available')
        publisher = article_data.get('publisher', 'No publisher available')
        publication_date = article_data.get('publicationDate', 'No publication date available')
        journal = article_data.get('journal', {}).get('name', 'No journal available')
        citation_count = article_data.get('citationCount', 'No citation count available')
        
        # Format and return selected metadata fields
        return {
            "Title": title,
            "Authors": ", ".join(authors),
            "Year": year,
            "Abstract": abstract,
            "Keywords": keywords,
            "Publisher": publisher,
            "Publication Date": publication_date,
            "Journal": journal,
            "Citation Count": citation_count,
            "Full Text": full_text,  # Full text included without truncation
            "Link": full_text_link
        }
    else:
        print("No results found for the query.")
        return None

def save_metadata_to_file(data, filename):
    """
    Saves the article metadata to a JSON file for RAG model use.
    
    Args:
        data (dict): The article metadata.
        filename (str): The name of the file to save the data.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {filename}")

# Example usage of the function
if __name__ == "__main__":
    query = "Large Language Model"  # Change the query as needed
    article = get_single_research(query)
    
    if article:
        filename = "article_metadata.json"
        save_metadata_to_file(article, filename)

Data saved to article_metadata.json


In [None]:
from langchain_openai import OpenAIEmbeddings

import os
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY
    # dimensions=1024  # Uncomment if you need to specify dimensions
)

In [None]:
with open("article_metadata.json", "r", encoding="utf-8") as file:
    article_metadata = json.load(file)

text_content = (
    f"Title: {article_metadata.get('Title', 'No title available')}\n"
    f"Authors: {article_metadata.get('Authors', 'Unknown')}\n"
    f"Year: {article_metadata.get('Year', 'No year available')}\n"
    f"Abstract: {article_metadata.get('Abstract', 'No abstract available')}\n"
    f"Full Text: {article_metadata.get('Full Text', 'No full text available')}\n"
    f"Link: {article_metadata.get('Link', 'No link available')}\n"
)

# Get the embedded vector
embedded_vector = embeddings.embed_documents(text_content)

# Display the embedded vector
print("Embedded Vector:", embedded_vector)

Embedded Vector: [-0.02206378390829954, 0.00032391457457998414, -0.024214692123652588, 0.0005290487322752978, 0.009503426410507143, 0.006360163296119833, -0.01673329431300888, 0.03536386726251577, -0.029883373845400824, 0.007382750582973838, 0.028759070578389376, 0.02156604977032399, 0.015215580064932458, -0.044953826141692556, -0.0008697135511081063, 0.0017994266178640993, 0.018552372236960765, 0.008303050330258882, 0.010626141960853389, -0.04028945213962963, -0.02323840976475574, -0.015589202995798477, -0.02921702354554245, 0.037510359558184256, -0.016076354598693044, 0.02295854160471163, 0.003941194304755615, 0.0029440929802800973, -0.04832272941799323, 0.012573624909341902, 0.019477514983380705, -0.0029919399829806985, 0.00367422569265621, -0.009323344590993498, -0.02733542460305466, -0.031185264193059743, 0.08402467967967966, 0.029013314282486855, -0.026109274750991403, 0.05685903424722458, 0.03009710340913846, 0.01847047120810612, -0.008760736162536762, -0.02771798020157665, 0.01

In [41]:
from scipy.spatial.distance import cosine

# Assuming `embedded_vector` is the vector of the saved document in the database
# And `input_text` is the new text you want to compare

# Step 1: Embed the input text
input_text = "Large Margin Neural Language Model"
input_vector = embeddings.embed_query(input_text)

# Step 2: Calculate cosine similarity
# Cosine similarity is 1 - cosine distance
similarity_score = 1 - cosine(input_vector, embedded_vector)

print("Similarity Score:", similarity_score)


Similarity Score: 0.5827808095642234


In [None]:
import lancedb
from lancedb.pydantic import LanceModel, Vector

# Connect to LanceDB
db = lancedb.connect("data/lancedb")

# Define the schema for the documents
class ResearchPaper(LanceModel):
    title: str
    authors: str
    year: str
    abstract: str
    link: str
    vector: Vector(embeddings.dimension) = embeddings.VectorField()

# Create or connect to a table
research_papers_table = db.create_table("research_papers", schema=ResearchPaper, exist_ok=True)

# Assuming you have multiple embedded vectors and metadata, insert them
research_papers_table.upsert(
    [
        {
            "title": article_metadata['Title'],
            "authors": article_metadata['Authors'],
            "year": article_metadata['Year'],
            "abstract": article_metadata['Abstract'],
            "link": article_metadata['Link'],
            "vector": embedded_vector
        }
    ]
)
