In [6]:
import os
import requests
import json
from dotenv import load_dotenv

load_dotenv()

# Load the API key from an environment variable
CORE_API_KEY = os.getenv('CORE_API_KEY')
BASE_URL = 'https://api.core.ac.uk/v3'

def fetch_data(endpoint, params=None):
    """
    Fetches data from the specified CORE API endpoint.
    
    Args:
        endpoint (str): The endpoint of the API to call.
        params (dict): Optional dictionary of parameters to include in the request.
        
    Returns:
        dict: The JSON response from the API as a dictionary.
    """
    headers = {
        'Authorization': f'Bearer {CORE_API_KEY}'
    }
    response = requests.get(f'{BASE_URL}/{endpoint}', headers=headers, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        return response.json()
    else:
        # Raise an error if the request was not successful
        response.raise_for_status()

def get_single_research(query):
    """
    Retrieves a single research article with selected metadata fields.
    
    Args:
        query (str): The search query (e.g., "machine learning").
        
    Returns:
        dict: Selected metadata fields of the first research article that matches the search query.
    """
    search_params = {
        'q': query,
        'page': 1,
        'pageSize': 1
    }
    
    # Search for articles
    search_results = fetch_data('search/works', params=search_params)
    
    # Extract the first result's ID
    if 'results' in search_results and len(search_results['results']) > 0:
        article_id = search_results['results'][0]['id']
        
        # Fetch full metadata and text for the article
        article_data = fetch_data(f'works/{article_id}')
        
        # Extract specific fields
        title = article_data.get('title', 'No title available')
        authors = [author.get('name', 'Unknown') for author in article_data.get('authors', [])]
        year = article_data.get('year', 'No year available')
        abstract = article_data.get('abstract', 'No abstract available')
        full_text = article_data.get('fullText', 'No full text available')
        full_text_link = article_data.get('downloadUrl', 'No link available')
        
        # Additional metadata fields
        keywords = article_data.get('keywords', 'No keywords available')
        publisher = article_data.get('publisher', 'No publisher available')
        publication_date = article_data.get('publicationDate', 'No publication date available')
        journal = article_data.get('journal', {}).get('name', 'No journal available')
        citation_count = article_data.get('citationCount', 'No citation count available')
        
        # Format and return selected metadata fields
        return {
            "Title": title,
            "Authors": ", ".join(authors),
            "Year": year,
            "Abstract": abstract,
            "Keywords": keywords,
            "Publisher": publisher,
            "Publication Date": publication_date,
            "Journal": journal,
            "Citation Count": citation_count,
            "Full Text": full_text,  # Full text included without truncation
            "Link": full_text_link
        }
    else:
        print("No results found for the query.")
        return None

def save_metadata_to_file(data, filename):
    """
    Saves the article metadata to a JSON file for RAG model use.
    
    Args:
        data (dict): The article metadata.
        filename (str): The name of the file to save the data.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {filename}")

# Example usage of the function
if __name__ == "__main__":
    query = "Large Language Model"  # Change the query as needed
    article = get_single_research(query)
    
    if article:
        filename = "article_metadata.json"
        save_metadata_to_file(article, filename)

Data saved to article_metadata.json


In [7]:
from langchain_openai import OpenAIEmbeddings

import os
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY
    # dimensions=1024  # Uncomment if you need to specify dimensions
)

In [13]:
with open("article_metadata.json", "r", encoding="utf-8") as file:
    article_metadata = json.load(file)

text_content = (
    f"Title: {article_metadata.get('Title', 'No title available')}\n"
    f"Authors: {article_metadata.get('Authors', 'Unknown')}\n"
    f"Year: {article_metadata.get('Year', 'No year available')}\n"
    f"Abstract: {article_metadata.get('Abstract', 'No abstract available')}\n"
    f"Full Text: {article_metadata.get('Full Text', 'No full text available')}\n"
    f"Link: {article_metadata.get('Link', 'No link available')}\n"
)

# Get the embedded vector
embedded_vector = embeddings.embed_query(text_content)

# Display the embedded vector
print("Embedded Vector:", embedded_vector)

Embedded Vector: [-0.02201929105626971, 0.00039131966259927596, -0.02424024297580226, 0.0005357008601517476, 0.00962312752203924, 0.006403191966755165, -0.016765636417205285, 0.03539050859971724, -0.029760723490397562, 0.007326925043058509, 0.028813907291222046, 0.021694433191218356, 0.01522827180049851, -0.04495851887631829, -0.0008399571264626906, 0.0017270542557111127, 0.0185680762349692, 0.008288762496063194, 0.010656342827930274, -0.04026109436359093, -0.0233556443742386, -0.01561467642155252, -0.029246551365022005, 0.03752506886985873, -0.015996157811139184, 0.022994055201494865, 0.0038994803370856687, 0.0029182232895281655, -0.048251408884927564, 0.012595055092903772, 0.019382258328741367, -0.0031007649552655552, 0.0036663980188907665, -0.009274233758989428, -0.02735050000251786, -0.031121790267240018, 0.08407787656539949, 0.028975197558996593, -0.026151995320369083, 0.05684891827405578, 0.03009313450990194, 0.018561175537051267, -0.008680753759720801, -0.027792041347269307, 0.0

In [15]:
import json
file_path = "article_metadata.json"

with open(file_path, "r", encoding="utf-8") as file:
    article_metadata = json.load(file)

In [21]:
import json
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
import lancedb

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Initialize the embeddings model
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=OPENAI_API_KEY
)

# List of file paths for JSON files
file_paths = ["article_metadata.json", "article_metadata copy.json"]

# Initialize a list to hold all embedded data
embedded_vectors = []

# Process each JSON file
for file_path in file_paths:
    # Load the article metadata from JSON
    with open(file_path, "r", encoding="utf-8") as file:
        article_metadata = json.load(file)

    # Embed the full text
    embedding = embeddings_model.embed_query(article_metadata["Full Text"])

    # Append the structured data to embedded_vectors
    embedded_vectors.append({
        "embedding": embedding,
        "metadata": {
            "Title": article_metadata["Title"],
            "Authors": article_metadata["Authors"],
            "Year": article_metadata["Year"],
            "Abstract": article_metadata["Abstract"],
            "Keywords": article_metadata["Keywords"],
            "Link": article_metadata["Link"]
        }
    })

# Connect to LanceDB and create the table with all data
db = lancedb.connect("./lancedb")
tbl = db.create_table("my_table", data=embedded_vectors)

print("Data from both JSON files successfully stored in LanceDB.")


Data from both JSON files successfully stored in LanceDB.


In [25]:
import numpy as np
import pandas as pd
from langchain_openai import OpenAIEmbeddings
import lancedb

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Initialize the embeddings model
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=OPENAI_API_KEY
)

# Connect to LanceDB (local database)
db = lancedb.connect("./lancedb")
table_name = "my_table"
tbl = db.open_table(table_name)

# Define the query text
query_text = "Large margin criterion for training neural language models."

# Embed the query text
query_embedding = embeddings_model.embed_query(query_text)

# Convert LanceDB table to a Pandas DataFrame
df = tbl.to_pandas()

# Calculate similarity between query and each embedding in the DataFrame
similarities = []
for _, row in df.iterrows():
    embedding = row["embedding"]
    similarity = cosine_similarity(query_embedding, embedding)
    similarities.append((row, similarity))

# Sort by similarity in descending order and get the top 5 results
similarities.sort(key=lambda x: x[1], reverse=True)
top_results = similarities[:5]

# Display the results
for result, similarity in top_results:
    print("Title:", result['metadata']['Title'])
    print("Authors:", result['metadata']['Authors'])
    print("Abstract:", result['metadata']['Abstract'])
    print("Similarity Score:", similarity)
    print("-" * 50)


Title: Large Margin Neural Language Model
Authors: Huang, Jiaji, Huang, Liang, Li, Yi, Ping, Wei
Abstract: We propose a large margin criterion for training neural language models.
Conventionally, neural language models are trained by minimizing perplexity
(PPL) on grammatical sentences. However, we demonstrate that PPL may not be the
best metric to optimize in some tasks, and further propose a large margin
formulation. The proposed method aims to enlarge the margin between the "good"
and "bad" sentences in a task-specific sense. It is trained end-to-end and can
be widely applied to tasks that involve re-scoring of generated text. Compared
with minimum-PPL training, our method gains up to 1.1 WER reduction for speech
recognition and 1.0 BLEU increase for machine translation.Comment: 9 pages. Accepted as a long paper in EMNLP201
Similarity Score: 0.6163819185377857
--------------------------------------------------
Title: Large Margin Neural Language Model
Authors: Huang, Jiaji, Huang, L