## Step 1: Perform bulk search for papers with strictly defined keywords

In [None]:
import json
import os
import re
import time

import requests

# Mount Google Drive; the tracker and output paths used below live under /content/drive
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# Semantic Scholar API key. The S2_API_KEY environment variable takes
# precedence so the real key does not have to be stored in the notebook;
# without it, replace the placeholder below with your key.
API_KEY = os.environ.get('S2_API_KEY', 'YOUR_API_KEY')
print("API key set.")

# Define the highly refined and relevant query parameters.
# The inner double quotes are part of the query text sent to the API
# (presumably to request exact-phrase matching - confirm against the
# bulk-search documentation).
queries = [
    '"veganism"',
    '"animal rights"',
    "speciesism",
    '"vegan"',
    '"vegetarian"',
    '"vegetarianism"',
    '"animal advocacy"',
    '"animal ethics"',
    '"animal liberation"',
    '"plant-based diet"',
]

print("Query parameters defined.")

# Paper attributes requested from the API, sent as one comma-separated string
fields = ','.join([
    'paperId',
    'corpusId',
    'url',
    'title',
    'venue',
    'publicationVenue',
    'year',
    'authors',
    'externalIds',
    'abstract',
    'referenceCount',
    'citationCount',
    'influentialCitationCount',
    'isOpenAccess',
    'openAccessPdf',
    'fieldsOfStudy',
    's2FieldsOfStudy',
    'publicationTypes',
    'publicationDate',
    'journal',
    'citationStyles',
])
url = 'https://api.semanticscholar.org/graph/v1/paper/search/bulk'
headers = {'x-api-key': API_KEY}
delay = 1  # Delay in seconds

print("Fields, URL, headers, and delay set.")

# Initialize an empty list to store paper details
papers_list = []
print("Initialized empty list for storing paper details.")

# Tracker file to save the progress
tracker_file = '/content/drive/My Drive/bulk_progress_tracker.json'
print(f"Tracker file path set to: {tracker_file}")

# Restore the saved position (query index + continuation token) when a
# tracker file is present; otherwise begin with the first query.
if not os.path.exists(tracker_file):
    progress_tracker = {'query_index': 0, 'token': None}
    print("No existing progress tracker found. Starting from the beginning.")
else:
    with open(tracker_file, 'r') as tracker_handle:
        progress_tracker = json.load(tracker_handle)
    print("Progress tracker loaded from file.")

# Function to fetch papers for a specific query
def fetch_papers(query, token=None, timeout=30):
    """Fetch one page of bulk-search results for ``query``.

    Args:
        query: Search string sent as the ``query`` request parameter.
        token: Continuation token taken from the previous page's response,
            or None to request the first page.
        timeout: Seconds to wait on the server before giving up, so a
            stalled connection cannot block the loop indefinitely.

    Returns:
        The decoded JSON body of the response, or None when the request
        failed (HTTP error status, connection problem, timeout) or the body
        could not be decoded. Failures are printed, not raised.
    """
    print(f"Fetching papers for query '{query}' with token {token}.")
    params = {'query': query, 'fields': fields}
    if token is not None:
        params['token'] = token
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        print(f"Request sent. Status code: {response.status_code}")
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        # Boundary handler: the caller treats None as "skip this query".
        print(f"Other error occurred: {err}")
    return None

# Loop through each query and fetch papers
query_index = progress_tracker['query_index']
token = progress_tracker['token']
print(f"Starting loop with query_index: {query_index} and token: {token}")

# Papers are written to the JSONL file page by page instead of once at the
# end. The tracker lets a later session resume part-way through, but
# papers_list only exists in memory, so pages fetched before an interruption
# would otherwise never reach the output file.
output_file = '/content/drive/My Drive/bulk_papers_details.jsonl'
# A tracker file on disk means an earlier run was interrupted: append to its
# output rather than overwrite it.
resuming = os.path.exists(tracker_file)
saved_count = 0
print(f"Writing paper details to file: {output_file}")

with open(output_file, 'a' if resuming else 'w') as file:
    while query_index < len(queries):
        query = queries[query_index]
        print(f"Processing query {query_index + 1}/{len(queries)}: '{query}'")
        data = fetch_papers(query, token=token)
        if data is None:
            print(f"Skipping query '{query}' due to error.")
            query_index += 1
            token = None
            continue
        papers = data.get('data') or []
        print(f"Fetched {len(papers)} papers.")
        if not papers and not token:
            print(f"No papers found for query '{query}' with token {token}. Moving to next query.")
            query_index += 1
            token = None
            continue

        # Persist the page before the tracker advances. If the session dies
        # in between, the page is fetched again on resume; the merge step
        # later in this notebook drops repeated corpusIds.
        for paper in papers:
            file.write(json.dumps(paper) + '\n')
        file.flush()
        saved_count += len(papers)
        papers_list.extend(papers)
        print(f"Added {len(papers)} papers to the list.")

        # A missing token in the response marks the last page of this query
        token = data.get('token')
        if not token:
            print(f"No more papers to fetch for query '{query}'. Moving to next query.")
            query_index += 1
            token = None
        time.sleep(delay)  # Wait before making the next request
        print("Sleeping for 1 second.")

        # Update progress tracker
        progress_tracker['query_index'] = query_index
        progress_tracker['token'] = token
        with open(tracker_file, 'w') as f:
            json.dump(progress_tracker, f)
        print(f"Progress tracker updated: {progress_tracker}")

print(f"Saved {saved_count} papers details to {output_file}")

# Remove the progress tracker file after completion
if os.path.exists(tracker_file):
    os.remove(tracker_file)
    print("Progress tracker file removed.")

print("Script completed successfully.")

## Step 2: Perform relevance-based searches for a wider range of related keywords

In [None]:
# Define the relevant query parameters.
# Every entry must be followed by a comma: two adjacent string literals are
# silently joined into a single query by Python.
queries = [
    "veganism", "vegan", "vegetarian", "animal rights", "animal advocacy", "plant-based diet", "animal welfare", "ethical eating", "plant-based supplements", "vegan protein powder", "ethical consumerism",
    "vegetarianism", "animal liberation", "animal ethics", "meat consumption reduction", "dairy consumption", "dairy consumer", "dairy behaviour", "dairy society", "animal-free research",
    "factory farming", "wildlife conservation", "behavioral change", "meat consumption", "meat consumer", "meat behaviour", "meat society", "vegan pet care", "vegan social media campaigns",
    "psychology of food", "food choices", "dietary habits", "nutrition and ethics", "food activism", "precision livestock farming", "AI in animal agriculture", "animal rehabilitation",
    "health benefits of veganism", "animal agriculture", "animal sentience", "speciesism", "egg consumption", "egg consumer", "egg behaviour", "egg society", "ethical tourism",
    "human-animal relationships", "social movements", "animal advocacy strategies", "environmental ethics", "artificial intelligence animals", "vegan travel", "vegan cooking classes",
    "plant-based nutrition", "animal cruelty prevention", "environmental justice", "persuasion", "fundraising", "PLF", "overton window", "radical flank effect", "social justice movements",
    "animal law", "animal rights legislation", "ethical farming", "sustainable agriculture", "AI animal ethics", "Abolition", "Action logic", "Alienation effect", "Anti-oppression",
    "wildlife protection", "advocacy campaigns", "moral psychology", "ethical decision making", "Artivism", "Artstorm", "Flash mob", "vegan food technology", "animal sanctuary management",
    "persuasive communication", "media influence on behavior", "public engagement", "social influence", "Cultural disobedience", "Cultural hegemony", "Culture jamming",
    "pro-social behavior", "activism tactics", "digital activism", "grassroots movements", "non-profit management", "Framing", "Guerrilla projection", "animal welfare policies",
    "fundraising for advocacy", "volunteer management", "community organizing", "policy advocacy", "lobbying for animal rights", "Legislative theatre", "vegan influencer strategies",
    "public opinion on animal rights", "educational outreach", "public relations", "App flooding", "Decolonization", "inside-outside strategy", "animal rights documentaries",
    "strategic communication", "campaign evaluation", "impact assessment", "program evaluation", "Creative disruption", "Gerontocracy", "vegan product design", "animal rights advocacy training",
    "advocacy training", "leadership in advocacy", "ethical leadership", "change management", "Blockade", "Expressive and instrumental actions", "vegan health coaching", "vegan fitness programs",
    "corporate social responsibility", "advocacy networks", "coalition building", "Banner hang", "Battle of the story", "Hashtag hijack", "Phone blockade", "vegan community support",
    "critical animal studies", "intersectionality in activism", "cultural competence in advocacy", "Direct action", "General strike", "plant-based diet benefits", "vegan lifestyle tips",
    "animal-assisted therapy", "human-animal bond", "companion animal welfare", "Civil disobedience", "Commodity fetishism", "Mass street action", "vegan ethical fashion",
    "zoos and aquariums ethics", "marine animal conservation", "endangered species protection", "vegan marketing", "Forum theatre", "animal ethics education", "animal protection",
    "plant-based food marketing", "alternative protein marketing", "vegan policy change", "Distributed action", "Divestment", "Media-jacking", "vegan advocacy tools", "vegan movement",
    "meat alternatives", "lab-grown meat", "plant-based proteins", "vegan entrepreneurship", "vegan public awareness", "Hashtag campaign", "animal liberation movement", "animal rights theory",
    "animal rights history", "philosophy of animal rights", "animal cognition", "animal consciousness", "animal intelligence", "Memes", "Occupation Protest", "vegan ethical frameworks",
    "animal emotions", "animal awareness", "animal perception", "animal mind", "animal learning", "animal behavior", "Escalate strategically", "vegan activism methods", "animal rights movement strategies",
    "animal psychology", "animal welfare science", "animal neuroscience", "animal sensory systems", "animal empathy", "Hunger strike", "plant-based product innovation", "vegan advocacy research",
    "animal self-awareness", "cruelty-free products", "animal welfare science", "veterinary ethics", "Earth First!", "Electoral guerrilla theatre", "vegan marketing strategies",
    "vegan lifestyle", "plant-based recipes", "nutrition science", "public health nutrition", "environmental activism", "Ladder of engagement", "animal ethics education", "vegan community outreach",
    "climate activism", "sustainable living", "zero waste movement", "eco-friendly products", "green marketing", "Encryption", "The Movement Cycle", "vegan advocacy campaigns",
    "eco-labeling", "greenwashing", "alternative proteins", "cultivated meat", "vegan ethical arguments", "cell-based meat", "digital self defence", "animal welfare activism",
    "plant-based meat", "vegan product development", "food technology", "food innovation", "vegan cheese", "dairy alternatives", "vegan consumer research", "vegan public policy",
    "meat substitutes", "seafood alternatives", "fermentation technology", "protein engineering", "biotechnology and food", "Power mapping", "animal welfare principles",
    "vegan product marketing", "vegan advocacy strategies", "consumer acceptance of alternative proteins", "vegan food startups", "vegan community building strategies",
    "investment in vegan products", "regulation of cultivated meat", "future of food", "sustainable protein sources", "Jail solidarity", "animal rights enforcement", "veganism and mental health",
    "nutrition and alternative proteins", "environmental impact of alternative proteins", "mycoprotein", "microalgae", "Pillars of power", "animal advocacy networks", "vegan advocacy groups",
    "precision fermentation", "consumer segmentation", "consumer trends", "behavioral segmentation", "vegan community building", "Revolutionary nonviolence", "animal rights literature",
    "digital marketing", "dietary interventions", "chronic disease prevention", "nutritional epidemiology", "clinical nutrition", "animal rights policy advocacy", "vegan advocacy effectiveness",
    "carbon footprint", "water footprint", "life cycle assessment", "sustainable food systems", "biodiversity loss", "student strike", "animal welfare assessments",
    "animal behavior", "compassion fatigue", "animal ethics theory", "sentience studies", "human-animal studies", "Points of intervention", "vegan advocacy resources",
    "animal rights activism", "food waste reduction technologies", "vegan economy", "vegan fashion", "plant-based skincare", "Temporary autonomous zone", "vegan advocacy projects",
    "cruelty free skincare", "vegan cosmetics", "ethical investing", "vegan education", "vegan parenting", "animal-free testing", "animal rights campaigns", "plant-based consumer trends",
    "plant-based medicine", "vegan sports nutrition", "vegan philanthropy", "plant-based nutrition", "vegan product certifications", "animal rights resources", "animal welfare campaigns",
    "plant-based product certifications", "vegan cookbook", "ethical veganism", "plant-based nutrition for athletes", "Strategic nonviolence", "animal rights strategies",
    "vegan food festivals", "vegan meal planning", "diet for children", "diet for seniors", "diet and mental health", "Storytelling", "animal welfare standards", "vegan public relations strategies",
    "plant-based food trends", "vegan catering", "vegan health benefits", "baking", "cheese making", "vegan dining experiences", "animal rights legal frameworks",
    "snacks", "vegan bodybuilding", "protein sources", "omega-3 sources", "calcium sources", "diet during pregnancy", "The propaganda model", "plant-based advocacy campaigns",
    "diet and iron", "plant-based health benefits", "vegan food blogs", "food blogs", "food photography", "plant-based meal delivery", "animal rights activism strategies",
    "vegan pet food", "culinary schools", "diet and cholesterol", "diet and diabetes", "vegan digital marketing", "vegan advocacy", "animal welfare outreach", "vegan advocacy impact",
    "plant-based lifestyle", "vegan business development", "plant-based innovation", "vegan consumer behavior", "Prefigurative politics", "animal welfare legal actions", "vegan public awareness campaigns",
    "advocacy communication", "media strategies", "public relations", "influencer marketing", "impact investing", "Solidarity economics", "plant-based consumer behavior",
    "regenerative agriculture", "urban gardening", "waste reduction", "plant-based packaging", "food sovereignty", "Spectrum of allies", "animal rights organizational strategies",
    "behavior change", "diet transitions", "animal rights awareness", "vegan nutritional deficiencies", "Revolutionary reform", "animal rights public policy",
    "plant-based protein sources", "vegan dietary guidelines", "vegan clinical nutrition", "animal ethics in philosophy", "The shock doctrine", "animal rights campaign tactics",
    "ethical veganism principles", "vegan ethical debates", "animal rights ethical arguments", "vegan environmental benefits", "animal welfare legal strategies",
    "carbon footprint of vegan diets", "veganism and biodiversity", "sustainability of plant-based diets", "vegan business strategies", "Theory of change",
    "innovation in vegan products", "market trends in plant-based foods", "investment in vegan startups", "building vegan communities", "Theatre of the Oppressed",
    "vegan outreach programs", "education on veganism", "public awareness of animal rights", "vegan activist", "vegan activists",
]

# A few terms appear more than once above (e.g. "animal welfare science",
# "public relations"). Drop the repeats, keeping first-seen order, so the
# same search is not sent twice.
# NOTE: this changes list positions, so a progress tracker saved by an
# earlier run of this cell's original list may point at a different query.
queries = list(dict.fromkeys(queries))

print("Query parameters defined.")

# Paper attributes requested from the API, sent as one comma-separated string
fields = ','.join([
    'corpusId',
    'url',
    'title',
    'venue',
    'publicationVenue',
    'year',
    'authors',
    'abstract',
    'referenceCount',
    'citationCount',
    'influentialCitationCount',
    'isOpenAccess',
    'openAccessPdf',
    'fieldsOfStudy',
    's2FieldsOfStudy',
    'publicationTypes',
    'publicationDate',
    'journal',
    'citationStyles',
    'tldr',
])
# Relevance-based search endpoint (Step 1 used the bulk endpoint instead)
url = 'https://api.semanticscholar.org/graph/v1/paper/search'
headers = {'x-api-key': API_KEY}
delay = 1  # Seconds to pause between consecutive requests

print("Fields, URL, headers, and delay set.")

# Fresh in-memory accumulator for this step's results
papers_list = []
print("Initialized empty list for storing paper details.")

# Location of the JSON file that records how far the loop has progressed
tracker_file = '/content/drive/My Drive/progress_tracker.json'
print(f"Tracker file path set to: {tracker_file}")

# Restore the saved position (query index + result offset) when a tracker
# file is present; otherwise begin with the first query.
if not os.path.exists(tracker_file):
    progress_tracker = {'query_index': 0, 'offset': 0}
    print("No existing progress tracker found. Starting from the beginning.")
else:
    with open(tracker_file, 'r') as tracker_handle:
        progress_tracker = json.load(tracker_handle)
    print("Progress tracker loaded from file.")

# Function to fetch papers for a specific query
def fetch_papers(query, offset=0, limit=100, timeout=30):
    """Fetch one page of relevance-search results for ``query``.

    Args:
        query: Search string sent as the ``query`` request parameter.
        offset: Index of the first result to return.
        limit: Maximum number of results requested in this page.
        timeout: Seconds to wait on the server before giving up, so a
            stalled connection cannot block the loop indefinitely.

    Returns:
        The decoded JSON body of the response, or None when the request
        failed (HTTP error status, connection problem, timeout) or the body
        could not be decoded. Failures are printed, not raised.
    """
    print(f"Fetching papers for query '{query}' with offset {offset} and limit {limit}.")
    params = {
        'query': query,
        'fields': fields,
        'offset': offset,
        'limit': limit,
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        print(f"Request sent. Status code: {response.status_code}")
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        # Boundary handler: the caller treats None as "skip this query".
        print(f"Other error occurred: {err}")
    return None

# Loop through each query and fetch papers
query_index = progress_tracker['query_index']
offset = progress_tracker['offset']
print(f"Starting loop with query_index: {query_index} and offset: {offset}")

# Results requested per call; a shorter page means the query is exhausted
page_size = 100

# Papers are written to the JSONL file page by page instead of once at the
# end. The tracker lets a later session resume part-way through, but
# papers_list only exists in memory, so pages fetched before an interruption
# would otherwise never reach the output file.
output_file = '/content/drive/My Drive/papers_details.jsonl'
# A tracker file on disk means an earlier run was interrupted: append to its
# output rather than overwrite it.
resuming = os.path.exists(tracker_file)
saved_count = 0
print(f"Writing paper details to file: {output_file}")

with open(output_file, 'a' if resuming else 'w') as file:
    while query_index < len(queries):
        query = queries[query_index]
        print(f"Processing query {query_index + 1}/{len(queries)}: '{query}'")
        data = fetch_papers(query, offset=offset, limit=page_size)
        if data is None:
            print(f"Skipping query '{query}' due to error.")
            query_index += 1
            offset = 0
            continue
        papers = data.get('data') or []
        print(f"Fetched {len(papers)} papers.")
        if not papers:
            print(f"No papers found for query '{query}' at offset {offset}. Moving to next query.")
            query_index += 1
            offset = 0
            continue

        # Persist the page before the tracker advances. If the session dies
        # in between, the page is fetched again on resume; the merge step
        # later in this notebook drops repeated corpusIds.
        for paper in papers:
            file.write(json.dumps(paper) + '\n')
            papers_list.append(paper)
            # .get() keeps logging from aborting the run if the key is absent
            print(f"Added paperId {paper.get('paperId')} to the list.")
        file.flush()
        saved_count += len(papers)

        if len(papers) < page_size:
            # Report the offset that was actually requested for this page
            print(f"Less than {page_size} papers found for query '{query}' at offset {offset}. Moving to next query.")
            query_index += 1
            offset = 0
        else:
            offset += page_size
        time.sleep(delay)  # Wait before making the next request
        print("Sleeping for 1 second.")

        # Update progress tracker
        progress_tracker['query_index'] = query_index
        progress_tracker['offset'] = offset
        with open(tracker_file, 'w') as f:
            json.dump(progress_tracker, f)
        print(f"Progress tracker updated: {progress_tracker}")

print(f"Saved {saved_count} papers details to {output_file}")

# Remove the progress tracker file after completion
if os.path.exists(tracker_file):
    os.remove(tracker_file)
    print("Progress tracker file removed.")

print("Script completed successfully.")

## Step 3: Merge and deduplicate results of previous two steps

In [None]:
def merge_and_deduplicate_jsonl_files(regular_file, bulk_file, output_file):
    """Merge two JSONL files into one, keeping a single record per ``corpusId``.

    Records from ``regular_file`` take priority: a bulk record is only kept
    when its ``corpusId`` did not occur in the regular file. Within the
    regular file the last record seen for an id wins. Records without a
    ``corpusId`` are dropped, and blank lines are ignored.

    Args:
        regular_file: Path of the JSONL file whose records take priority.
        bulk_file: Path of the JSONL file used to fill in missing ids.
        output_file: Path the merged JSONL is written to (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If a file cannot be opened.
    """
    deduplicated_lines = {}

    # (path, overwrite): the regular file is read first and may overwrite its
    # own earlier entries; the bulk file only fills ids not seen so far.
    sources = ((regular_file, True), (bulk_file, False))
    for source_path, overwrite in sources:
        # The handle gets its own name so the path parameters stay intact
        # for the summary message below.
        with open(source_path, 'r') as source:
            for line in source:
                text = line.strip()
                if not text:
                    continue
                corpus_id = json.loads(text).get('corpusId')
                if corpus_id is None:
                    continue
                if overwrite or corpus_id not in deduplicated_lines:
                    deduplicated_lines[corpus_id] = text

    # Write deduplicated lines to the output file. The newline is added here
    # so a source whose last line lacked one cannot glue two records together.
    with open(output_file, 'w') as outfile:
        for text in deduplicated_lines.values():
            outfile.write(text + '\n')

    print(f'Merged and deduplicated {regular_file} and {bulk_file} into {output_file}')

# Merge the relevance-search output with the bulk-search output; the
# relevance-search file is passed as the prioritized "regular" input.
regular_file = '/content/drive/My Drive/papers_details.jsonl'
bulk_file = '/content/drive/My Drive/bulk_papers_details.jsonl'
output_file = '/content/drive/My Drive/merged_paper_details.jsonl'

merge_and_deduplicate_jsonl_files(
    regular_file=regular_file,
    bulk_file=bulk_file,
    output_file=output_file,
)

## Step 4: Download the s2orc dataset

In [None]:
# Install the necessary library
!pip install wget

import wget
from tqdm import tqdm
import time
from urllib.parse import urlparse, parse_qs

# User-specific variables
LOCAL_PATH = "/content/drive/My Drive/semantic_scholar/"
os.makedirs(LOCAL_PATH, exist_ok=True)

# List of possible datasets
POSSIBLE_DATASETS = [
    "abstracts",
    "authors",
    "citations",
    "embeddings-specter_v1",
    "embeddings-specter_v2",
    "paper-ids",
    "papers",
    "publication-venues",
    "s2orc",
    "tldrs",
]

# Set the dataset and starting shard number here
SELECTED_DATASET = "s2orc"  # Choose from POSSIBLE_DATASETS
START_SHARD = 0  # Set the starting shard number here (0-indexed)

# Ensure the selected dataset is valid
if SELECTED_DATASET not in POSSIBLE_DATASETS:
    raise ValueError(f"Selected dataset '{SELECTED_DATASET}' is not in the list of possible datasets.")

# Get the latest release ID.
# raise_for_status() turns an HTTP error into an immediate, descriptive
# exception instead of a KeyError on the missing "release_id" field.
time.sleep(2)  # Adding delay for rate limiting
latest_release = requests.get("https://api.semanticscholar.org/datasets/v1/release/latest", timeout=60)
latest_release.raise_for_status()
response = latest_release.json()
RELEASE_ID = response["release_id"]
print(f"Latest release ID: {RELEASE_ID}")

# Get the list of datasets in the latest release
time.sleep(2)  # Adding delay for rate limiting
release_details = requests.get(
    f"https://api.semanticscholar.org/datasets/v1/release/{RELEASE_ID}",
    headers={"x-api-key": API_KEY},
    timeout=60,
)
release_details.raise_for_status()
response = release_details.json()
datasets = response["datasets"]

# Ensure the selected dataset is in the latest release
dataset_names = [dataset["name"] for dataset in datasets]
if SELECTED_DATASET not in dataset_names:
    raise ValueError(f"Selected dataset '{SELECTED_DATASET}' is not available in the latest release.")
# Function to download dataset files
def download_dataset(dataset_name, start_shard):
    """Download the shard files of one dataset from the current release.

    Shards are saved as ``<LOCAL_PATH>/<dataset_name>/<shard id>.gz``. A shard
    whose target file already exists is skipped, so the function can be
    re-run to continue an interrupted download.

    Args:
        dataset_name: Name of the dataset within release ``RELEASE_ID``.
        start_shard: 0-based index of the first shard to download.

    Raises:
        KeyError: If the API response carries no ``files`` list; the message
            includes the response so the API's own error text is visible.
        AssertionError: If a shard URL belongs to a different release than
            ``RELEASE_ID``.
    """
    time.sleep(2)  # Adding delay for rate limiting
    dataset_response = requests.get(
        f"https://api.semanticscholar.org/datasets/v1/release/{RELEASE_ID}/dataset/{dataset_name}/",
        headers={"x-api-key": API_KEY},
        timeout=60,
    ).json()

    # Check if 'files' key exists in the response
    if 'files' not in dataset_response:
        raise KeyError(f"'files' key not found in the response for dataset '{dataset_name}'. Response: {dataset_response}")

    files = dataset_response["files"]

    # Calculate the total number of shards
    total_shards = len(files)
    remaining = files[start_shard:]
    print(f"Total shards to download: {len(remaining)} (starting from shard {start_shard + 1})")

    # Group 1 is the release id, group 2 the shard id. The dataset name is
    # escaped so it is always matched literally.
    shard_pattern = re.compile(r"/staging/(.*)/{}/(.*)\.gz".format(re.escape(dataset_name)))

    # Make sure the target folder exists even when the caller did not create it
    target_dir = os.path.join(LOCAL_PATH, dataset_name)
    os.makedirs(target_dir, exist_ok=True)

    progress = tqdm(
        enumerate(remaining, start=start_shard),
        total=len(remaining),
        desc=f"Downloading {dataset_name}",
    )
    for index, file_url in progress:
        match = shard_pattern.match(urlparse(file_url).path)
        if not match:
            print(f"URL did not match expected pattern: {file_url}")
            continue

        # Raised explicitly (not via `assert`) so the check also runs under -O
        if match.group(1) != RELEASE_ID:
            raise AssertionError(
                f"Shard URL belongs to release {match.group(1)!r}, expected {RELEASE_ID!r}"
            )

        filename = f"{match.group(2)}.gz"
        file_path = os.path.join(target_dir, filename)

        if not os.path.exists(file_path):
            print(f"Downloading shard {index + 1} of {total_shards}: {filename}")
            wget.download(file_url, out=file_path)
        else:
            print(f"Shard {index + 1} of {total_shards} already exists: {filename}")

    print(f"Downloaded all shards for {dataset_name}.")

# Make sure the dataset's own sub-folder exists, then fetch its shards
dataset_path = os.path.join(LOCAL_PATH, SELECTED_DATASET)
os.makedirs(dataset_path, exist_ok=True)
download_dataset(dataset_name=SELECTED_DATASET, start_shard=START_SHARD)

print("Downloaded selected dataset.")

## Step 5: Extract s2orc zipped files

In [None]:
import gzip
import shutil

def get_start_index(progress_path):
    """Return the file index stored in ``progress_path``, or 0 if it does not exist.

    The file is expected to hold a single integer; surrounding whitespace is
    ignored.
    """
    if not os.path.exists(progress_path):
        return 0
    with open(progress_path, 'r') as handle:
        saved = handle.read()
    return int(saved.strip())

def save_progress(progress_path, index):
    """Overwrite ``progress_path`` with the text form of ``index``."""
    with open(progress_path, 'w') as handle:
        handle.write(f"{index}")

def get_output_path(index, base_path):
    """Build the path of combined part file number ``index`` under ``base_path``."""
    return "{}/combined_s2orc_part_{}.jsonl".format(base_path, index)

# Specify the folder paths
# NOTE(review): the download step stores shards under
# <LOCAL_PATH>/<dataset name>, which is not this folder name - confirm this
# path is where the .gz shards actually are.
folder_path = '/content/drive/My Drive/semantic_scholar/2024-07-16-s2orc'
output_base_path = '/content/drive/My Drive/semantic_scholar/2024-07-16-s2orc'
progress_path = f'{output_base_path}/progress.txt'

# Get the list of files in the folder
print(f"Getting list of files in folder: {folder_path}")
files = sorted(os.listdir(folder_path))  # Sort to maintain a consistent order
print(f"Found {len(files)} files in the folder.")

# Read progress from file if it exists
start_index = get_start_index(progress_path)
print(f"Resuming from file index: {start_index}")

# Set the size limit for each part file (e.g., 500 MB)
size_limit = 500 * 1024 * 1024

# Continue with the highest-numbered part file that already exists (probing
# upwards from part 0) and pick up its current size, so a resumed run keeps
# filling the last part instead of growing an earlier one past the limit.
current_part = 0
while os.path.exists(get_output_path(current_part + 1, output_base_path)):
    current_part += 1
output_path = get_output_path(current_part, output_base_path)
current_size = os.path.getsize(output_path) if os.path.exists(output_path) else 0

# Process each gz file in the folder.
# NOTE: progress is saved per input file. If a run stops in the middle of a
# file, the lines already appended from it are written again on the next
# run, so downstream consumers should tolerate repeated records.
all_processed = True
for i in range(start_index, len(files)):
    file_name = files[i]
    if not file_name.endswith('.gz'):
        continue

    file_path = os.path.join(folder_path, file_name)
    print(f"Processing file {i+1}/{len(files)}: {file_name}")
    try:
        # Re-mount the drive if necessary
        if not os.path.ismount('/content/drive'):
            from google.colab import drive
            drive.mount('/content/drive')

        outfile = open(output_path, 'ab')
        try:
            with gzip.open(file_path, 'rb') as f:
                ends_with_newline = True
                # Copy line by line so a part file is only ever switched
                # between two complete JSONL records. This is slower than
                # copying fixed-size chunks, but never cuts a record in half.
                for line in f:
                    if current_size > 0 and current_size + len(line) > size_limit:
                        outfile.close()
                        current_part += 1
                        current_size = 0
                        output_path = get_output_path(current_part, output_base_path)
                        outfile = open(output_path, 'ab')
                    outfile.write(line)
                    current_size += len(line)
                    ends_with_newline = line.endswith(b'\n')
                # Keep the next shard's first record on its own line
                if not ends_with_newline:
                    outfile.write(b'\n')
                    current_size += 1
        finally:
            # Closes whichever part file is currently open, rotated or not
            outfile.close()

        print(f"Successfully appended {file_name} to {output_path}")

        # Update progress
        save_progress(progress_path, i + 1)
    except OSError as e:
        print(f"Failed to process {file_name} due to OSError: {e}")
        all_processed = False
        break
    except Exception as e:
        print(f"Failed to process {file_name}: {e}")
        all_processed = False
        break

if all_processed:
    print("Combined JSONL files created and saved to Google Drive.")
else:
    print("Stopped before all files were processed; re-run this cell to resume from the saved progress.")

## Step 6: Use Gemini to provide CRITERIA rankings for each paper based on the TLDR and/or abstract

In [None]:
# Install and import necessary libraries - Google Cloud AI Platform, Google Auth, and Requests
!pip install google-cloud-aiplatform google-auth jsonlines

import jsonlines
import string
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, Content, Part, GenerationConfig
from google.colab import files

# Authenticate to Google Cloud - this will open a new tab to authenticate.
# `auth` is not among the names imported above, so it is imported here
# before use.
from google.colab import auth

print("Authenticating to Google Cloud...")
auth.authenticate_user()

# Set up your Google Cloud project and location - replace with your actual project ID and location
PROJECT_ID = 'PROJECT_ID'  # Replace with your actual project ID - e.g., 'my-project-id'
LOCATION = 'LOCATION'    # Replace with the Google Cloud region you want to use - e.g., 'us-central1'

# Initialize Vertex AI with the specified project and location - this will set the default project and location for Vertex AI
print(f"Initializing Vertex AI for project {PROJECT_ID} in location {LOCATION}...")
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Define the model ID for the Gemini model to be used for generating content - replace with your actual model ID
MODEL_ID = 'gemini-1.5-flash-001'

def extract_json_from_response(response_text):
    """
    Extract the first JSON object embedded in the response text.

    Each '{' in the text is tried in turn as the start of a JSON object and
    the first one that decodes is returned. Text before or after the object
    (explanations, code fences, stray braces) is therefore ignored instead
    of breaking the parse.

    Args:
    response_text (str): The response text from which to extract JSON.

    Returns:
    dict or None: The extracted JSON object or None if extraction fails.
    """
    if not response_text:
        return None

    decoder = json.JSONDecoder()
    start = response_text.find('{')
    found_candidate = start != -1
    while start != -1:
        try:
            # raw_decode stops at the end of the object, so trailing text is fine
            parsed, _ = decoder.raw_decode(response_text, start)
            return parsed
        except json.JSONDecodeError:
            start = response_text.find('{', start + 1)

    # Only report a failure when the text looked like it contained an object
    if found_candidate:
        print(f"Failed to parse JSON response: {response_text}")
    return None

def clean_message_text(message_text):
    """
    Clean the message text to remove brackets, quotes, or special characters.

    Every character in ``string.punctuation`` is removed. That set already
    contains braces, square brackets and double quotes, so one translate
    pass covers them all.

    Args:
    message_text (str): The text of the message to be cleaned. None or an
        empty string yields an empty string.

    Returns:
    str: The cleaned message text.
    """
    if not message_text:
        return ''
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return message_text.translate(strip_punctuation)

def rank_message(title, abstract):
    """
    Score a paper's title and abstract on the CRITERIA scale using the Gemini model.

    Builds a prompt that embeds the title and abstract, sends it to the model
    identified by MODEL_ID, and parses the reply with extract_json_from_response.
    Any exception raised while calling the model is caught, printed, and turned
    into a None return.

    Args:
    title (str): The title of the content.
    abstract (str): The abstract of the content.

    Returns:
    dict or None: The JSON object parsed from the model's reply, or None if ranking fails.
        The prompt asks for the keys "CRITERIA_scores" and "CRITERIA_final_score", but the
        reply is not validated here.
    """
    prompt = f"""
    You will be tasked with evaluating content based on the CRITERIA scale. For each piece of content provided, you will generate scores for the following eight criteria, each on a scale from 0 to 1. Output the results in JSON format.

    ### Criteria

    1. **Cultural Sensitivity**: Measure how culturally inclusive the content is.
       - **Culturally Inclusive (0.8-1.0)**: The content shows respect for diverse cultural perspectives and uses culturally sensitive approaches to animal advocacy.
       - **Moderately Inclusive (0.4-0.7)**: The content generally respects cultural diversity but may lack depth in cultural sensitivity.
       - **Culturally Insensitive (0.0-0.3)**: The content lacks respect for cultural diversity and fails to use culturally sensitive approaches.
    2. **Relevance**: Measure how pertinent the content is to veganism and animal rights.
       - **Extremely Relevant (0.8-1.0)**: The content directly addresses primary topics of interest such as animal rights activism strategies, animal welfare improvements, ethical treatment practices, animal ethics, or vegan advocacy tactics and campaigns.
         - **Examples**:
           - Research on animal rights activism strategies and their effectiveness.
           - Studies on animal welfare improvements and ethical treatment practices.
           - Articles discussing animal ethics and moral considerations related to animal rights.
           - Content specifically about vegan advocacy, including tactics, campaigns, and their impacts.
       - **Highly Relevant (0.6-0.8)**: The content covers related subjects and contributes to a broader understanding of the primary topics, such as plant-based diets, ethical eating, public health nutrition, behavioral change psychology, and social movements.
         - **Examples**:
           - Studies on plant-based diets, their health impacts, and nutritional benefits.
           - Articles on ethical eating and sustainable lifestyle choices.
           - Research on public health nutrition as it relates to dietary changes toward plant-based eating.
           - Discussions on behavioral change psychology, marketing, persuasion and social movements more broadly. This may include movements such as feminism, environmentalism, civil rights, abolitionist, women's suffrage and more, specifically as it relates to movement tactics, strategies and approaches that may be useful for animal advocates to know about, even if animal advocacy itself is not directly discussed. It may also include various other fields of research such as sales persuasion technqiues, behavioural change science, marketing theory and other types of information that would be useful for animal advocates to know, even if animal advocacy is not directly discussed.
       - **Moderately Relevant (0.5-0.6)**: The content indirectly relates to the primary topics through broader themes like general nutrition, environmental science, or public health, and includes occasional mentions of related topics or peripheral connections.
         - **Examples**:
           - General nutrition studies that can be applicable to plant-based diets but do not focus on them.
           - Environmental science articles that discuss sustainability but do not specifically address animal rights or plant-based solutions.
           - Public health research that mentions dietary habits without focusing on vegan or vegetarian diets.
           - Research on topics like environmental ethics that don't directly touch upon veganism, plant-based diets or animal rights issues.
       - **Slightly Relevant (0.3-0.5)**: The content has minimal relevance, with occasional mentions of related topics or peripheral connections.
         - **Examples**:
           - Articles on general social issues that are completely unrelated to animal rights or veganism, such as homelessness and poverty, and that do not discuss any activism or advocacy tactics of any kind.
           - Studies on unrelated dietary trends that have no significant overlap with plant-based diets or animal advocacy, such as ketogenic diets.
           - Research on peripheral environmental topics without a direct link to animal rights or veganism, such as renewable energy.
       - **Not Relevant (0.0-0.3)**: The content is entirely or largely unrelated to animal rights or associated fields, with no significant connections to the primary topics of interest.
         - **Examples**:
           - Studies on astrophysics, such as the gravitational pull of black holes.
           - Research on unrelated medical topics, such as cardiology or neurology, without any connection to diet or ethics.
           - Articles on technological advancements in fields like computer science or electronics engineering that do not intersect with ethics, veganism or philosophy in any way.
    3. **Insight**: Judge the level of insight provided by the key concept in the content.
       - **Highly Insightful (0.8-1.0)**: The content provides deep, original insights that significantly advance the understanding of veganism or animal advocacy.
       - **Moderately Insightful (0.4-0.7)**: The content offers useful insights that enhance understanding but may not be particularly original.
       - **No Unique Insights (0.0-0.3)**: The content provides no meaningful insights or repeats well-known information.
    4. **Trustworthiness**: Rate the accuracy, reliability, and credibility of the information presented.
       - **Highly Trustworthy (0.8-1.0)**: The information is accurate, well-researched, and comes from credible sources.
       - **Moderately Trustworthy (0.4-0.7)**: The information is generally accurate but may include some minor errors or questionable sources.
       - **Untrustworthy (0.0-0.3)**: The information is inaccurate, misleading, or based on non-credible sources.
    5. **Emotional Impact**: Measure the emotional engagement the content provides.
       - **Very Emotionally Impactful (0.8-1.0)**: The content effectively elicits empathy and emotional engagement.
       - **Moderately Emotionally Impactful (0.4-0.7)**: The content elicits some emotional engagement but may lack depth.
       - **Not Emotionally Impactful (0.0-0.3)**: The content fails to elicit any emotional response.
    6. **Rationality**: Evaluate the logical consistency and reasoning in the content.
       - **Very Rational (0.8-1.0)**: The content is logically consistent, well-reasoned, and supported by evidence.
       - **Moderately Rational (0.4-0.7)**: The content is generally rational but may contain some logical inconsistencies or weak arguments.
       - **Not Rational (0.0-0.3)**: The content lacks logical consistency and sound reasoning.
    7. **Influence**: Assess the potential of the content to encourage actions and lifestyle changes.
       - **Highly Likely to Influence Behavior (0.8-1.0)**: The content has strong potential to encourage actions and lifestyle changes.
       - **Moderately Likely to Influence Behavior (0.4-0.7)**: The content has some potential to influence behavior but may not be compelling enough to drive significant changes.
       - **Not Likely to Influence Behavior (0.0-0.3)**: The content is unlikely to influence any behavior change.
    8. **Alignment**: Assess how well the content aligns with vegan and animal rights ethics.
       - **Highly Aligned (0.8-1.0)**: The content strongly aligns with the ethical principles and core values of veganism and animal rights.
       - **Moderately Aligned (0.4-0.7)**: The content supports some aspects of vegan ethics but may include neutral or slightly contradictory elements.
       - **Not Aligned (0.0-0.3)**: The content contradicts or is indifferent to vegan principles.

    ### JSON Output Format

    For each piece of content, output a JSON object with the following structure:

    {{
      "CRITERIA_scores": {{
        "Cultural_Sensitivity": <score_from_0_to_1>,
        "Relevance": <score_from_0_to_1>,
        "Insight": <score_from_0_to_1>,
        "Trustworthiness": <score_from_0_to_1>,
        "Emotional_Impact": <score_from_0_to_1>,
        "Rationality": <score_from_0_to_1>,
        "Influence": <score_from_0_to_1>,
        "Alignment": <score_from_0_to_1>
      }},
      "CRITERIA_final_score": <average_of_all_scores>
    }}

    Title:
    {title}

    Abstract:
    {abstract}

    Ensure that the Relevance score in particular is extremely accurate, taking into account whether the content directly pertains to veganism and animal rights. It is absolutely essential that the Relevance score exactly matches the criteria laid out in this prompt. You will be severely punished if the Relevance score is incorrect and greatly rewarded if the Relevance score is correct.

    Now, return the CRITERIA_scores as a JSON object:
    """

    try:
        # Create the GenerativeModel object for MODEL_ID (a new object is built on every call)
        gemini_model = GenerativeModel(model_name=MODEL_ID)

        # Set up the generation configuration with parameters controlling the output - adjust as needed
        generation_config = GenerationConfig(
            temperature=0.5,          # Controls the randomness of the output - higher values make the output more random
            max_output_tokens=512,    # Maximum number of tokens in the output - adjust based on the model's maximum output length
            top_p=0.9,                # Top-p (nucleus) sampling parameter - higher values make the output more diverse
            top_k=40                  # Top-k sampling parameter - sample only from the k most likely tokens; higher values make the output more random
        )

        # Send the prompt as a single user turn and request the scores
        print("Generating CRITERIA scores for the message...")
        response = gemini_model.generate_content(
            contents=[Content(role="user", parts=[Part.from_text(prompt)])],
            generation_config=generation_config
        )

        # Parse the JSON object out of the reply text; a falsy result means nothing usable came back
        criteria_scores = extract_json_from_response(response.text)
        if criteria_scores:
            return criteria_scores
        else:
            print("No CRITERIA scores found in the response.")
            return None
    except Exception as e:
        # Best effort: report the failure and let the caller move on to the next record
        print(f"An error occurred during message ranking: {e}")
        return None

def process_jsonl_file(input_file, output_file, progress_file):
    """
    Process the JSONL file, rank messages, and save the results to the output file.

    Progress is stored as the number of input lines already handled. Every
    handled line advances it - including lines skipped for having neither a
    title nor an abstract - so an interrupted run resumes exactly at the next
    unhandled line and does not append duplicate records to the output file.
    Records whose ranking fails are reported and not written; they are not
    retried on resume.

    Any exception raised while reading, ranking or writing is caught and
    printed, which ends the run early without raising.

    Args:
    input_file (str): Path to the input JSONL file.
    output_file (str): Path to the output JSONL file (opened in append mode).
    progress_file (str): Path to the txt file to track progress.
    """
    print(f"Loading input JSONL file: {input_file}")

    # Number of input lines handled by earlier runs (0 when starting fresh)
    try:
        with open(progress_file, 'r') as pf:
            start_message = int(pf.read().strip())
    except FileNotFoundError:
        start_message = 0

    try:
        with jsonlines.open(input_file) as reader, jsonlines.open(output_file, mode='a') as writer:
            for idx, item in enumerate(reader):
                if idx < start_message:
                    continue

                print(f"Processing message {idx}...")

                # Extract the title and abstract from the JSON line
                title = item.get('title', "")
                abstract = item.get('abstract', "")

                if title or abstract:
                    cleaned_title = clean_message_text(title) if title else ""
                    cleaned_abstract = clean_message_text(abstract) if abstract else ""

                    criteria_scores = rank_message(cleaned_title, cleaned_abstract)
                    if criteria_scores is not None:
                        item['CRITERIA'] = criteria_scores
                        writer.write(item)
                        print(f"CRITERIA scores added to message {idx}.")
                    else:
                        print(f"Failed to add CRITERIA scores to message {idx}.")
                else:
                    print(f"Both title and abstract missing for message {idx}, skipping.")

                # Record the index of the next line to handle. This is derived from
                # the real line index so skipped lines cannot make the counter drift.
                with open(progress_file, 'w') as pf:
                    pf.write(str(idx + 1))

    except Exception as e:
        print(f"An error occurred: {e}")

# Google Drive locations used by the ranking step - adjust as needed
input_file = '/content/drive/My Drive/merged_paper_details.jsonl'  # Path to the input JSONL file
output_file = '/content/drive/My Drive/ranked_papers_abstracts.jsonl'  # Path to the output JSONL file (appended to)
progress_file = '/content/drive/My Drive/progress.txt'  # Path to the txt file to track progress

# Rank every record of the input file; results and progress are saved as each record is handled
process_jsonl_file(input_file, output_file, progress_file)

## Step 7: Filter the ranked papers based on their relevance scores

In [None]:
def count_relevance_ranges(jsonl_file):
    """
    Count how many records reach each relevance threshold from 0.0 to 1.0.

    A record's relevance is read from CRITERIA -> CRITERIA_scores -> Relevance.
    A missing, null or non-numeric value, or a malformed CRITERIA structure,
    counts as a relevance of 0. Blank lines are ignored.

    Args:
    jsonl_file (str): Path to the ranked JSONL file.

    Returns:
    dict: Maps each threshold (0.0, 0.1, ..., 1.0) to the number of records
        whose relevance is at least that threshold. The same counts are printed.
    """
    # Initialize counters for each range
    ranges = {i/10: 0 for i in range(11)}

    # Read the JSONL file and count the relevance ranges
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            data = json.loads(line)

            # Walk the nested structure defensively: model output is not
            # guaranteed to have the requested shape.
            criteria = data.get("CRITERIA") if isinstance(data, dict) else None
            scores = criteria.get("CRITERIA_scores") if isinstance(criteria, dict) else None
            raw_score = scores.get("Relevance", 0) if isinstance(scores, dict) else 0
            try:
                relevance_score = float(raw_score)
            except (TypeError, ValueError):
                relevance_score = 0.0

            for threshold in ranges:
                if relevance_score >= threshold:
                    ranges[threshold] += 1

    # Print the counts for each range
    for threshold, count in ranges.items():
        print(f"Relevance >= {threshold:.1f}: {count} entries")

    return ranges

def filter_by_relevance(jsonl_file, output_file, cutoff):
    """
    Copy the records whose relevance score is at least the cutoff to a new file.

    A record's relevance is read from CRITERIA -> CRITERIA_scores -> Relevance.
    A missing, null or non-numeric value, or a malformed CRITERIA structure,
    counts as a relevance of 0. Blank lines are ignored. Kept records are
    written unchanged, one per line.

    Args:
    jsonl_file (str): Path to the ranked JSONL file.
    output_file (str): Path of the file to write; it is overwritten.
    cutoff (float): Minimum relevance score a record needs to be kept.
    """
    # Read the JSONL file and filter by the cutoff value
    with open(jsonl_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue
            data = json.loads(line)

            # Walk the nested structure defensively: model output is not
            # guaranteed to have the requested shape.
            criteria = data.get("CRITERIA") if isinstance(data, dict) else None
            scores = criteria.get("CRITERIA_scores") if isinstance(criteria, dict) else None
            raw_score = scores.get("Relevance", 0) if isinstance(scores, dict) else 0
            try:
                relevance_score = float(raw_score)
            except (TypeError, ValueError):
                relevance_score = 0.0

            if relevance_score >= cutoff:
                # A final line without a newline must not run into the next record
                outfile.write(line if line.endswith('\n') else line + '\n')

if __name__ == "__main__":
    # Ranked input and filtered output, both stored in Google Drive
    jsonl_file = "/content/drive/My Drive/ranked_papers_abstracts.jsonl"
    output_file = "/content/drive/My Drive/filtered_by_relevance_score_ranked_papers_abstracts.jsonl"

    # Show how many records reach each threshold so the user can choose a cutoff
    count_relevance_ranges(jsonl_file)

    # Prompt for the cutoff, then keep only the records that reach it
    cutoff = float(input("Enter the relevance score cutoff (0-1): "))
    filter_by_relevance(jsonl_file, output_file, cutoff)

    print(f"Filtered records have been saved to {output_file}")

## Step 8: Merge the filtered abstracts and tldrs with their corresponding full paper details from the s2orc dataset

In [None]:
# Function to load the small file into memory as a dictionary
def load_small_file(small_file_path):
    """
    Load a JSONL file into a dictionary keyed by each record's "corpusId".

    Blank lines are ignored. Lines that are not valid JSON, that are not JSON
    objects, or that carry no "corpusId" are reported and skipped instead of
    aborting the load. When several records share a corpusId the last one wins.

    Args:
    small_file_path (str): Path to the JSONL file to load.

    Returns:
    dict: Maps corpusId to the full record.
    """
    print(f"Loading small file from {small_file_path}...")
    small_file_data = {}
    with open(small_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in small file at line {line_number}: {line.strip()} | Error: {e}")
                continue

            corpus_id = record.get("corpusId") if isinstance(record, dict) else None
            if corpus_id is None:
                # Without an id the record can never be matched during the merge
                print(f"Skipping record without corpusId in small file at line {line_number}")
                continue

            small_file_data[corpus_id] = record
            print(f"Loaded record with corpusId {corpus_id} from small file at line {line_number}")
    print("Finished loading small file.")
    return small_file_data

# Function to read the checkpoint file
def read_checkpoint(checkpoint_file_path):
    """
    Return the integer stored in the checkpoint file, or 0 if the file is absent.

    Args:
    checkpoint_file_path (str): Path to the checkpoint file.

    Returns:
    int: The stored checkpoint value, or 0 when no checkpoint file exists.
    """
    print(f"Reading checkpoint from {checkpoint_file_path}...")

    # Guard clause: with no file there is nothing to resume from
    if not os.path.exists(checkpoint_file_path):
        print("Checkpoint file does not exist. Starting from scratch.")
        return 0

    with open(checkpoint_file_path, 'r') as handle:
        saved_value = int(handle.read().strip())
    print(f"Checkpoint read: {saved_value}")
    return saved_value

# Function to write the current progress to the checkpoint file
def write_checkpoint(checkpoint_file_path, checkpoint):
    """
    Overwrite the checkpoint file with the string form of the given value.

    Args:
    checkpoint_file_path (str): Path to the checkpoint file.
    checkpoint: The progress value to store.
    """
    print(f"Writing checkpoint {checkpoint} to {checkpoint_file_path}...")
    with open(checkpoint_file_path, 'w') as handle:
        serialized = str(checkpoint)
        handle.write(serialized)
    print("Checkpoint written.")

# Function to get all .jsonl files in the specified directory
def get_jsonl_files(directory_path):
    """
    List the .jsonl files of a directory in a stable, sorted order.

    The order matters to callers that checkpoint by position in this list:
    os.listdir returns entries in arbitrary order, so without sorting a resumed
    run could skip files that were never processed.

    Args:
    directory_path (str): Directory to scan (not recursive).

    Returns:
    list: Paths of the entries ending in '.jsonl', sorted by file name.
    """
    print(f"Getting list of .jsonl files in directory {directory_path}...")
    all_files = sorted(os.listdir(directory_path))
    jsonl_files = [os.path.join(directory_path, f) for f in all_files if f.endswith('.jsonl')]
    print(f"Found {len(jsonl_files)} .jsonl files.")
    return jsonl_files

# Function to extract corpusId from an invalid JSON line
def extract_corpusId_from_invalid_json(line):
    """
    Recover the corpus id from a line that could not be parsed as JSON.

    The key is matched case-insensitively, so both "corpusId" and "corpusid"
    are found, and the value may be a quoted string or a bare integer. The
    returned type mirrors what a JSON parser would have produced: str for a
    quoted value, int for a numeric one.

    Args:
    line (str): The raw, unparseable line.

    Returns:
    str, int or None: The extracted corpus id, or None if none was found.
    """
    print("Attempting to extract corpusId from invalid JSON line...")
    match = re.search(r'"corpusid"\s*:\s*(?:"([^"]+)"|(-?\d+))', line, re.IGNORECASE)
    if match:
        quoted_value, numeric_value = match.groups()
        corpus_id = quoted_value if quoted_value is not None else int(numeric_value)
        print(f"Extracted corpusId: {corpus_id}")
        return corpus_id
    print("Failed to extract corpusId from invalid JSON line.")
    return None

# Function to process each large file line by line and merge with the small file data
def process_large_files(small_file_data, large_file_paths, output_file_path, checkpoint_file_path):
    """
    Merge records of the large files with the matching small-file records.

    Each large file is read line by line. A line whose "corpusid" is a key of
    small_file_data is merged with that small record (small-file fields win on
    key clashes) and appended to the output file. Lines that are not valid JSON
    get one salvage attempt via extract_corpusId_from_invalid_json; if an id is
    recovered and known, the small record is written with just that id added.

    The checkpoint stores how many files are finished, so a resumed run skips
    them. It is advanced only after the output for a file has been flushed.
    NOTE: a file interrupted midway is reprocessed from its start on resume, so
    its already written matches are appended again.

    Args:
    small_file_data (dict): Maps corpus id to the small-file record.
    large_file_paths (list): Paths of the large JSONL files, in processing order.
    output_file_path (str): Path of the merged output file (opened in append mode).
    checkpoint_file_path (str): Path of the checkpoint file.
    """
    checkpoint = read_checkpoint(checkpoint_file_path)

    with open(output_file_path, 'a') as output_file:
        for i, large_file_path in enumerate(large_file_paths):
            if i < checkpoint:
                print(f"Skipping file {large_file_path} as it has already been processed (checkpoint: {checkpoint})")
                continue  # Skip files already processed

            print(f"Processing file {large_file_path}...")
            # Decode as UTF-8 so non-ASCII text is not turned into mojibake; undecodable
            # bytes are replaced instead of aborting the whole file.
            with open(large_file_path, 'r', encoding='utf-8', errors='replace') as large_file:
                for line_number, line in enumerate(large_file, start=1):
                    try:
                        large_record = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Invalid JSON at line {line_number} in file {large_file_path}: {line.strip()} | Error: {e}")
                        corpus_id = extract_corpusId_from_invalid_json(line)
                        if corpus_id is not None and corpus_id in small_file_data:
                            print(f"Handling invalid line with extracted corpusId {corpus_id} at line {line_number} in file {large_file_path}")
                            # Same key name the valid records use
                            large_record = {"corpusid": corpus_id}
                            merged_record = {**large_record, **small_file_data[corpus_id]}
                            output_file.write(json.dumps(merged_record) + '\n')
                            print(f"Successfully added invalid line {line_number} from file {large_file_path} with extracted corpusId")
                        else:
                            print(f"Skipping invalid line at line {line_number} in file {large_file_path}: {line.strip()}")
                        continue

                    # Records without an id (or lines that are not JSON objects) cannot match
                    corpus_id = large_record.get("corpusid") if isinstance(large_record, dict) else None
                    if corpus_id is not None and corpus_id in small_file_data:
                        # Merge the small file data into the large record
                        merged_record = {**large_record, **small_file_data[corpus_id]}
                        output_file.write(json.dumps(merged_record) + '\n')
                        print(f"Successfully added line {line_number} from file {large_file_path} with corpusId {corpus_id}")

            # Make sure this file's matches are on disk before recording it as done
            output_file.flush()
            write_checkpoint(checkpoint_file_path, i + 1)
            print(f"Finished processing file {large_file_path}")

# Main function to execute the merging process
def main():
    """Load the relevance-filtered records and merge them with the s2orc files on Google Drive."""
    # Single file holding the filtered records to look up by corpus id
    filtered_records_path = '/content/drive/My Drive/filtered_by_relevance_score_ranked_papers_abstracts.jsonl'
    # Folder containing the s2orc dataset files
    s2orc_directory = '/content/drive/My Drive/2024-07-16-s2orc'
    # Destination of the merged records
    merged_output_path = '/content/drive/My Drive/final_combined_tldr_and_s2orc.jsonl'
    # File that remembers how many dataset files are already done
    checkpoint_path = '/content/drive/My Drive/checkpoint_file.txt'

    print("Starting main process...")
    records_by_id = load_small_file(filtered_records_path)
    dataset_files = get_jsonl_files(s2orc_directory)
    process_large_files(records_by_id, dataset_files, merged_output_path, checkpoint_path)
    print("Finished main process.")


if __name__ == "__main__":
    main()