In [None]:
# Mount Google Drive (Colab only) so the dataset files read later from
# /content/drive/MyDrive are available to this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prefer a token supplied through the environment so the real secret never has
# to be saved inside the notebook. The literal is only a placeholder fallback:
# replace it (or export REPLICATE_API_TOKEN) before calling the model.
import os

YOUR_KEY_HERE = os.environ.get("REPLICATE_API_TOKEN") or "KEYKEYKEYKEYKEY"

In [None]:
!pip install -q langchain replicate

In [None]:
from typing import Dict, List
from langchain.llms import Replicate
from langchain.memory import ChatMessageHistory
from langchain.schema.messages import get_buffer_string
import os

# Expose the token through the environment; presumably this is where the
# Replicate client picks up its credential, since no token is passed explicitly.
os.environ["REPLICATE_API_TOKEN"] = YOUR_KEY_HERE

In [None]:
# Replicate model identifier for Llama 2 13B chat, written as "<model>:<version>".
LLAMA2_13B_CHAT = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"

In [None]:
# Model used by the helper functions when no `model` argument is given.
DEFAULT_MODEL = LLAMA2_13B_CHAT

def completion(
    prompt: str,
    model: str = DEFAULT_MODEL,
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_new_tokens: int = 1000,
) -> str:
    """Send one prompt to a Replicate-hosted model and return its reply.

    Args:
        prompt: Full prompt text handed to the model.
        model: Replicate model identifier; defaults to ``DEFAULT_MODEL``.
        temperature: Sampling temperature forwarded in ``model_kwargs``.
        top_p: Nucleus-sampling threshold forwarded in ``model_kwargs``.
        max_new_tokens: Cap on generated tokens forwarded in ``model_kwargs``.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        Whatever the ``Replicate`` LLM object returns for the prompt.
    """
    llm = Replicate(
        model=model,
        model_kwargs={
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_tokens,
        },
    )
    return llm(prompt)

def chat_completion(
    messages: List[Dict],
    model: str = DEFAULT_MODEL,
    temperature: float = 0.6,
    top_p: float = 0.9,
) -> str:
    """Flatten a chat transcript into one prompt and complete it.

    Args:
        messages: Dicts with a ``"role"`` (``"user"`` or ``"assistant"``) and a
            ``"content"`` entry, in conversation order.
        model: Replicate model identifier; defaults to ``DEFAULT_MODEL``.
        temperature: Sampling temperature passed on to ``completion``.
        top_p: Nucleus-sampling threshold passed on to ``completion``.

    Returns:
        The reply produced by ``completion`` for the rendered transcript.

    Raises:
        ValueError: If a message carries a role other than the two above.
    """
    history = ChatMessageHistory()
    for message in messages:
        role = message["role"]
        if role == "user":
            history.add_user_message(message["content"])
        elif role == "assistant":
            history.add_ai_message(message["content"])
        else:
            # ValueError is a subclass of Exception, so handlers written for the
            # previous bare Exception keep working; the role is now reported.
            raise ValueError(f"Unknown role: {role!r}")

    # Render the history as "USER: ..." / "ASSISTANT: ..." lines.
    transcript = get_buffer_string(
        history.messages,
        human_prefix="USER",
        ai_prefix="ASSISTANT",
    )
    return completion(
        transcript,
        model=model,
        temperature=temperature,
        top_p=top_p,
    )

def assistant(content: str):
    """Wrap ``content`` as a chat message whose role is ``"assistant"``."""
    return dict(role="assistant", content=content)

def user(content: str):
    """Wrap ``content`` as a chat message whose role is ``"user"``."""
    return dict(role="user", content=content)

def complete_and_print(prompt: str, model: str = DEFAULT_MODEL):
    """Print ``prompt`` framed by two rule lines, then the model's completion.

    The completion is followed by a blank line so consecutive calls stay
    visually separated.
    """
    rule = '=============='
    print('\n'.join([rule, f'{prompt}', rule]))
    answer = completion(prompt, model)
    print(answer, end='\n\n')

In [None]:
import pandas as pd

In [None]:
# Root folder of the SimpleText task 2 training data on the mounted Drive.
# The earlier value omitted the "SimpleText" folder and was never used, so each
# file path was spelled out by hand; the paths built below are the same ones
# that were previously hard-coded.
simpleText_path = "/content/drive/MyDrive/BIP/SimpleText/task 2/train/dataset"

# All three files are tab-separated.
terms = pd.read_csv(f"{simpleText_path}/training/terms.tsv", delimiter='\t')
definitions = pd.read_csv(f"{simpleText_path}/training/definitions_explanations.tsv", delimiter='\t')
documents = pd.read_csv(f"{simpleText_path}/documents.tsv", delimiter='\t')
documents

In [None]:
# Preview the first rows of the definitions table.
definitions.head()

In [None]:
# Left join: keep every row of `definitions` and attach the `documents`
# columns that share the same `snt_id`.
merged = pd.merge(definitions, documents, how='left', on='snt_id')
merged

In [None]:
# Tab-separated documents of the test split; the rest of the notebook works on this frame.
test = pd.read_csv(
    '/content/drive/MyDrive/BIP/SimpleText/task 2/test/dataset/documents.tsv',
    sep='\t',
)
test

In [None]:
# Instruction prompt for term extraction; callers append the source sentence
# directly after it. The trailing backslashes join the example sentence onto
# one logical line inside the string.
# NOTE(review): the example answer is written with single quotes rather than
# strict JSON double quotes, although the prompt asks for JSON.
prompt_terms="""
    You are a robot that ONLY outputs JSON.
    You reply in JSON format with the field 'terms'.
    You provide ONLY semicolon-separated  list of MAXIMUM 3 scientific terms of a source sentence ONLY.
    You DO NOT add 'Sure, Here are the scientific terms of your sentence:'.
    Example source sentence: In the modern era of automation and robotics, \
    autonomous vehicles are currently the focus of academic and industrial research.? \
    Example answer: {'terms': 'robotics; autonomous vehicles'}
    Now here is my sentence:
"""

In [None]:
# Smoke test: run the term-extraction prompt on the sentence labelled 0 and
# show the raw model reply.
c=completion(prompt_terms+test['source_snt'][0])
c

In [None]:
import json
import re

def extract_value_inside_curly_braces(text):
    """Return the text between the first innermost ``{...}`` pair in ``text``.

    The captured text contains no braces itself and is returned untouched
    (quotes and whitespace included). ``None`` is returned when ``text`` has no
    such pair.
    """
    found = re.compile(r"\{([^{}]*)\}").search(text)
    return found.group(1) if found else None

In [None]:
def extract_terms_from_string(sentence):
    """Pull the value of the ``terms`` field out of a model reply.

    The reply is expected to contain a brace-delimited object such as
    ``{'terms': 'robotics; autonomous vehicles'}``. Both single-quoted and
    double-quoted (strict JSON) forms are accepted, and whitespace is tolerated
    around the colon and before the closing brace, so a well-formed JSON answer
    is no longer discarded.

    Args:
        sentence: Raw text returned by the model.

    Returns:
        The raw (still semicolon-separated) terms string, or ``None`` when
        ``sentence`` is not a string or holds no recognisable ``terms`` field.
    """
    if not isinstance(sentence, str):
        return None

    # Group 1 captures a single-quoted value, group 2 a double-quoted one.
    match = re.search(
        r"""\{[^{}]*(?:'terms'|"terms")\s*:\s*(?:'([^']*)'|"([^"]*)")\s*\}""",
        sentence,
    )
    if match is None:
        return None

    single_quoted, double_quoted = match.groups()
    return single_quoted if single_quoted is not None else double_quoted

In [None]:
# Only the first 5 sentences are processed (each one costs a model call).
# `assign` builds a new frame instead of writing a column into the slice that
# `head()` returns, which avoids pandas' SettingWithCopyWarning. Sentences whose
# reply contained no parsable terms (None) are dropped.
test = (
    test.head(5)
    .assign(
        all_terms=lambda frame: frame['source_snt'].apply(
            lambda sentence: extract_terms_from_string(completion(prompt_terms + sentence))
        )
    )
    .dropna(subset=['all_terms'])
)
test

In [None]:
# Turn the semicolon-separated `all_terms` string into one row per term.
# Each term is stripped, because splitting "a; b" on ";" leaves " b" and the
# stray space defeats de-duplication and leaks into later prompts and output.
# Blank fragments are dropped, and the index is rebuilt after de-duplication so
# it stays contiguous.
test = (
    test.assign(term=test['all_terms'].str.split(';'))
    .explode('term')
    .assign(term=lambda frame: frame['term'].str.strip())
    .loc[lambda frame: frame['term'].str.len() > 0]
    .drop_duplicates(subset=['snt_id', 'term'])
    .reset_index(drop=True)
)
test

In [None]:
# Instruction prompt for rating one term as e (easy), m (medium) or
# d (difficult); callers append the term directly after it.
# NOTE(review): the format instruction shows `{e}` while the example answer
# shows `{'m'}` (quoted), so replies may come back in either form.
prompt_difficulty="""
    You are a robot that rates the difficulty of different terms.
    You provide ONE LEVEL o difficulty for scientific terms.
    You need to consider two words as one term.
    Provide ONE rating for the understablity difficulty of term provided.
    There are 3 levels. You need to use: e for easy, m for medium and d for difficult.
    Give the rating inside of curly braces like this {e}
    You can reply with ONLY one word.
    Example source: autonomous vehicles
    Example answer: {'m'}
    Now here is my sentence:
"""

In [None]:
# Smoke test: rate the term labelled 0 and show the raw model reply.
c=completion(prompt_difficulty+test['term'][0])
c

In [None]:
# Keep only the text between the braces of the reply held in `c`.
# `c` is overwritten, so re-running this cell on its own feeds the already
# extracted value back into the extractor.
c = extract_value_inside_curly_braces(c)
c

In [None]:
def _normalize_difficulty(raw):
    """Reduce the text captured between braces to a bare rating such as ``'d'``.

    The prompt's own example answer is ``{'m'}``, so the captured text may come
    back wrapped in quotes (``"'m'"``); left as is, it would never equal the
    plain ``'d'`` that later cells filter on. Surrounding whitespace and quotes
    are removed and the result is lower-cased. Non-string or empty input
    yields ``None``.
    """
    if not isinstance(raw, str):
        return None
    cleaned = raw.strip().strip("'\"").strip().lower()
    return cleaned or None


# One model call per term; the reply's brace content becomes the rating.
test['difficulty'] = test['term'].apply(
    lambda x: _normalize_difficulty(
        extract_value_inside_curly_braces(completion(prompt_difficulty + x))
    )
)
test

In [None]:
!pip install wikipedia

In [None]:
import wikipedia

def get_wikipedia_definition(term):
    """Return the Wikipedia summary for ``term``.

    Lookup failures never raise. Instead a short description is returned in
    place of the summary: a fixed message for an ambiguous term, a fixed
    message for a missing page, and the exception text for anything else.
    """
    try:
        return wikipedia.summary(term)
    except wikipedia.exceptions.DisambiguationError:
        # The title maps to several candidate pages.
        return "DisambiguationError: Ambiguous term"
    except wikipedia.exceptions.PageError:
        # No page exists under this title.
        return "PageError: Term not found"
    except Exception as error:
        # Any other failure is reported through its own message.
        return str(error)

# Fetch a Wikipedia summary only for the terms rated difficult ('d'). The
# result is stored in the `wiki` column and rows with another rating are
# not assigned.
test.loc[test['difficulty'].eq('d'), 'wiki'] = (
    test.loc[test['difficulty'].eq('d'), 'term'].apply(get_wikipedia_definition)
)
test

In [None]:
# Few-shot instruction prompt for explaining a term; callers append the term
# directly after it.
# NOTE(review): the prompt asks for one sentence and mentions CONTEXT
# INFORMATION, yet the examples contain several sentences (with an embedded
# tab) and the visible callers append only the term — confirm this is intended.
prompt_explanation="""
    You are a robot that explains difficult scientific terms.
    DO NOT add intro like "Sure, I'd be happy to help!"
    Use only once sentance and wrap the sentance in curly braces.
    Don’t justify your answers. Don’t give information not mentioned in the CONTEXT INFORMATION.
    Example source: wireless network environment
    Example answer: {'a system in which devices makes use of Radio Frequency connections between nodes in the network	a system in which devices are connected to a network without the need for physical cables or wires'}
    Example source: Bluetooth wireless technology
    Example answer: {'short-range wireless communication technology that allows devices to connect and exchange data.	It facilitates data exchange between devices like smartphones, computers, and peripherals such as headphones or medical devices. Bluetooth technology eliminates the need for physical cables, providing convenience and versatility in device connectivity.'}
    Example source: application
    Example answer: {'software program or tool designed to perform specific tasks or functions on electronic devices.	It can range from productivity tools and games to utilities and communication platforms on electronic devices such as computers, smartphones, or tablets.'}
    Example source: PDA
    Example answer: {'PDA is the acronym for personal digital assistant, which is a handheld electronic device designed for personal organization, communication, and information access. PDAs may include features such as calendars, contact lists, and note-taking capabilities, serving as portable tools for managing daily tasks.	PDA is the acronym for personal digital assistant, which is a handheld electronic device crafted for personal organization, communication, and information retrieval. PDAs often incorporate features like calendars, contact lists, and note-taking capabilities, functioning as portable tools for managing daily tasks and staying connected. While modern smartphones have largely replaced traditional PDAs, the concept influenced the development of contemporary mobile devices.'}
    Example source: pilot study
    Example answer: {'a preliminary research investigation conducted on a small scale to assess the feasibility, and potential challenges of a larger research project.	an initial and smaller-scale research investigation undertaken to evaluate the feasibility, methodology, and potential obstacles of a larger research project. It serves as a testing ground to refine the study design, identify logistical issues, and enhance the overall robustness and effectiveness of the planned full-scale research endeavor.'}
    Now here is my ONE sentence explanation:
"""

In [None]:
# Ask the model for an explanation of every term rated difficult ('d'); the
# raw reply is stored in the `explanation` column for exactly those rows.
test.loc[test['difficulty'].eq('d'), 'explanation'] = (
    test.loc[test['difficulty'].eq('d'), 'term']
    .apply(lambda term: completion(prompt_explanation + term))
)
test

In [None]:
# Smoke test: request an explanation for the term labelled 0 (whatever its
# rating) and show the raw model reply.
c=completion(prompt_explanation+test['term'][0])
c

In [None]:
import re

# Boilerplate phrases the model tends to add, compiled once. Order matters: the
# text is re-stripped after every removal, so a later "^"-anchored phrase can
# match once an earlier one has been cut away. Literal dots are escaped so they
# no longer match arbitrary characters, and the phrase that used to be listed
# twice appears once.
_REDUNDANT_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"^Hey there!",
        r"^Sure!",
        r"^As a scientific journalist,",
        r"I'm here to break down a complex study into simple terms for you\.",
        r"Here's a simplified version of the text",
        r"Let me break it down for you:",
        r"I'm here to break down complex scientific concepts into simple, easy-to-understand language\.",
        r"I'm here to break down a complex topic into simpler terms for you\. So, let's talk about",
        r"Here is my one sentence explanation of",
    )
)


def remove_redundant_text(text):
    """Strip known boilerplate phrases from a model reply.

    Args:
        text: Reply text. Non-string values (for example the NaN of an unset
            DataFrame cell) are returned unchanged, as ``remove_first_line``
            does.

    Returns:
        ``text`` with every listed phrase removed and surrounding whitespace
        trimmed.
    """
    if not isinstance(text, str):
        return text

    for pattern in _REDUNDANT_PATTERNS:
        text = pattern.sub('', text).strip()

    return text

In [None]:
def remove_first_line(text):
    """Drop everything up to and including the first newline of ``text``.

    Single-line strings and non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    _first, newline, remainder = text.partition('\n')
    # An empty separator means the string had no newline, i.e. only one line.
    return remainder if newline else text

# Cut the leading line off every stored explanation; cells that hold no string
# pass through untouched. NOTE: this overwrites the column, so each re-run of
# the cell removes one more line.
test['explanation'] = test['explanation'].map(remove_first_line)

# Show the resulting table as plain text.
print(test)


In [None]:
def process_row(row):
    """Build the submission record for one term row.

    Args:
        row: Mapping-like row (a DataFrame row or a dict) providing ``snt_id``,
            ``term``, ``doc_id`` and ``difficulty``, and optionally
            ``definition`` / ``explanation``.

    Returns:
        A dict with the fixed run metadata and the row's identifiers. For rows
        rated ``"d"`` the ``definition`` and ``explanation`` keys are added
        when they hold non-blank text.

    NOTE(review): no visible cell creates a ``definition`` column (the
    Wikipedia summaries are stored under ``wiki``), so that key is presumably
    never emitted — confirm.
    """
    json_obj = {
        "run_id": "Tomislav&Rowan_Task2.2_LLAMA2_13B_CHAT",
        "manual": 0,
        "snt_id": row["snt_id"],
        "term": row["term"],
        "doc_id": row["doc_id"],
        "difficulty": row["difficulty"]
    }

    if row["difficulty"] == "d":
        for field in ("definition", "explanation"):
            value = row.get(field, None)
            # A plain truthiness test lets NaN through (NaN is truthy), which
            # would be written as the invalid JSON token NaN; keep real text only.
            if isinstance(value, str) and value.strip():
                json_obj[field] = value

    return json_obj

# Convert every row of `test` into its submission record.
json_output = [process_row(row) for _, row in test.iterrows()]

# Save the records as indented JSON in the working directory.
with open('st_t2.json', 'w') as out_file:
    json.dump(json_output, out_file, indent=4)
