In [67]:
import os
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, BaseMessage
from langchain_experimental import tot
from typing import List
import json
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# API key for the OpenAI client; None when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Chat model shared by the chains and the streaming helper in this notebook.
llm = ChatOpenAI(
    temperature=0.0,
    streaming=True,
    openai_api_key=openai_api_key,
    model="gpt-4o",
)

# Details the prompt requests for each extracted method keyword.
class MethodDetail(BaseModel):
    reasoning: str  # why the keyword counts as a method
    passages: List[str]  # passages from the abstract that support the keyword
    confidence_score: float  # the prompt asks for a value between 0 and 1

# Schema handed to JsonOutputParser for the method-extraction chain.
# NOTE(review): the system prompt in this notebook asks for one top-level key
# per method keyword rather than a single `method_details` key — confirm which
# shape is intended.
class MethodOutput(BaseModel):
    methods: List[str]  # extracted method keywords
    method_details: dict[str, MethodDetail]  # presumably keyed by method keyword — TODO confirm
    
# JSON parser placed at the end of the method-extraction chain.
method_parser = JsonOutputParser(pydantic_object=MethodOutput)

In [None]:
# Draft reasoning-step text; the description of the tree structure is unfinished
# and the variable is not referenced anywhere else in the visible notebook.
reasoning_step = """
    Step 1: Parse the abstract into a tree structure. The tree structure should be constructed as follows:
"""

In [89]:
# System instructions for the method-extraction step.
# The text defines what counts as a "method", specifies the JSON response
# shape (a "methods" list plus one top-level object per method keyword with
# "reasoning", "passages" and "confidence_score"), and gives one correct and
# one incorrect worked example. The key count and the JSON structure template
# are kept consistent with the three fields actually listed.
system_prompt = SystemMessage(
    content="""
    You are a method extraction AI whose purpose is to identify and extract method keywords from an academic abstract. Your role is to locate the specific methodologies, techniques, or approaches mentioned in the abstract and provide justification for why each keyword represents a method.

    ### Definition of Methods:
    - "Methods" refers to the **specific processes**, **techniques**, **procedures**, or **approaches** used in conducting the research. This includes techniques for data collection, data analysis, algorithms, experimental procedures, or any other specific methodology employed by the researchers. Methods should not include general descriptions, conclusions, or research themes.

    ### What You Should Do:
    1. Extract keywords that refer to the **methods** used in the abstract.
    2. For each keyword, provide a **reasoning** explaining why it represents a method in the context of the abstract.
    3. Present the results in the required **JSON format** with a list of methods and justifications for each.

    ### JSON Output Requirements:
    - **Response Format**: You must return your output as a JSON object.
    - The JSON object must contain:
    - A key `"methods"` whose value is a list of extracted **method keywords**.
    - A key for each method keyword that contains 3 keys:
        - `"reasoning"`: A string that provides the **reasoning** behind why that keyword was extracted.
        - "passages": A list of strings that are the passages from the abstract that lead you to believe that this is a method keyword.
        - "confidence_score": A float between 0 and 1 that represents the confidence in the keyword.
        
    ### JSON Structure:
    ```json
        {
        "methods": [
            "<method_keyword_1>",
            "<method_keyword_2>"
        ],
        "<method_keyword_1>": {
            "reasoning": "<explain why this is a method keyword>",
            "passages": ["<list of passages from the abstract which lead you to believe this is a method keyword>"],
            "confidence_score": <confidence score float value between 0 and 1>
        },
        "<method_keyword_2>": {
            "reasoning": "<explain why this is a method keyword>",
            "passages": ["<list of passages from the abstract which lead you to believe this is a method keyword>"],
            "confidence_score": <confidence score float value between 0 and 1>
        }
    }
    ```
    
    See the following examples:
    
    ### Example 1: Correct Extraction

    **Abstract:**
    “Drawing on expectation states theory and expertise utilization literature, we examine the effects of team members’ actual expertise and social status on the degree of influence they exert over team processes via perceived expertise. We also explore the conditions under which teams rely on perceived expertise versus social status in determining influence relationships in teams. To do so, we present a contingency model in which the salience of expertise and social status depends on the types of intragroup conflicts. Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea, we found that both actual expertise and social status had direct and indirect effects on member influence through perceived expertise. Furthermore, perceived expertise at the early stage of team projects is driven by social status, whereas perceived expertise at the later stage of a team project is mainly driven by actual expertise. Finally, we found that members who are being perceived as experts are more influential when task conflict is high or when relationship conflict is low. We discuss the implications of these findings for research and practice.”

    Output:
    ```json
    {
        "methods": [
            "multiwave survey data collection",
            "contingency modeling"
        ],
        "multiwave survey data collection": {
            "reasoning": "Multiwave survey data collection is the specific method used to gather data from participants over multiple time points, providing a clear methodological process for the research.",
            "passages": [
                "Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea"
            ],
            "confidence_score": 0.95
        },
        "contingency modeling": {
            "reasoning": "Contingency modeling is the method used to analyze the relationship between expertise, social status, and intragroup conflicts, forming the backbone of the data analysis.",
            "passages": [
                "we present a contingency model in which the salience of expertise and social status depends on the types of intragroup conflicts"
            ],
            "confidence_score": 0.90
        }
    }
    ```
    
    #### Explanation for Correct Extraction:
    
    - **Multiwave survey data collection**: This is a method because it refers to how data was gathered from the research subjects over multiple time points. The **confidence score (0.95)** reflects that this is a well-established data collection method.
    - **Contingency modeling**: This is a method because it describes the analytical process used to explore relationships between variables like expertise and social status. The **confidence score (0.90)** reflects the significance of this method in the research.
    
    ### Example 2: Incorrect Extraction

    **Abstract:**
    “Drawing on expectation states theory and expertise utilization literature, we examine the effects of team members’ actual expertise and social status on the degree of influence they exert over team processes via perceived expertise. We also explore the conditions under which teams rely on perceived expertise versus social status in determining influence relationships in teams. To do so, we present a contingency model in which the salience of expertise and social status depends on the types of intragroup conflicts. Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea, we found that both actual expertise and social status had direct and indirect effects on member influence through perceived expertise. Furthermore, perceived expertise at the early stage of team projects is driven by social status, whereas perceived expertise at the later stage of a team project is mainly driven by actual expertise. Finally, we found that members who are being perceived as experts are more influential when task conflict is high or when relationship conflict is low. We discuss the implications of these findings for research and practice.”
    
    Output:
    ```json
    {
        "methods": [
            "intragroup conflict",
            "perceived expertise",
            "social status",
            "multiwave survey data collection"
        ],
        "intragroup conflict": {
            "reasoning": "Intragroup conflict is a key factor in determining team dynamics and was analyzed in the research.",
            "passages": [
                "the salience of expertise and social status depends on the types of intragroup conflicts"
            ],
            "confidence_score": 0.75
        },
        "perceived expertise": {
            "reasoning": "Perceived expertise is one of the core variables examined in the study, making it a methodological focus.",
            "passages": [
                "perceived expertise at the early stage of team projects is driven by social status"
            ],
            "confidence_score": 0.70
        },
        "social status": {
            "reasoning": "Social status is an important factor that influences member dynamics in teams, making it a key methodological focus.",
            "passages": [
                "perceived expertise at the early stage of team projects is driven by social status"
            ],
            "confidence_score": 0.65
        },
        "multiwave survey data collection": {
            "reasoning": "Multiwave survey data collection is the method used to gather data from participants over multiple time points, providing a clear methodological process for the research.",
            "passages": [
                "Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea"
            ],
            "confidence_score": 0.95
        }
    }
    ```
    
    #### Explanation for Incorrect Extraction:

    - **Intragroup conflict**: This is incorrect because **intragroup conflict** is a variable or condition examined in the research, not a method. It is part of the analysis, not a process or technique used to conduct the research.
    - **Perceived expertise**: This is incorrect because **perceived expertise** is a measured variable, not a method. It’s what the study investigates, but it’s not a methodological process.
    - **Social status**: This is incorrect because **social status** is another variable the study looks at. Like the others, it’s part of the analysis, not a method.
    
    IMPORTANT: Do not include the markdown json code block notation in your response. Simply return the JSON object.
    The markdown json code block notation is: ```json\n<your json here>\n```, do not include the ```json\n``` in your response.
    IMPORTANT: You must return the output in the specified JSON format. If you do not return the output in the specified JSON format, you have failed.
    """
)


# Abstract under analysis; reused as input by every chain below.
abstract = """
    Drawing on expectation states theory and expertise utilization literature, we examine the effects of team members' actual expertise and social status on the degree of influence they exert over team processes via perceived expertise. We also explore the conditions under which teams rely on perceived expertise versus social status in determining influence relationships in teams. To do so, we present a contingency model in which the salience of expertise and social status depends on the types of intragroup conflicts. Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea, we found that both actual expertise and social status had direct and indirect effects on member influence through perceived expertise. Furthermore, perceived expertise at the early stage of team projects is driven by social status, whereas perceived expertise at the later stage of a team project is mainly driven by actual expertise. Finally, we found that members who are being perceived as experts are more influential when task conflict is high or when relationship conflict is low. We discuss the implications of these findings for research and practice.
    """

# Human turn that carries the abstract text.
human_prompt = HumanMessage(content="Abstract:\n" + abstract)

# System + human messages, in the order they are sent to the chat model.
messages: list[BaseMessage] = [system_prompt, human_prompt]

# Single-string prompt: the system instructions followed by the abstract.
# `input_variables` names exactly the two placeholders in the template
# ("system_prompt.content" is not a placeholder name). No `format_instructions`
# partial is supplied because the template has no {format_instructions} slot;
# the JSON shape is spelled out in the system prompt text itself.
prompt = PromptTemplate(
    template="{system_prompt}\n\nAbstract:\n{abstract}\n",
    input_variables=["system_prompt", "abstract"],
)

# prompt -> chat model -> JSON parser
chain = prompt | llm | method_parser

# Parsed JSON response of the method-extraction step.
output = chain.invoke({"system_prompt": system_prompt.content, "abstract": abstract})
# Show the raw parsed response, then a per-method breakdown.
print(output)
methods_list = output["methods"]
print(f"Methods: {methods_list}")
for method in methods_list:
    # Each method keyword is also a top-level key holding its details.
    details = output[method]
    # Single quotes inside the f-strings keep this cell valid on Python < 3.12,
    # where reusing the enclosing quote character is a syntax error.
    print(f"Method: {method}")
    print(f"Reasoning: {details['reasoning']}")
    print(f"Passages: {details['passages']}")
    print(f"Confidence: {details['confidence_score']}")

{'methods': ['multiwave survey data collection', 'contingency modeling'], 'multiwave survey data collection': {'reasoning': 'Multiwave survey data collection is the specific method used to gather data from participants over multiple time points, providing a clear methodological process for the research.', 'passages': ['Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea'], 'confidence_score': 0.95}, 'contingency modeling': {'reasoning': 'Contingency modeling is the method used to analyze the relationship between expertise, social status, and intragroup conflicts, forming the backbone of the data analysis.', 'passages': ['we present a contingency model in which the salience of expertise and social status depends on the types of intragroup conflicts'], 'confidence_score': 0.9}}
Methods: ['multiwave survey data collection', 'contingency modeling']
Method: multiwave survey data collection
Reasoning: Multiw

In [90]:
# Abstract analysis, sentence by sentence.
# One entry of the per-sentence analysis requested from the model.
class SentenceDetails(BaseModel):
    sentence: str  # the original sentence from the abstract
    meaning: str  # what the sentence conveys
    reasoning: str  # why the sentence was interpreted that way
    confidence_score: float  # the prompt asks for a value between 0 and 1

# Schema handed to JsonOutputParser for the sentence-analysis chain.
# NOTE(review): the JSON example embedded in the sentence-analysis prompt does
# not include an `overall_theme` key — confirm whether this field is still
# expected in the response.
class AbstractThemes(BaseModel):
    sentence_details: List[SentenceDetails]  # one entry per analysed sentence
    overall_theme: str
    
# JSON parser placed at the end of the sentence-analysis chain.
abstract_parser = JsonOutputParser(pydantic_object=AbstractThemes)

# Comma-separated method keywords from the extraction step.
# NOTE(review): only the commented-out prompt text below references this value.
methods = ", ".join(output["methods"])
# Illustrative response shape interpolated into the sentence-analysis prompt.
json_example = """
    {
      "sentence_details": [
        {
          "sentence": "Original sentence 1",
          "meaning": "Meaning of the sentence.",
          "reasoning": "Why this is the meaning of the sentence.",
          "confidence_score": Confidence score (0.0 - 1.0)
        },
        {
          "sentence": "Original sentence 2",
          "meaning": "Meaning of the sentence.",
          "reasoning": "Why this is the meaning of the sentence.",
          "confidence_score": Confidence score (0.0 - 1.0)
        },
        ...
      ],

    }
"""
# t = "overall_theme: Overall theme of the abstract based on the sentence assessments \
#        2. **Determine the overall theme of the abstract**:
#        - After assessing each sentence, review the meanings and reasonings provided for each sentence, and then deduce the overall theme of the abstract.
#        - The overall theme should summarize the main idea or focus of the research described in the abstract. This summary should be verbose, detailed, and nuanced. You should reference passages from the abstract as well as your reasoning for each sentence to form this summary. You shouldn't use any of the identified methods to form this summary, this summary should focus on the main idea of the research, as in what the research is doing rather than how it is doing it. The identified methods are: {methods}. "
    # - The overall theme should reflect the general focus of the research, combining the interpretations of each sentence. Theme means the overall idea of the research, do not include how the research is doing it, focus on what the research is doing.


# Instructions for the sentence-analysis step. This is an f-string, so
# `json_example` is interpolated once, when this cell runs.
abstract_prompt_template = f"""
    You are tasked with analyzing an abstract of a research paper. Your task involves the following steps:

    1. **Analyze each sentence in the abstract**: 
       - For each sentence, determine the meaning of the sentence, provide a reasoning for your interpretation, and assign a confidence score between 0 and 1 based on how confident you are in your assessment.
       - The meaning should concisely describe what the sentence is conveying, especially focusing on the core ideas or methods mentioned.
       - The reasoning should explain why you interpreted the sentence this way, considering the context and content of the sentence.
       - The confidence score should reflect how sure you are about your interpretation, with 1 being completely certain and 0 being highly uncertain.
       
    Your output should follow this structure:

    {json_example}

    IMPORTANT: 
    - Be concise but clear in your meanings and reasonings.
    - Ensure that the confidence score reflects how certain you are about the meaning of the sentence in context.
    """

# System message wrapping the sentence-analysis instructions.
abstract_system_prompt = SystemMessage(
    content=abstract_prompt_template
)

# Single-string prompt: the instructions followed by the abstract.
# `input_variables` names exactly the two placeholders in the template
# ("abstract_system_prompt.content" is not a placeholder name). No
# `format_instructions` partial is supplied because the template has no
# {format_instructions} slot.
abstract_prompt = PromptTemplate(
    template="{abstract_system_prompt}\n\n## Abstract: \n{abstract}\n",
    input_variables=["abstract_system_prompt", "abstract"],
)

# Response shape shown to the model in the summary prompt. The body between
# the code fences is valid JSON so the model is not shown a malformed example.
json_structure = """
    ```json
    {
        "summary": "Detailed summary of the abstract",
        "reasoning": "Detailed reasoning for the summary",
        "feedback": "Feedback for the previous assistant"
    }
    ```
"""

# Schema handed to JsonOutputParser for the summary chain.
class AbstractSummary(BaseModel):
    summary: str  # summary of the abstract's main idea
    reasoning: str  # reasoning behind the summary
    feedback: str  # feedback addressed to the previous (sentence-analysis) assistant
    
# JSON parser placed at the end of the summary chain.
abstract_summary_parser = JsonOutputParser(pydantic_object=AbstractSummary)

# prompt -> chat model -> JSON parser for the sentence-level analysis
abstract_chain = abstract_prompt | llm | abstract_parser

# Run the analysis, then serialise it as indented JSON text so it can be
# embedded in the summary prompt.
_abstract_chain_inputs = {
    "abstract_system_prompt": abstract_system_prompt.content,
    "abstract": abstract,
}
_abstract_chain_result = abstract_chain.invoke(_abstract_chain_inputs)
abstract_chain_output = json.dumps(_abstract_chain_result, indent=4)
# Description of the method-extraction JSON shape, embedded in the summary
# prompt. Every "reasoning" entry is followed by a comma so the template is
# structurally consistent for both example keywords.
method_json_format = """
```json
    {
        "methods": [
            "<method_keyword_1>",
            "<method_keyword_2>"
        ],
        "<method_keyword_1>": {
            "reasoning": "<explain why this is a method keyword>",
            "passages": ["<list of passages from the abstract which lead you to believe this is a method keyword>"],
            "confidence_score": <confidence score float value between 0 and 1>
        },
        "<method_keyword_2>": {
            "reasoning": "<explain why this is a method keyword>",
            "passages": ["<list of passages from the abstract which lead you to believe this is a method keyword>"],
            "confidence_score": <confidence score float value between 0 and 1>
        }
    }
    ```
  """
# Method-extraction result serialised as indented JSON text for prompt embedding.
method_json_output = json.dumps(output, indent=4)

# Instructions for the summary step. This is an f-string: the method format,
# the extracted methods, the sentence analysis and the response structure are
# interpolated once, when this cell runs.
# NOTE(review): the template text contains typos ("methdologies", "ouput");
# they are part of the prompt sent to the model and are left unchanged here.
abstract_summary_system_template = f"""
You are an expert AI researcher that is tasked with summarizing academic research abstracts. Your task is to analyze the abstract and extract the main ideas and themes. You should not use the identified methods to form this summary, this summary should focus on the main idea of the research, as in what the research is doing rather than how it is doing it.

In order to better assist you, methodologies have already been extracted from the abstract. They are in the following JSON format:
{method_json_format}

Here are the already extracted methdologies and their details:
{method_json_output}

In addition to the methodologies, a previous assistant has already analyzed each sentence in the abstract and provided a meaning, reasoning, and confidence score for each sentence. Your task is to analyze these results and determine the overall theme of the abstract. Here are the results of the sentence analysis:
{abstract_chain_output}

Your output should contain the following:
- summary: A detailed summary of the abstract which aims to capture the main idea of the research while not being concerned with the specific methods used to conduct the research.
- reasoning: A detailed reasoning for the summary you have provided.
- feedback: Feedback detailing any issues you may think of that may have affected your ability to accurately summarize the abstract, as well as any requests you may have for the previous assistant to improve their analysis of the abstract so that you can more easily summarize it.

Your ouput should be a JSON object with the following structure:

{json_structure}

IMPORTANT: Do not include the markdown json code block notation in your response. Simply return the JSON object.
The markdown json code block notation is: ```json\n<your json here>\n```, do not include the ```json\n``` in your response.
IMPORTANT: You must return the output in the specified JSON format. If you do not return the output in the specified JSON format, you have failed.
"""

# System message wrapping the summary instructions.
summary_system_prompt = SystemMessage(
    content=abstract_summary_system_template
)

# Single-string prompt: the instructions followed by the original abstract.
# `input_variables` names exactly the two placeholders in the template
# ("abstract_summary_system_template.content" is not a placeholder name). No
# `format_instructions` partial is supplied because the template has no
# {format_instructions} slot.
summary_prompt = PromptTemplate(
    template="{abstract_summary_system_template}\n\n## Original Abstract: \n{abstract}",
    input_variables=["abstract_summary_system_template", "abstract"],
)

# prompt -> chat model -> JSON parser
summary_chain = summary_prompt | llm | abstract_summary_parser
summary_chain_output = summary_chain.invoke(
    {
        "abstract_summary_system_template": summary_system_prompt.content,
        "abstract": abstract,
    }
)

# Print the summary, reasoning, and feedback
print(json.dumps(summary_chain_output, indent=4))


{
    "summary": "The research investigates how team members' actual expertise and social status influence their perceived expertise and, consequently, their impact on team processes. It explores the conditions under which teams prioritize perceived expertise over social status in determining influence relationships. The study introduces a model that links the importance of expertise and social status to different types of intragroup conflicts. Findings from survey data of student teams in South Korea reveal that both actual expertise and social status affect influence through perceived expertise. Early in team projects, social status drives perceived expertise, while actual expertise becomes more influential later. Additionally, perceived experts are more influential when task conflict is high or relationship conflict is low. The study concludes with a discussion on the implications of these findings for both research and practical applications.",
    "reasoning": "The summary capture

In [80]:
# Update this line to convert the prompt to a valid input type
def stream(messages: List[BaseMessage]):
    """Stream a chat completion for ``messages``, echoing it as it arrives.

    Each chunk's ``content`` is printed immediately (no newline, flushed) and
    the pieces are concatenated into the full response text.

    Args:
        messages: Conversation passed to the module-level ``llm``.

    Returns:
        The concatenation of every streamed chunk's ``content``.
    """
    pieces = []
    for chunk in llm.stream(messages):
        text = chunk.content
        # Emit right away so output appears incrementally in the cell.
        print(text, end="", flush=True)
        pieces.append(text)
    return "".join(pieces)
# Stream the method-extraction conversation and keep the full response text.
response = stream(messages)

# The model is instructed to reply with bare JSON, so parse it directly.
json_response = json.loads(response)
print("\n\n")
print(json_response)

# Copy of the extracted method keywords.
methods = list(json_response["methods"])

print(methods)

# Per-method report; each keyword is also a top-level key holding its details.
for method in methods:
    details = json_response[method]
    reasoning = details["reasoning"]
    confidence_score = details["confidence_score"]
    for line in (
        f"Method: {method}",
        f"Reasoning: {reasoning}",
        f"Confidence Score: {confidence_score}",
        "\n\n",
    ):
        print(line)

{
    "methods": [
        "multiwave survey data collection",
        "contingency modeling"
    ],
    "multiwave survey data collection": {
        "reasoning": "Multiwave survey data collection is the specific method used to gather data from participants over multiple time points, providing a clear methodological process for the research.",
        "passages": [
            "Using multiwave survey data from 50 student project teams with 320 members at a large national research institute located in South Korea"
        ],
        "confidence_score": 0.95
    },
    "contingency modeling": {
        "reasoning": "Contingency modeling is the method used to analyze the relationship between expertise, social status, and intragroup conflicts, forming the backbone of the data analysis.",
        "passages": [
            "we present a contingency model in which the salience of expertise and social status depends on the types of intragroup conflicts"
        ],
        "confidence_score": 

In [88]:
# `abstract_chain_output` holds JSON *text* (it was serialised for prompt
# embedding). Decode it before saving so the file stores a nested object like
# the other two chain outputs, not an escaped string inside a string.
chain_2_output = (
    json.loads(abstract_chain_output)
    if isinstance(abstract_chain_output, str)
    else abstract_chain_output
)

# Results of the three chains, keyed by the order in which they ran.
output_dict = {
    "chain_1_output": output,
    "chain_2_output": chain_2_output,
    "chain_3_output": summary_chain_output
}

# Persist everything as indented JSON next to the notebook.
with open("chain_outputs.json", "w", encoding="utf-8") as f:
    json.dump(output_dict, f, indent=4)