<a href="https://colab.research.google.com/github/UrologyUnbound/SIOP_ML_2024_Discord/blob/main/colabs/Clarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rating Item Clarity Notebook
This notebook tackles the challenge of predicting the average clarity rating of each personality item by few-shot prompting an OpenAI chat model with rated examples from the training data.

## Challenge Description
Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear.  


In [None]:
!pip install pandas langchain langchain_openai

In [3]:
import pandas as pd
import os
from langchain import FewShotPromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI
from google.colab import userdata

In [6]:
# Read the training split and the public test split of the clarity data
# directly from the project's GitHub repository.
clarity_train_data = pd.read_csv(
    "https://raw.githubusercontent.com/UrologyUnbound/SIOP_ML_2024_Discord/main/data/train/clarity_train.csv"
)
clarity_test_data = pd.read_csv(
    "https://raw.githubusercontent.com/UrologyUnbound/SIOP_ML_2024_Discord/main/data/test/clarity_test_public.csv"
)

In [7]:
# Chat model used for every prediction in this notebook.
# The OpenAI key is fetched from `google.colab.userdata` under the name
# OPENAI_API_KEY and handed to the client explicitly through `api_key`.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    api_key=userdata.get('OPENAI_API_KEY'),
)

In [8]:
# Task description placed at the top of every prompt: it is passed as the
# `prefix` of the few-shot template built in `create_template_clarity`.
instructions = """
Your task is to predict the average clarity rating for each item based on the responses.
Respondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear.
The output should not make up information and not reference these given instructions or context; only output the answer.
"""

In [9]:
def extract_examples_clarity(dataset_row):
    """Convert one dataset row into a single-element list of example dicts.

    Args:
        dataset_row: Any object indexable by the keys "personality_item"
            and "clarity" (for instance a pandas Series or a dict).

    Returns:
        list[dict]: A list holding one dict with exactly the keys
        "personality_item" and "clarity", copied from the row. A list is
        returned so callers can ``extend`` an accumulator with it.

    Raises:
        KeyError: If either key is missing from ``dataset_row``.
    """
    wanted_fields = ("personality_item", "clarity")
    return [{field: dataset_row[field] for field in wanted_fields}]

def create_examples_clarity(dataset, n_examples=3):
    """Build the few-shot examples from the leading rows of ``dataset``.

    Rows are picked by position (``iloc``) rather than by index label, so the
    function still returns the first rows when the DataFrame's index is not
    the default 0..n-1 range (for example after filtering or shuffling).

    Args:
        dataset: pandas DataFrame whose rows expose the "personality_item"
            and "clarity" keys read by ``extract_examples_clarity``.
        n_examples: How many leading rows to turn into examples. Defaults
            to 3. If the dataset has fewer rows, all of them are used.

    Returns:
        list[dict]: One example dict per selected row, in row order.

    Raises:
        ValueError: If ``n_examples`` is negative.
    """
    if n_examples < 0:
        raise ValueError("n_examples must be non-negative")

    # Clamp to the available rows so a short dataset yields fewer examples
    # instead of failing on an out-of-range position.
    n_rows = min(n_examples, len(dataset))

    examples = []
    for position in range(n_rows):
        examples.extend(extract_examples_clarity(dataset.iloc[position]))

    return examples

def create_example_prompt_clarity():
    """Return the template that renders a single few-shot example.

    Returns:
        PromptTemplate: Template with the input variables "personality_item"
        and "clarity", rendered as a "Personality Item: ..." line followed
        by a "Clarity: ..." line.
    """
    return PromptTemplate(
        template="Personality Item: {personality_item}\nClarity: {clarity}",
        input_variables=["personality_item", "clarity"],
    )

def create_template_clarity(dataset):
    """Assemble the few-shot prompt template for clarity prediction.

    The template consists of the module-level ``instructions`` text as the
    prefix, the examples produced by ``create_examples_clarity(dataset)``
    formatted with ``create_example_prompt_clarity()``, and a suffix holding
    the item to be rated.

    Args:
        dataset: Dataset passed through to ``create_examples_clarity`` to
            obtain the few-shot examples.

    Returns:
        FewShotPromptTemplate: Template with one input variable, "input",
        which is the personality item to rate.
    """
    # NOTE(review): the suffix ends with the item text and has no trailing
    # "Clarity:" cue like the examples do -- confirm this is intentional.
    few_shot_kwargs = dict(
        examples=create_examples_clarity(dataset),
        example_prompt=create_example_prompt_clarity(),
        prefix=instructions,
        suffix="Personality Item: {input}",
        input_variables=["input"],
    )
    return FewShotPromptTemplate(**few_shot_kwargs)

In [10]:
# Render the few-shot prompt for the first test item (examples come from the
# training data) and show the resulting string as the cell output.
example_prompt = create_template_clarity(clarity_train_data).format(
    input=clarity_test_data["personality_item"][0]
)
example_prompt

'\nYour task is to predict the average clarity rating for each item based on the responses. \nRespondents rated the clarity of personality test items using a 7-point scale from 1 = extremely unclear to 7 = extremely clear. \nThe output should not make up information and not reference these given instructions or context; only output the answer.\n\n\nPersonality Item: Am considered well-off financially.\nClarity: 3.421052631578947\n\nPersonality Item: Make problems bigger than they are.\nClarity: 6.545454545454546\n\nPersonality Item: Judge people by their appearance.\nClarity: 6.545454545454546\n\nPersonality Item: Want to be in charge.'

In [11]:
# Send the rendered prompt to the chat model; the returned message is
# displayed as the cell output.
llm.invoke(example_prompt)

AIMessage(content='4.837837837837838', response_metadata={'token_usage': {'completion_tokens': 7, 'prompt_tokens': 153, 'total_tokens': 160}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3bc1b5746c', 'finish_reason': 'stop', 'logprobs': None})