In [73]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
# The recorded output shows that re-running %load_ext on an already-loaded
# extension only prints a notice suggesting %reload_ext.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
import sys
import pandas as pd
from dotenv import load_dotenv
from langchain.llms import HuggingFaceHub
from langchain import PromptTemplate, LLMChain
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Requires HUGGINGFACEHUB_API_TOKEN to be set in the .env file (needed by the HuggingFaceHub LLM).
load_dotenv()

sys.path.append("../")
from utils.data_utils import preprocess_score_df, preprocess_pattern_map_df
from utils.data_structurer import DataStructurer
from utils.constants import ANSWER_COL

In [75]:
# Read both CSV inputs first, then hand each raw frame to its project
# preprocessing helper. The raw frames keep their own names so the
# preprocessed results never overwrite them.
raw_scores, raw_patterns = (
    pd.read_csv("../data/csa_score_1.csv"),
    pd.read_csv("../data/pattern_map.csv"),
)
score_df = preprocess_score_df(raw_scores)
pattern_map_df = preprocess_pattern_map_df(raw_patterns)

In [76]:
# Bundle the preprocessed score sheet and pattern map into one helper object;
# later cells read `structurer.score_df` and pass `structurer` to the evaluation report.
# NOTE(review): 1122128 and 10 are positional arguments whose meaning is not visible
# in this notebook — confirm against DataStructurer's signature and name them.
structurer = DataStructurer(score_df, pattern_map_df, 1122128, 10)

In [77]:
def preprocess(input_string):
    """Return ``input_string`` as text with surrounding whitespace removed.

    Args:
        input_string: Any value; it is converted with ``str()`` first, so
            non-string inputs (numbers, ``None``, ...) are accepted.

    Returns:
        str: The textual form of the input without leading or trailing
        whitespace. Interior whitespace and newlines are left untouched.
    """
    # A bare strip() already drops leading/trailing newlines together with
    # every other whitespace character, so one call covers both cases.
    return str(input_string).strip()

In [78]:
# template = """Compute the overall score of the response based on the provided criteria. Return only one score number (Max score is {max_score}).
# ###Criteria### 
# {criteria}

# ###Answer### 
# {answer}

# ###Score###
# """
# Scoring prompt: the model receives the rubric ({criteria}), the student
# response ({answer}) and the maximum attainable score ({max_score}), and is
# asked to reply with a single number. The text is run through preprocess()
# so the prompt carries no leading or trailing whitespace.
template = preprocess(
    """Compute the overall score of the response based on the provided criteria. Return only one score number (Max score is {max_score}).
###Criteria### 
{criteria}

###Answer### 
{answer}

###Score###
"""
)
prompt = PromptTemplate(
    input_variables=["criteria", "answer", "max_score"],
    template=template,
)

In [79]:
# llm = HuggingFaceHub(
#         # repo_id="google/flan-t5-base",
#         repo_id="google/flan-t5-large",
#         # repo_id="SeaLLMs/SeaLLM-7B-v2",
#         # repo_id="scb10x/typhoon-7b",
#         # repo_id="openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf"
#         # repo_id="migtissera/Tess-M-Creative-v1.0",
#         # repo_id="google/flan-t5-xl",
#         # model_kwargs={
#         #     "temperature": 0,
#         #     "max_length": 128,
#         # }
#     )

# llm_chain = LLMChain(
#     prompt=prompt,
#     llm=llm,
# )

In [80]:
# criteria = """
# + 2 point for having risk mitigation.
# + 3 point for having data-driven decision.
# """
# answer = """Data-Driven Decisions and Risk Mitigation
# """
# max_score = 5
# # criteria = preprocess(criteria)
# # answer = preprocess(answer)

# # result = llm_chain.run({"criteria": criteria, "answer": answer})
# # print(result)


# def predict_score(answer, pattern, max_score):
#     criteria = preprocess(pattern) 
#     answer = preprocess(answer)

#     result = llm_chain.run({"criteria": criteria, "answer": answer, "max_score": max_score})
    
#     try:
#         result = float(result)
#     except ValueError:
#         print(result)
#         result = -1
#     return result

# predict_score(answer, criteria, max_score)

In [81]:
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# hf = HuggingFacePipeline.from_model_id(
#     model_id="LoneStriker/SeaLLM-7B-v2-GGUF",
#     task="text-generation",
#     # pipeline_kwargs={"max_new_tokens": 10},
#     device=1,
# )

# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# model_id = "LoneStriker/SeaLLM-7B-v2-GGUF"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10)
# hf = HuggingFacePipeline(pipeline=pipe)

# chain = prompt | hf

# def predict_score(answer, pattern, max_score):
#     criteria = preprocess(pattern) 
#     answer = preprocess(answer)

#     result = chain.invoke({"criteria": criteria, "answer": answer, "max_score": max_score})
    
#     try:
#         result = float(result)
#     except ValueError:
#         print(result)
#         result = -1
#     return result

In [82]:
from langchain_openai import ChatOpenAI, OpenAI

# llm = ChatOpenAI(temperature=0.0, base_url="http://localhost:1234/v1", api_key="not-needed")
# Completion-style client pointed at an OpenAI-compatible endpoint on
# localhost:1234. The api_key is a placeholder string, and temperature 0.0
# requests the least random decoding.
llm = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="not-needed",
    temperature=0.0,
)

# Chain that fills the scoring prompt and sends it to the model above.
llm_chain = LLMChain(llm=llm, prompt=prompt)


import math
import re

# First decimal number in a string: "3", "0.75", "1.", ".5", optionally signed.
_SCORE_NUMBER = re.compile(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)")


def _parse_score(text):
    """Extract a finite score from a raw model reply, or return None.

    The whole reply is tried as a number first. If that fails, the first
    number appearing anywhere in the reply is used, so replies such as
    "Score: 3", "0.75/1" or a score followed by further generated text still
    yield a value. This is a heuristic: the first number is assumed to be
    the score.
    """
    text = str(text).strip()
    try:
        value = float(text)
    except ValueError:
        match = _SCORE_NUMBER.search(text)
        if match is None:
            return None
        value = float(match.group())
    # float() also accepts "nan" / "inf"; those are not usable scores.
    return value if math.isfinite(value) else None


def predict_score(answer, pattern, max_score):
    """Ask the LLM chain to grade ``answer`` against ``pattern``.

    Args:
        answer: The response to grade; cleaned with ``preprocess`` before use.
        pattern: The scoring criteria text; cleaned with ``preprocess``.
        max_score: Maximum attainable score, inserted into the prompt. The
            returned value is not clamped to it.

    Returns:
        The parsed score as a float, or ``-1`` when no finite number can be
        found in the model reply (the raw reply is printed in that case).
    """
    reply = llm_chain.invoke(
        {
            "criteria": preprocess(pattern),
            "answer": preprocess(answer),
            "max_score": max_score,
        }
    )["text"]

    score = _parse_score(reply)
    if score is None:
        # Surface the raw reply so unparseable generations can be inspected.
        print(reply)
        return -1
    return score

In [83]:
# predict_list = []
# for index, row, in structurer.score_df.iterrows():
#     pred_score_list = []
#     answer = row[ANSWER_COL]
    
#     for pattern_idx, (pattern, pattern_max_score) in enumerate(zip(structurer.pattern_list, structurer.pattern_max_score_list)):
#         pred_score = predict_score(answer, pattern, pattern_max_score)
#         pred_score_list.append(pred_score)
#         # print(answer, pattern, pattern_max_score)
#         print(f"{index=}, {pattern_idx=}\t, Actual: {row[f'pattern1_{pattern_idx+1}']}, Pred: {pred_score}, Max: {pattern_max_score}")
#     predict_list.append(pred_score_list)
        
# # structurer.pattern_list
# predict_list

In [84]:
import sys
import pandas as pd

sys.path.append("../")
from evaluation.evaluate import Evaluation

In [85]:
# Hard-coded predicted scores: 146 rows of 4 values each, which the evaluation
# cell below turns into the columns 1_pred … 4_pred.
# NOTE(review): presumably pasted from a run of the (now commented-out) prediction
# loop against the SeaLLM GGUF model, one row per answer and one value per
# scoring pattern — confirm provenance.
# Entries of -1 match the sentinel predict_score() returns when the model reply
# cannot be parsed; they are handed to the evaluation unfiltered.
seallm_gguf_csa_1_result = [[0.5, 0.5, 0.75, 0.75],
 [1.0, 0.75, 1.0, 1.0],
 [1.5, 1.0, 1.0, 1.0],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 1.0, 1.0, 1.0],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [-1, -1, -1, -1],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 0.5, 0.75, 1.0],
 [1.0, 0.75, 0.9, 1.0],
 [1.0, 0.9, 0.9, 0.9],
 [1.0, 0.75, 1.0, 0.75],
 [0.5, 0.5, 0.5, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.25, 0.5, 0.5],
 [1.0, 0.5, 0.5, 0.5],
 [1.0, 0.75, 0.9, 0.9],
 [1.0, 0.5, 0.7, 0.75],
 [1.0, 1.0, 1.0, 1.0],
 [0.5, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.5, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.3, 0.3, 0.3],
 [0.5, 0.75, 0.9, 0.9],
 [0.5, 0.5, 0.75, 0.5],
 [1.0, 0.5, 1.0, 1.0],
 [2.0, 0.75, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.25, 0.5, 0.5],
 [1.5, 0.75, 1.0, 1.0],
 [1.0, 0.75, 1.0, 1.0],
 [1.0, 0.75, 1.0, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.75, 1.0],
 [0.7, 0.7, 0.7, 0.7],
 [0.5, 0.5, 0.5, 0.75],
 [2.0, 0.75, 0.75, 0.9],
 [0.5, 0.5, 0.5, 0.5],
 [2.0, 0.75, 0.75, 0.75],
 [1.0, 0.75, 0.9, 0.9],
 [2.0, 0.75, 0.75, 0.75],
 [1.0, 0.75, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.5, 0.5],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 0.75, 1.0, 1.0],
 [0.5, 0.5, 0.5, 0.5],
 [1.0, 0.3, 0.3, 0.3],
 [0.5, 0.5, 0.7, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [1.5, 1.0, 1.0, 1.0],
 [1.0, 0.75, 1.0, 0.75],
 [0.5, 0.5, 0.5, 0.75],
 [1.0, 0.2, 0.5, 0.5],
 [0.5, 0.5, 0.9, 0.75],
 [0.5, 0.5, 0.5, 0.75],
 [1.0, 0.0, 0.0, 0.0],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.75, 1.0, 0.9],
 [2.0, 0.75, 0.9, 0.5],
 [0.5, 0.75, 0.75, 0.75],
 [0.5, 0.5, 0.5, 0.75],
 [0.5, 0.5, 0.75, 0.7],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.5, 0.5],
 [1.0, 0.7, 0.7, 0.7],
 [1.5, 0.75, 0.9, 1.0],
 [1.0, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.9, 0.9],
 [1.5, 1.0, 1.0, 1.0],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.7, 0.7],
 [1.0, 0.75, 1.0, 1.0],
 [1.0, 0.75, 0.9, 0.9],
 [1.0, 0.5, 0.9, 0.9],
 [0.5, 0.5, 0.5, 0.5],
 [0.5, 0.5, 0.5, 0.5],
 [1.0, 0.75, 1.0, 0.75],
 [1.0, 0.75, 0.75, -1],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 1.0, 1.0, 1.0],
 [2.0, 0.75, 0.75, 0.75],
 [1.0, 0.75, 0.9, 0.9],
 [1.0, 0.75, 0.75, 0.9],
 [0.5, 0.5, 0.5, 0.75],
 [1.0, 1.0, 1.0, 1.0],
 [1.0, 0.5, 1.0, 1.0],
 [2.0, 0.75, 0.75, 0.75],
 [1.0, 0.75, 1.0, 0.9],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.3, 0.3, 0.3],
 [1.0, 0.75, 1.0, 1.0],
 [1.0, 0.7, 1.0, 0.9],
 [1.5, 0.75, 0.9, 0.9],
 [1.0, 0.5, 0.5, 0.5],
 [1.0, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.75, 0.9],
 [1.0, 0.7, 0.75, 0.75],
 [1.0, 0.5, 1.0, 1.0],
 [0.5, 0.5, 0.5, 0.5],
 [1.0, 0.7, 1.0, 1.0],
 [0.5, 0.5, 0.75, 0.5],
 [0.5, 0.5, 0.5, 0.5],
 [1.0, 0.25, 0.5, 0.5],
 [1.0, 0.5, 1.0, 1.0],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.75, 0.75, 0.75],
 [1.0, 0.0, 0.0, 0.0],
 [1.0, 0.5, 0.75, 0.75],
 [0.5, 0.5, 0.9, 0.75],
 [1.0, 0.5, 1.0, 1.0],
 [1.0, 0.7, 0.7, 0.7],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.75, 1.0],
 [1.0, 0.75, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.5, 0.5, 1.0],
 [1.0, 0.75, 0.9, 0.75],
 [0.5, 0.5, 0.75, 0.75],
 [1.0, 0.7, 0.7, 0.9],
 [1.0, 0.5, 0.5, 0.5],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.75, 0.75, 0.75],
 [1.0, 0.75, 0.9, 0.9],
 [0.5, 0.5, 0.9, 0.9],
 [1.0, 0.2, 0.5, 0.5],
 [1.5, 0.5, 0.5, 0.5],
 [0.5, 0.5, 0.5, 0.7],
 [1.0, 0.0, 0.0, 0.0],
 [0.5, 0.5, 0.5, 0.5],
 [1.5, 1.0, 1.0, 1.0],
 [1.0, 0.0, 0.0, 0.0],
 [1.0, 0.7, 0.75, 0.7],
 [1.0, 0.75, 0.75, 0.75],
 [1.0, 0.5, 0.75, 0.75],
 [1.0, 0.75, 0.75, 0.75],
 [1.0, 0.75, 0.75, 0.75],
 [0.5, 0.5, 0.75, 0.75]
 ]

In [86]:
# Sanity check: show every row of the score sheet whose 'pattern1_1' value is
# missing (the recorded output is an empty frame).
structurer.score_df.loc[structurer.score_df["pattern1_1"].isna()]

Unnamed: 0,student ID,answer,total_score,pattern1_1,pattern1_2,pattern1_3,pattern1_4


In [87]:
# Convert the hard-coded prediction rows into a DataFrame via the project's
# Evaluation helper; the displayed frame has columns 1_pred … 4_pred.
evaluation = Evaluation()
predicted_df = evaluation.format_default_list_to_predict_df(seallm_gguf_csa_1_result)
# Bare last expression so the notebook renders the frame.
predicted_df

Unnamed: 0,1_pred,2_pred,3_pred,4_pred
0,0.5,0.50,0.75,0.75
1,1.0,0.75,1.00,1.00
2,1.5,1.00,1.00,1.00
3,1.0,0.50,0.75,0.75
4,1.0,0.50,0.75,0.75
...,...,...,...,...
141,1.0,0.75,0.75,0.75
142,1.0,0.50,0.75,0.75
143,1.0,0.75,0.75,0.75
144,1.0,0.75,0.75,0.75


In [88]:
# Score the predictions against the data held by `structurer`. The recorded output is a
# dict with pearson / spearman (plus p-values), mae and rmse per pattern, and total correlations.
# NOTE(review): the Series dumps printed before the dict appear to come from inside
# get_report — confirm, and silence them if they are leftover debugging.
evaluation.get_report(structurer, predicted_df)

1
0      0.0
1      0.0
2      2.0
3      2.0
4      2.0
      ... 
141    2.0
142    2.0
143    2.0
144    2.0
145    0.0
Name: pattern1_1, Length: 146, dtype: float64 0      0.5
1      1.0
2      1.5
3      1.0
4      1.0
      ... 
141    1.0
142    1.0
143    1.0
144    1.0
145    0.5
Name: 1_pred, Length: 146, dtype: float64
2
0      0.0
1      0.0
2      1.0
3      0.0
4      1.0
      ... 
141    1.0
142    1.0
143    1.0
144    1.0
145    0.5
Name: pattern1_2, Length: 146, dtype: float64 0      0.50
1      0.75
2      1.00
3      0.50
4      0.50
       ... 
141    0.75
142    0.50
143    0.75
144    0.75
145    0.50
Name: 2_pred, Length: 146, dtype: float64
3
0      1.0
1      0.0
2      1.0
3      1.0
4      1.0
      ... 
141    1.0
142    0.0
143    1.0
144    1.0
145    0.5
Name: pattern1_3, Length: 146, dtype: float64 0      0.75
1      1.00
2      1.00
3      0.75
4      0.75
       ... 
141    0.75
142    0.75
143    0.75
144    0.75
145    0.75
Name: 3_pred, Length: 14

{'pattern_1': {'pearson': 0.14261114088575605,
  'pearson_p_value': 0.08595112454819626,
  'spearman': 0.1635218808772499,
  'spearman_p_value': 0.04859197976191053,
  'mae': 0.9061643835616439,
  'rmse': 1.0040670719287166},
 'pattern_2': {'pearson': 0.1926976991494691,
  'pearson_p_value': 0.019793205535732286,
  'spearman': 0.2652967405486289,
  'spearman_p_value': 0.0012111935128886317,
  'mae': 0.43082191780821916,
  'rmse': 0.49694270775867166},
 'pattern_3': {'pearson': 0.21162852308424943,
  'pearson_p_value': 0.01033992127892744,
  'spearman': 0.20788156019947981,
  'spearman_p_value': 0.011808352023360391,
  'mae': 0.34315068493150686,
  'rmse': 0.4573973423262572},
 'pattern_4': {'pearson': 0.21447120142704756,
  'pearson_p_value': 0.009335641429714455,
  'spearman': 0.28718965977011285,
  'spearman_p_value': 0.000440357500632756,
  'mae': 0.3407534246575342,
  'rmse': 0.476987550453543},
 'total': {'total_pearson': 0.19035214113663054,
  'total_spearman': 0.2309724603488678

In [89]:
# NOTE(review): unexplained arithmetic. 146*4 matches the number of predictions
# (146 rows x 4 values each); 49*60+51 looks like a 49 min 51 s wall-clock total,
# which would make this the average seconds per LLM call — confirm and name the constants.
(49*60+51)/(146*4)

5.1215753424657535