In [None]:
%%capture
!pip install bitsandbytes
!pip install langchain-community
!pip install langchain beautifulsoup4 chromadb gradio==5.9.1
!pip -qq install langchain
!pip -qq install langchain-core
!pip install sentence-transformers
!pip install pymupdf
!pip fitz
!pip install pandas openpyxl tqdm

In [None]:
# List the installed packages and their versions in the current runtime.
!pip list

In [None]:
from langchain.chains import LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
import torch
from typing import List
import gradio as gr
import os
import pickle
from google.colab import userdata
from google.colab import drive
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_core.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate

# Read the Hugging Face access token from Colab secrets and keep it, so it can
# be handed explicitly to the hub downloads below instead of being discarded.
hf_token = userdata.get('HF_TOKEN')
# Mount Google Drive; a later cell reads the test set from /content/drive.
drive.mount('/content/drive')

from transformers import AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, pipeline

model_path = "s4mjang/AM-Detector_llama3.1_final"
tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_token)
# 8-bit loading is requested through an explicit BitsAndBytesConfig; passing
# `load_in_8bit=True` straight to `from_pretrained` is the deprecated spelling.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    token=hf_token,
)
# Greedy decoding (do_sample=False), up to 4096 newly generated tokens per call.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=4096,
    do_sample=False,
)
# Wrap the transformers pipeline so it can be composed into LangChain chains.
llm = HuggingFacePipeline(pipeline=llm_pipeline)


In [None]:
# Prompt for the detector model, written by hand with Llama-3 style header and
# end-of-turn markers. `{code}` is the only placeholder that gets filled in.
new_template = """
<|begin_of_text|>
<|start_header_id|>SYSTEM<|end_header_id|>
You are a helpful assistant for the competition authority.
Choose the most suspicious pattern from the following options:

🔸 Self Preferencing
🔸 Price Fixing
🔸 Manipulation of Randomized Item Logic
🔸 Not Found

➡️ Select ONE that best represents a suspicious pattern.
<|eot_id|>
<|start_header_id|>USER<|end_header_id|>
### Instruction:
Sourcecode Summary:
{code}

Look at the summary of the target code and infer if it is likely to be sanctioned by competition authorities.

### Response:<|eot_id|><|start_header_id|>ASSISTANT<|end_header_id|>
"""
# Use a plain-string PromptTemplate: the chain feeds a text-completion LLM, and
# a ChatPromptTemplate is rendered to such a model as "Human: <template>",
# which would put a stray role prefix in front of <|begin_of_text|>.
normal_prompt = PromptTemplate.from_template(new_template)

# Build the chain: formatted prompt -> LLM.
normal_chain = normal_prompt | llm


def predict(input_code):
    """Run the detection chain on one code summary and return the model's answer.

    The chain output is cut at the last assistant header marker; only the text
    after it is kept, with surrounding whitespace removed. If the marker is
    absent, the whole (stripped) output is returned.
    """
    assistant_header = "<|start_header_id|>ASSISTANT<|end_header_id|>"
    generated = normal_chain.invoke({"code": input_code})
    _, _, answer = generated.rpartition(assistant_header)
    return str(answer.strip())

# Set up Gradio interface
def gradio_interface(query):
    """Gradio callback: pass the submitted text to `predict` and return its result."""
    answer = predict(query)
    return answer

# One text input, one text output; each submission is handled by `gradio_interface`.
demo = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Algorithmic Manipulation Detector",
    description="Enter a query to check if the source code contains algorithmic manipulation.",
)

# Start the web UI.
demo.launch()


In [None]:
#Testing auto

from datasets import Dataset
import os, glob, pandas as pd
from tqdm import tqdm

def run_summary(input_code):
    """Return the detector's answer for the text of one test-set file.

    Delegates to `predict`, so the chain invocation and the parsing of the
    assistant section exist in one place only and the batch test cannot drift
    away from what the Gradio demo does.
    """
    return predict(input_code)


# Test-set folder to evaluate; switch to the commented path for the other set.
folder_path = "/content/drive/MyDrive/Colab Notebooks/testset/true"
#folder_path = "/content/drive/MyDrive/Colab Notebooks/testset/false"
# glob yields matches in arbitrary order; sort so the rows of the result
# spreadsheet come out in the same order on every run.
file_paths = sorted(glob.glob(os.path.join(folder_path, '*.txt')))

summaries = []
for txt_file in tqdm(file_paths, desc="FT Test", unit="file"):
    filename = os.path.basename(txt_file)

    # Best effort per file: a read or inference failure becomes an error row
    # instead of aborting the run and losing the results collected so far.
    try:
        with open(txt_file, 'r', encoding='utf-8') as f:
            code = f.read()
        summary = run_summary(code)
    except Exception as e:
        summary = f"Error: {e}"

    summaries.append({
        'Filename': filename,
        'Result': summary
    })

# Save only the summary results. Columns are fixed so the sheet keeps its
# header row even when no input files were found.
df_summary = pd.DataFrame(summaries, columns=['Filename', 'Result'])
output_summary_excel = os.path.join(folder_path, 'predict_summaries(0524)_True.xlsx')
df_summary.to_excel(output_summary_excel, index=False)
print(f"Saved: {output_summary_excel}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
RAG Test:  19%|█▉        | 10/52 [20:54<1:26:30, 123.58s/file]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to 

1차 추론 완료 — 저장됨: /content/drive/MyDrive/Colab Notebooks/testset/true/predict_summaries(0524)_rag_True(re).xlsx



