In [1]:
import os
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment so the API key never has to be hard-coded in the notebook.
load_dotenv()

# Shared API client used by every cell below; the key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
# Reference: https://help.openai.com/en/articles/4936833-is-the-moderation-endpoint-free-to-use
# Use case: content monitoring. Send a harmless greeting to the moderation
# endpoint and keep the response for inspection in the next cell.
response = client.moderations.create(input="Hey~ Good morning")

# Show the concrete type of the object the SDK returned.
print(type(response))

<class 'openai.types.moderation_create_response.ModerationCreateResponse'>


In [3]:
print(response.model_dump_json(indent=2))

{
  "id": "modr-Ax3GEZupOv1yIX84VgIcvHVzvNRhM",
  "model": "text-moderation-007",
  "results": [
    {
      "categories": {
        "harassment": false,
        "harassment_threatening": false,
        "hate": false,
        "hate_threatening": false,
        "illicit": null,
        "illicit_violent": null,
        "self_harm": false,
        "self_harm_instructions": false,
        "self_harm_intent": false,
        "sexual": false,
        "sexual_minors": false,
        "violence": false,
        "violence_graphic": false,
        "self-harm": false,
        "sexual/minors": false,
        "hate/threatening": false,
        "violence/graphic": false,
        "self-harm/intent": false,
        "self-harm/instructions": false,
        "harassment/threatening": false
      },
      "category_applied_input_types": null,
      "category_scores": {
        "harassment": 0.00001951787999132648,
        "harassment_threatening": 3.741378407084994e-7,
        "hate": 4.14364194512018e-6,
 

In [4]:
def flagged(text, **kwargs):
    """Return the overall ``flagged`` verdict of the moderation endpoint.

    Args:
        text: Content submitted as the moderation ``input``.
        **kwargs: Extra keyword arguments passed through unchanged to
            ``client.moderations.create`` (for example ``model``).

    Returns:
        The ``flagged`` attribute of the first entry in the response's
        ``results`` list.
    """
    moderation = client.moderations.create(input=text, **kwargs)
    first_result = moderation.results[0]
    return first_result.flagged


print(flagged("Have a good day"))

False


In [5]:
print(flagged("I will kill you!"))

True


In [6]:
# Check one specific moderation category (violence) rather than the overall flag.
def violence(text, **kwargs):
    """Return the ``violence`` category verdict of the moderation endpoint.

    Args:
        text: Content submitted as the moderation ``input``.
        **kwargs: Extra keyword arguments passed through unchanged to
            ``client.moderations.create`` (for example ``model``).

    Returns:
        The ``violence`` attribute of the ``categories`` object on the first
        entry in the response's ``results`` list.
    """
    moderation = client.moderations.create(input=text, **kwargs)
    categories = moderation.results[0].categories
    return categories.violence


print(violence("Have a good one"))

False


In [7]:
print(violence("I will kill you!"))

True


In [8]:
# The moderation endpoint may not work well for Korean text: this threat ("I'll kill you!") is not flagged as violence.
print(violence("죽여버린다!", model="text-moderation-stable"))

False


In [9]:
print(violence("죽여버린다!", model="text-moderation-latest"))

False


In [10]:
# Alternative moderation check for Korean text: ask a chat model to classify it.
def harmful(text, **kwargs):
    """Ask a chat model whether ``text`` is harmful.

    The system prompt (in Korean) instructs the model to answer 'True' if the
    sentence is harmful and 'False' otherwise.

    Args:
        text: Sentence to classify; sent as the user message.
        **kwargs: Extra keyword arguments forwarded to
            ``client.chat.completions.create``. ``model`` defaults to
            ``"gpt-4o-mini"`` and ``temperature`` defaults to ``0``; both can
            be overridden. ``messages`` is built here and must not be passed.

    Returns:
        The model's reply as a string with surrounding whitespace removed
        (expected to be ``"True"`` or ``"False"``), or ``None`` if the reply
        has no text content. Note that the result is a string, not a bool:
        ``"False"`` is truthy, so compare with ``== "True"`` rather than
        using it directly in a condition.
    """
    # Same model as before unless the caller picks another one.
    kwargs.setdefault("model", "gpt-4o-mini")
    # A classifier should give the same answer for the same sentence, so use
    # the least random sampling unless the caller asks otherwise.
    kwargs.setdefault("temperature", 0)

    response = client.chat.completions.create(
        messages=[
            # "Reply 'True' if the following sentence is harmful, otherwise 'False'."
            {"role": "system", "content": "다음 문장이 해로우면 'True', 그렇지 않으면 'False'로 대답하세요."},
            {"role": "user", "content": text},
        ],
        **kwargs,
    )
    answer = response.choices[0].message.content
    # Trim stray spaces/newlines so callers can compare against 'True'/'False';
    # leave a missing (None) reply untouched.
    return answer.strip() if isinstance(answer, str) else answer

In [11]:
print(harmful("좋은 하루 되세요"))

False


In [16]:
print(harmful("죽여버린다"))

True


In [12]:
print(harmful("우울해"))

True


In [13]:
print(harmful("울적해"))

False


In [14]:
print(harmful("황당하네? 이러기야?"))

False


In [15]:
print(harmful("돔황챠~"))

False
