<a href="https://colab.research.google.com/github/Satyadeep-Dey/AI-experiments/blob/main/9__Anonymize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q openai requests

In [None]:
# imports

import os
import time
from google.colab import drive
from google.colab import userdata
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from huggingface_hub import login
import re # For using regular expressions (pattern matching)
import json # For parsing JSON data from strings



## Utility Function 1 : Write text into a file

In [None]:
def write_text_to_file(folder_path, file_name, write_text):
    """Write ``write_text`` to a file under the user's Google Drive.

    Mounts Google Drive at ``/content/drive`` (forcing a remount), waits until
    ``MyDrive`` is visible, creates ``folder_path`` beneath it if it does not
    exist, and writes the file, replacing any existing file of the same name.

    Args:
        folder_path: Folder path relative to ``MyDrive``.
        file_name: Name of the file to create or overwrite in that folder.
        write_text: Text to write.
    """
    # Always mount Drive explicitly when using Google Drive
    drive.mount('/content/drive', force_remount=True)
    print("Drive mounted.")

    # Wait until MyDrive is available
    mydrive_path = '/content/drive/MyDrive'
    while not os.path.exists(mydrive_path):
        print("Waiting for Drive to be ready...")
        time.sleep(1)

    # Create the destination folder if it doesn't exist
    target_dir = os.path.join(mydrive_path, folder_path)
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, file_name)

    # Explicit UTF-8: the text written here is LLM output and may contain
    # non-ASCII characters, so do not depend on the platform default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(write_text)

    print("File written successfully to:", file_path)


## Utility Function 2: Read from a file

In [None]:
def read_text_from_file(folder_path, file_name):
    """Read a text file from the user's Google Drive.

    Mounts Google Drive at ``/content/drive`` (forcing a remount), waits until
    ``MyDrive`` is visible, and reads ``folder_path/file_name`` beneath it.

    Args:
        folder_path: Folder path relative to ``MyDrive``.
        file_name: Name of the file to read inside that folder.

    Returns:
        The file's contents as a string, or the literal string
        ``"File not found!"`` when nothing exists at that path. Callers must
        check for that sentinel before treating the result as file content.
    """
    # Always mount Drive explicitly when using Google Drive
    drive.mount('/content/drive', force_remount=True)
    print("Drive mounted.")

    # Wait until MyDrive is available
    mydrive_path = '/content/drive/MyDrive'
    while not os.path.exists(mydrive_path):
        print("Waiting for Drive to be ready...")
        time.sleep(1)

    file_path = os.path.join(mydrive_path, folder_path, file_name)

    # The sentinel return (rather than an exception) is kept so existing
    # callers of this helper keep working unchanged.
    if not os.path.exists(file_path):
        return "File not found!"

    # Explicit UTF-8 so the result does not depend on the platform default
    # encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


In [None]:
# Model names passed as `model=` to the chat completion calls in this notebook.

GPT_4o = "gpt-4o"
GPT_4o_mini = f"{GPT_4o}-mini"  # evaluates to "gpt-4o-mini"


In [None]:
# Authenticate with the Hugging Face Hub.
# The token is read from the Colab secret named HF_TOKEN.

hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Sign in to OpenAI using Secrets in Colab

# Read the API key from the Colab secret named OPENAI_API_KEY.
openai_api_key = userdata.get('OPENAI_API_KEY')
# NOTE: `openai` is a client *instance* here (only the OpenAI class is
# imported above); later cells call openai.chat.completions.create(...) on it.
openai = OpenAI(api_key=openai_api_key)

In [None]:
# Load the source text that the rest of the notebook anonymizes.

original_content = read_text_from_file(
    folder_path="Files/Knowledge-Base",
    file_name="A Tale of Two Cities.txt"
)

# read_text_from_file signals a missing file by returning the string
# "File not found!" instead of raising. Stop here in that case; otherwise the
# sentinel would be counted as content below and later sent to the model.
if original_content == "File not found!":
    raise FileNotFoundError(
        "Could not find 'A Tale of Two Cities.txt' under MyDrive/Files/Knowledge-Base"
    )

print(f"The number of characters are : {len(original_content)}")
# str.split() with no separator splits on runs of whitespace, so the length of
# the resulting list is the word count.
number_of_words = len(original_content.split())
print(f"Number of words is : {number_of_words}")
print()
#print(original_content)



# Option 1 : The entire anonymization is done by an LLM - ChatGPT from OpenAI



*   gpt-4o-mini is not good at this.
*   gpt-4o does a good job: the cost is less than 10 cents for about 6000 words (input + output), and it takes about a minute.










In [None]:
# Option 1 prompt: ask the model to rewrite the text with every person's name,
# the story title and the author replaced.
# Adjacent string literals are joined by the parser, so the resulting prompt
# is a single line with no embedded newlines.
system_message = (
    "You are an assistant who receives some text and then replaces the names of people with new names "
    "keeping in mind the gender and nationality of the person in this text. "
    "Also change the name of the story and it's author.Do not put the changed text inside '**' "
)

# The whole source text is sent as the user turn.
user_prompt = "".join(("Here is the text : ", original_content))

messages = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_prompt),
]


In [None]:
# Use GPT_4o for the full rewrite: gpt-4o-mini gave poor results on this task.
completion = openai.chat.completions.create(
    messages=messages,
    model=GPT_4o,
    temperature=0.5,
)


In [None]:
# The anonymized text is the message content of the first returned choice.
anon_data = completion.choices[0].message.content

# Report the size of the reply so it can be compared with the input's size.
print(f"The number of characters are : {len(anon_data)}")
number_of_words = len(anon_data.split())
# str.split() with no separator splits on runs of whitespace, so the length of
# the resulting list is the word count.
print(f"Number of words is : {number_of_words}")

# Uncomment to inspect the full anonymized text.
#print(anon_data)


In [None]:
# Persist the anonymized text to Drive, next to the source text.
write_text_to_file(
    folder_path="Files/Knowledge-Base",
    file_name="Anonymized by OpenAI_TOTC_V4.txt",
    write_text=anon_data,
)

# Option 2 : Use OpenAI to do Named Entity Recognition (NER), then anonymize programmatically by replacing names


## 2.1 : Use OpenAI to do Named Entity recognition (NER) as Python LIST

In [None]:
# Option 2.1 prompt: ask for the named entities as three Python lists.
# Each requested list sits on its own line ("\n") and the examples use square
# brackets, so the examples actually are the Python lists the prompt asks for.
system_message = (
    "You are an assistant that does named entity recognition (NER) based on a text in user prompt. "
    "You will return data as the following python lists\n"
    "people_name = [] example : people_name = ['John Doe', 'Johnny Depp', 'Steffi Graf', 'Priya Das']\n"
    "city_name = [] example : city_name = ['New Delhi', 'New York', 'Bangalore']\n"
    "book_title = [] example : book_title = ['Kidnapped', 'Ramayana', 'Three Musketeers']."
)

# The whole source text is sent as the user turn.
user_prompt = "Here is the text : " + original_content

messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
]

In [None]:
# Entity extraction is attempted with the smaller model.
# The model name comes from the GPT_4o_mini constant, matching the other calls
# in this notebook, instead of repeating the literal here.
# seed=42 is passed to make the output repeatable between runs.
# NOTE(review): confirm against the OpenAI docs how strong that guarantee is.
completion = openai.chat.completions.create(
    model=GPT_4o_mini,
    messages=messages,
    seed=42,
)


In [None]:
# The model's reply text is the message content of the first returned choice.
data_py_list = completion.choices[0].message.content
print(data_py_list)
# Optional: uncomment to save the raw reply to Drive.
#write_text_to_file("Files/Knowledge-Base", "Python LIST _TOTC.txt", data_py_list)

## 2.2 : Use OpenAI to do Named Entity recognition (NER) as JSON

In [None]:
# We spell out the JSON format so that the parsing code can rely on fixed keys
# ("entities", "novel", "settings", "characters", ...) that don't change from
# run to run.
# The example is strictly valid JSON (double quotes, no trailing commas)
# because the reply is parsed with json.loads, which rejects anything else.
system_message = """You are an assistant that does named entity recognition (NER) based on a text in user prompt.
You will identify the name of each and every person and every town and city.
You will return data as the following JSON format:
{
  "entities": {
    "novel": {
      "title": "Three Musketeers",
      "author": "Alexander Dumas"
    },
    "settings": [
      {
        "location": "New York",
        "time_period": "before and during the first world war"
      },
      {
        "location": "Bombay",
        "time_period": "before and during the second world war"
      }
    ],
    "characters": [
      {
        "name": "John Doe",
        "description": "A nice guy."
      },
      {
        "name": "Steffi Graf",
        "description": "A great tennis player."
      }
    ]
  }
}
"""

# The whole source text is sent as the user turn.
user_prompt = "Here is the text : " + original_content

messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
]

In [None]:
# Entity extraction as JSON, using the smaller model (swap in model=GPT_4o to
# try the larger one).
# response_format asks the API for JSON mode so that the reply is a JSON object
# the parsing cell can load directly; the system message already contains the
# word "JSON".
# NOTE(review): confirm the exact JSON-mode requirements in the OpenAI docs.
completion = openai.chat.completions.create(
    model=GPT_4o_mini,
    messages=messages,
    seed=42,
    response_format={"type": "json_object"},
)

# Completeness of the output varies depending on the prompt. Sometimes some
# minor characters and places are missed.

In [None]:
# The model's reply text is the message content of the first returned choice.
data_json_format = completion.choices[0].message.content
print(data_json_format)
# Optional: uncomment to save the raw reply to Drive.
#write_text_to_file("Files/Knowledge-Base", "JSON _TOTC_V4.json", data_json_format)

In [None]:
# Parse the model's reply and collect the names that need to be anonymized.
#
# Results (left as empty lists if parsing fails):
#   people_names - every character name, followed by the author's name
#   other_names  - the novel title, followed by every setting location

# The reply may be wrapped in a Markdown code fence or be bare JSON.
#   - "```(?:json)?" matches the opening fence, with or without a "json" tag.
#   - "(.*?)" is a non-greedy capture group holding everything up to the
#     first closing fence.
#   - re.DOTALL lets "." match newlines so multi-line JSON is captured.
match = re.search(r"```(?:json)?(.*?)```", data_json_format, re.DOTALL)

# No fence found: fall back to treating the whole reply as the JSON text.
json_str = (match.group(1) if match else data_json_format).strip()

#print(json_str)

people_names = []  # names of people (characters + author)
other_names = []   # novel title and locations

try:
    json_obj = json.loads(json_str)
    entities = json_obj["entities"]
    novel = entities["novel"]
    title = novel["title"]
    author = novel["author"]

    settings = entities["settings"]      # a list of dicts, each with a "location"
    characters = entities["characters"]  # a list of dicts, each with a "name"

    # Extract everything first so that a missing key cannot leave one of the
    # two result lists filled and the other empty.
    locations = [setting["location"] for setting in settings]
    character_names = [character["name"] for character in characters]

    other_names = [title, *locations]
    # The author is a person too, so their name needs anonymizing as well.
    people_names = [*character_names, author]

except json.JSONDecodeError as e:
    print("JSON parsing error:", e)
    print("Raw data:", json_str)

except (KeyError, TypeError) as e:
    # Valid JSON that does not follow the structure requested in the prompt
    # (a key is missing, or a list/dict is of the wrong type).
    print("Unexpected JSON structure:", repr(e))
    print("Raw data:", json_str)

else:
    for people_name in people_names:  # print names one by one
        print(people_name)

    print("-----------------------------")

    for other in other_names:  # print name of novel, cities etc.
        print(other)


## Note :
You can now write some logic in Python to replace these names with other names, to anonymize the people, cities and book title.
But making it generic is a challenge, and probably not worth it given that OpenAI has already done the anonymization for you in Option 1.