In [1]:
# import os
import google.generativeai as genai
import json
import time
import csv
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Configure the API key
genai.configure(api_key=GEMINI_API_KEY)


In [3]:
# Define the system instruction
SYSTEM_PROMPT = """You are an expert annotator specializing in personal information NER tagging. For each request, generate a unique and diverse sentence covering different aspects of their personal information such as name, city, work, gender or email."""

In [4]:
USER_PROMPT = """Generate one unique personal information sentence with NER tags in JSON format. Each generated sentence should be different by:
1. Using different combinations of keys and entities
2. Varying sentence structures and personal information
3. Include only meaningful details provided in the json object
4. Every word should be proffesional-speaking


NER Tag Definitions:
{
    "O": "Non-entity tokens"
    "B-NAME": "Start of indian name - (e.g. Amit, Rajesh, Ankit, Brajraj, Rekha, Arshad, Niraj, Jyoti)",
    "I-NAME": "Continuation of names - (e.g Solanki, Kalathia, Singh)",
    "B-GENDER": "Value of gender - (e.g. Male, Female, Transgender)",
    "B-DOB": "Start of Birthdate - (e.g. March 18 2003, 02/01/2004, 08-06-1998)",
    "I-DOB": "Continuation of Birthdate",
    "B-NATIONALITY": "Start of Nationality - (e.g. Indian, American, Chinese, Japanese)",
    "I-NATIONALITY": "Continuation of Nationality",
    "B-CITY": "Start of city name - (e.g. Jodhpur, Bangalore, Hyderabad, Mumbai, Pune)",
    "I-CITY": "Continuation of city name",
    "B-EMAIL": "Start of email address - (e.g., indianindian23@gmail.com, barjrajshivani776@gmail.com)",
    "B-PHONE": "Start of phone number - (e.g. 874 892 4670, 91 7140728911, +9180679 73921, 75081 18639)",
    "I-PHONE": "Continuation of phone number",
    "B-AADHAR": "Start of aadhar card number - (e.g. 257574488836, 800476797877)",
    "I-AADHAR": "Continuation of aadhar card number",
    "B-PAN": "Start of pan number - (e.g., XJVMV8282Y, CKNJX9036X, WZIRX3768L)",
    "B-VOTER": "Start of voter identitiy number - (e.g., KGI6158551, ILX3627901)",
    "B-PASSPORT": "Start of passport number - (e.g., K4190715, S1721121, G48 38859)",
    "I-PASSPORT": "Continuation of passport number",
    "B-LICENSE": "Start of driving license number - (e.g., TN87 20075880927, JK01 20104637542, TR-7120118774157)",
    "I-LICENSE": "Continuation of driving license number",
    "B-ACCOUNTNUMBER": "Start of bank account number - (e.g., 11389823690, 71717839372, 78339305069)",
    "I-ACCOUNTNUMBER": "Continuation of bank account number",
    "B-BANKIFSC": "Start of bank ifsc code - (e.g., IDIB0755055, ICIC0313449, SBIN0547870)",
    "B-CARDNUMBER": "Start of atm card number - (e.g., 40265307837299051, 50786592905011426, 40001623133171904)",
    "B-CVV": "Start of cvv code of atm card - (e.g., 837, 318, 635)",
    "B-IP": "Start of ip address - (e.g., 46.81.232.82, 222.15.54.4, 167.77.106.135)",
}

Rules:
1. Generate exactly one sentence
2. Use proper BIO tagging
3. Keep measurements as single tokens

Required Output Structure:
{
    "sentence": "Complete sentence here",
    "annotations": [
        {
            "token": "word1",
            "tag": "TAG1"
        },
        {
            "token": "word2",
            "tag": "TAG2"
        }
    ]
}

Example Output:
{
    "sentence": "Hello Amit Solanki, I am regret to inform that your aadhar number 437623083136 is not linked with your mobile number 72320 09355, kindly check that.",
    "annotations": [
        {
            "token": "Hello",
            "tag": "O"
        },
        {
            "token": "Amit",
            "tag": "B-NAME"
        },
        {
            "token": "Solanki",
            "tag": "I-NAME"
        },
        {
            "token": ",",
            "tag": "O"
        },
        {
            "token": "I",
            "tag": "O"
        },
        {
            "token": "am",
            "tag": "O"
        },
        {
            "token": "regret",
            "tag": "O"
        },
        {
            "token": "to",
            "tag": "O"
        },
        {
            "token": "inform",
            "tag": "O"
        },
        {
            "token": "that",
            "tag": "O"
        },
        {
            "token": "your",
            "tag": "O"
        },
        {
            "token": "aadhar",
            "tag": "O"
        },
        {
            "token": "number",
            "tag": "O"
        },
        {
            "token": "437623083136",
            "tag": "B-AADHAR"
        },
        {
            "token": "is",
            "tag": "O"
        },
        {
            "token": "not",
            "tag": "O"
        },
        {
            "token": "linked",
            "tag": "O"
        },
        {
            "token": "with",
            "tag": "O"
        },
        {
            "token": "your",
            "tag": "O"
        },
        {
            "token": "mobile",
            "tag": "O"
        },
        {
            "token": "number",
            "tag": "O"
        },
        {
            "token": "72320",
            "tag": "B-PHONE"
        },
        {
            "token": "09355",
            "tag": "I-PHONE"
        },
        {
            "token": ",",
            "tag": "O"
        },
        {
            "token": "kindly",
            "tag": "O"
        },
        {
            "token": "check",
            "tag": "O"
        },
        {
            "token": "that",
            "tag": "O"
        },
        {
            "token": ".",
            "tag": "O"
        },

    ]
}

Generate one new example following this exact JSON structure. Ensure the sentence is different from previous examples and combines multiple technical aspects."""

In [12]:
generation_config = genai.GenerationConfig(
    response_mime_type='application/json',
)

In [14]:
# Select the Gemini model
model = genai.GenerativeModel('gemini-1.5-flash', system_instruction = SYSTEM_PROMPT, generation_config=generation_config)

In [40]:
def parse_ner_json(json_string):
    """
    Parse the JSON output from a NER model into a dictionary of:
    - text: the original sentence
    - tokens: a list of tokens in the sentence
    - ner_tags: a list of NER tags corresponding to the tokens
    """
    data = json.loads(json_string)
    tokens = [ann["token"] for ann in data["annotations"]]
    tags = [ann["tag"] for ann in data["annotations"]]

    return {"text": data["sentence"], "tokens": tokens, "ner_tags": tags}

In [56]:
def generate_text(all_user_data):

    i = 0

    texts = [{"text": "Hello Amit Solanki, I am pleased to inform that your aadhar number 437623083136 is not linked with your mobile number 72320 09355, kindly check that."}]
    
    for user_data in all_user_data:
        prompt = ""

        for key, value in user_data.items():
            if value != "":  # Only include fields that have a value
                prompt += f"{key.replace('_', ' ').capitalize()}: {value}\n"

        try:
            # Generate content
            response = model.generate_content(f"{USER_PROMPT}. This is the user data: {prompt}, sentence pattern to avoid : {texts[-1]['text']}")
    
            data = parse_ner_json(response.text)
    
            texts.append(data)
            
            # Output the generated sentence
            print("User count : ", i , "\n", data, "\n ", type(data), "\n\n")
            i+=1
    
            time.sleep(5)
            
        except Exception as e:
            print(e)
            break

    return texts

In [54]:
with open("user_data.json", "r") as file:
    all_user_data = json.load(file)

In [61]:
texts = generate_text(all_user_data)

User count :  0 
 {'text': 'The voter with ID UDR6099075, residing in Amet, has an email address barjrajshivani194@beyoung.in and phone number 87485 72655, and their PAN is FRIKM6512V.', 'tokens': ['The', 'voter', 'with', 'ID', 'UDR6099075', ',', 'residing', 'in', 'Amet', ',', 'has', 'an', 'email', 'address', 'barjrajshivani194@beyoung.in', 'and', 'phone', 'number', '87485', '72655', ',', 'and', 'their', 'PAN', 'is', 'FRIKM6512V', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'B-VOTER', 'O', 'O', 'O', 'B-CITY', 'O', 'O', 'O', 'O', 'O', 'B-EMAIL', 'O', 'O', 'O', 'B-PHONE', 'I-PHONE', 'O', 'O', 'O', 'O', 'O', 'B-PAN', 'O']} 
  <class 'dict'> 


User count :  1 
 {'text': 'Mr. Rajesh Kumar, a Male born on 02-09-1968, residing in Bangalore, possesses Aadhaar number 951523422990 and PAN number RRDDW5373Y, and his CVV is 190.', 'tokens': ['Mr.', 'Rajesh', 'Kumar', ',', 'a', 'Male', 'born', 'on', '02-09-1968', ',', 'residing', 'in', 'Bangalore', ',', 'possesses', 'Aadhaar', 'number', '951523422990',

## Converting json to csv

In [7]:
texts.pop(0)

{'text': 'Hello Amit Solanki, I am pleased to inform that your aadhar number 437623083136 is not linked with your mobile number 72320 09355, kindly check that.'}

In [8]:
texts[0]

{'text': 'The passport application of Ms.Anjali Sharma, born on 06/10/2018, with voter ID JKR4943350 and phone number +918657189773, is currently being processed.',
 'tokens': ['The',
  'passport',
  'application',
  'of',
  'Ms.',
  'Anjali',
  'Sharma',
  ',',
  'born',
  'on',
  '06/10/2018',
  ',',
  'with',
  'voter',
  'ID',
  'JKR4943350',
  'and',
  'phone',
  'number',
  '+918657189773',
  ',',
  'is',
  'currently',
  'being',
  'processed',
  '.'],
 'ner_tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-NAME',
  'I-NAME',
  'O',
  'O',
  'O',
  'B-DOB',
  'O',
  'O',
  'O',
  'O',
  'B-VOTER',
  'O',
  'O',
  'O',
  'B-PHONE',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']}

In [9]:
text = []
tokens = []
ner_tags = []

In [10]:
for item in texts:
    text.append(item.get("text", ""))
    tokens.append(item.get("tokens", []))
    ner_tags.append(item.get("ner_tags", []))


In [13]:
# File to save the data
output_file = "texts.csv"

# Save to CSV
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(["text", "tokens", "ner_tags"])
    # Write rows
    for text_data, token_list, ner_tag_list in zip(text, tokens, ner_tags):
        writer.writerow([text_data, token_list, ner_tag_list])

print(f"Data successfully saved to {output_file}")

Data successfully saved to texts.csv
