In [7]:
import os
import random
import re

import pandas as pd
import tiktoken
from openai import OpenAI


In [None]:
"""
#Test Connection

# Initial (the key is read from the environment; never paste it into the notebook)
client = OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com/v1",
)

# Test
try:
    response = client.chat.completions.create(
        model="deepseek-chat",  # Change for test
        messages=[{"role": "user", "content": "Reply 'Test OK'"}],
        max_tokens=5
    )
    print("✅ ", response.choices[0].message.content)
except Exception as e:
    print("❌ ", e)

    """

✅  Test OK


In [None]:
# Shared DeepSeek client (OpenAI-compatible endpoint) used by every request below.
# The API key is read from the environment so it is never stored in the notebook
# or its version history. Set DEEPSEEK_API_KEY before starting the kernel.
_api_key = os.environ.get("DEEPSEEK_API_KEY")
if not _api_key:
    # Fail here with a clear message instead of on the first API call.
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; export it before running this notebook."
    )

client = OpenAI(
    api_key=_api_key,
    base_url="https://api.deepseek.com/v1",
)


In [8]:
# CORE PARAMETERS

# Scenario labels for sentence generation. The generation loop draws one of
# these at random per attempt and inserts it verbatim into the user prompt.
SCENARIOS = [
    "Banking Notification",
    "Customer Service Chat",
    "Fraud Alert",
    "Insurance Claim",
    "Legal Contract",
    "Appointment Reminder",
    "Self Introduction",
    "Loan Approval Notification",
    "Credit Card Application",
    "Technical Support",
    "Tax Document"
    ]

# All PII placeholder tokens that may appear in generated sentences.
# This list is used to validate model output, to extract the tags of each
# sentence, and to fix the row order of the per-placeholder count table.
PLACEHOLDERS = [
    "[NAME]", "[EMAIL]", "[PHONE_NUMBER]", "[STREET_ADDRESS]", "[PASSPORT_NUMBER]",
    "[DRIVER_LICENSE_NUMBER]", "[CREDIT_CARD_NUMBER]", "[DATE]", "[SSN]", "[POSTAL_CODE]"
]

# SENTENCE TEMPLATES
# Example sentence skeletons grouped by grammatical structure.
# These are plain strings, not f-strings: "{placeholder}" is literal text that
# is passed to the model unchanged inside the user prompt; the code never calls
# .format() on them.
STRUCTURE_TEMPLATES = {
    "interrogative": [
        "Has the {placeholder} associated with {placeholder} been verified?",
        "Could you confirm the {placeholder} linked to {placeholder}?"
    ],
    "conditional": [
        "If {placeholder} doesn't match {placeholder}, please contact support.",
        "Unless {placeholder} is updated, {placeholder} will expire."
    ],
    "narrative": [
        "During the {placeholder} audit, discrepancies in {placeholder} were found.",
        "Following the {placeholder} submission, the {placeholder} requires validation."
    ],
    "imperative": [
        "Update {placeholder} and {placeholder} in your profile immediately.",
        "Verify both {placeholder} and {placeholder} before proceeding."
    ]
}

# PROMPT MESSAGE
# System prompt sent with every generation request.
# NOTE: this is an f-string evaluated once, when this cell runs. The
# random.choice(["1-2", "2-3"]) below therefore picks a single value that stays
# fixed for all later requests until the cell is re-executed; it does not vary
# per request.
# NOTE(review): the "Max length: 25 tokens" rule is only stated to the model;
# the request itself uses max_tokens=35 and no token count is checked in code.
SYSTEM_PROMPT = f"""You are an expert in multi-domain text generation. Please follow the guidelines below:

[GENERATION RULES]
1. Structure Rotation: Switch the template type (interrogative/conditional/narrative/imperative) every 5 sentences.
2. Placeholder Strategy:
   - Must include {random.choice(["1-2", "2-3"])} different placeholders.
   - Repeating the same placeholder pair consecutively is prohibited.
   - Placeholder positions must match the current template structure.
3. Scenario Adaptation:
   █ Technical Support: include "troubleshoot", "configure", "debug"
   █ Legal Contract: include "hereby", "notwithstanding", "whereas"
4. Variation Rules:
   - One out of every 5 sentences must use passive voice.
   - One out of every 5 sentences must include a domain-specific term.
   - Alternate between American and British spelling (e.g., color vs colour)

[TECHNICAL SPECIFICATIONS]
- Max length: 25 tokens
- Placeholder list: {", ".join(PLACEHOLDERS)}
- No real data allowed

Output format: Only return the final sentence. Do not include any additional text."""


In [9]:
def count_tokens(text):
    """Return how many tokens `text` occupies under tiktoken's "cl100k_base" encoding."""
    encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)

def check_repetition(text, memory=None, n=15):
    """Character n-gram repetition check.

    Slides a window of ``n`` characters over ``text`` and reports whether any
    window is contained in ``memory``.

    :param text: Candidate sentence to check.
    :param memory: Collection of previously seen fragments, tested with ``in``.
        Defaults to ``None``, meaning "nothing seen yet". A ``None`` default
        replaces the former mutable ``[]`` default so no list is shared
        between calls.
    :param n: Window length in characters (default 15, the original value).
    :return: ``True`` if at least one window of ``text`` is in ``memory``,
        otherwise ``False``.

    The function is read-only: it never adds anything to ``memory``. A caller
    that passes no memory therefore always gets ``False``.
    """
    if memory is None:
        return False

    if len(text) < n:
        # Too short to form a full window: compare the whole text instead of
        # silently producing no candidates. Empty text never matches.
        ngrams = [text] if text else []
    else:
        ngrams = (text[i:i + n] for i in range(len(text) - n + 1))

    return any(ng in memory for ng in ngrams)

def post_process(text):
    """Validate a generated sentence.

    The sentence is accepted when it contains between one and three distinct
    placeholder types from PLACEHOLDERS and check_repetition does not flag it.

    :param text: Generated sentence.
    :return: `text` unchanged when accepted, otherwise None.
    """
    # Distinct placeholder types present; repeats of the same type count once
    present = [ph for ph in PLACEHOLDERS if ph in text]
    count_ok = 1 <= len(present) <= 3

    # The repetition check only runs when the placeholder count is valid
    if count_ok and not check_repetition(text):
        return text
    return None

def generate_diverse_text(scenario):
    """Ask the chat model for one templated sentence for `scenario`.

    A sentence template, the sampling parameters, two placeholders, a style
    and a "must include" hint are drawn at random, sent to the model together
    with SYSTEM_PROMPT, and the reply is passed through post_process.

    :param scenario: Scenario label inserted into the user prompt.
    :return: The stripped model reply when post_process accepts it, otherwise
        None. None is also returned when the API response has no choices or
        no text content.
    """
    # Randomly select a template family, then one template from it
    template_type = random.choice(list(STRUCTURE_TEMPLATES.keys()))
    template = random.choice(STRUCTURE_TEMPLATES[template_type])

    # Vary the sampling parameters on every request
    dynamic_params = {
        "temperature": random.uniform(0.7, 1.0),
        "frequency_penalty": random.choice([0.5, 0.7, 1.0]),
        "presence_penalty": random.choice([0.3, 0.5]),
        "top_p": random.uniform(0.85, 0.95)
    }

    # Random prompt ingredients, drawn in the same order as before
    placeholder_pair = random.sample(PLACEHOLDERS, 2)
    style = random.choice(["concise", "detailed", "cautionary"])
    must_include = random.choice(["technical term", "legal phrase", "colloquial expression"])

    # Construct dynamic prompt. The space after the template is part of the
    # original prompt text and is kept so the request stays unchanged.
    user_prompt = (
        f"Generate a {scenario} sentence using:\n"
        f"- Template: {template} \n"
        f"- Placeholders: {placeholder_pair}\n"
        f"- Style: {style}\n"
        f"- Must include: {must_include}"
    )

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=35,
        **dynamic_params
    )

    # The response may have no choices, and `message.content` is optional in
    # the API schema. Treat both as a rejected attempt instead of crashing on
    # `.strip()`.
    if not response.choices:
        return None
    content = response.choices[0].message.content
    if content is None:
        return None

    return post_process(content.strip())

def extract_placeholders(text):
    """Return every placeholder found in `text`, in order of appearance (repeats kept)."""
    # Escape each token so the square brackets are matched literally
    alternatives = "|".join(map(re.escape, PLACEHOLDERS))
    matcher = re.compile("(" + alternatives + ")")
    return matcher.findall(text)

In [11]:
# Generate `max_number` accepted sentences and save the results
max_number = 10      # target number of accepted sentences
max_attempts = 2000  # upper bound on API calls so the loop always terminates
results = []
attempts = 0
while len(results) < max_number and attempts < max_attempts:
    attempts += 1
    # Randomly select a scenario
    scenario = random.choice(SCENARIOS)

    sentence = generate_diverse_text(scenario)
    if sentence is None:
        # Rejected by post-processing; try again
        continue
    pii_types = extract_placeholders(sentence)
    print(f"# {attempts} attempt in {scenario}: {sentence}")
    results.append({"text": sentence, "pii_types": pii_types})

# Report a shortfall instead of silently producing a smaller dataset
if len(results) < max_number:
    print(f"Warning: only {len(results)} of {max_number} sentences accepted after {attempts} attempts")

# Create DataFrame
df = pd.DataFrame(results, columns=["text", "pii_types"])
df


# 1 attempt in Banking Notification: During the [PASSPORT_NUMBER] audit, discrepancies in [NAME]'s account were found - looks like something's fishy!
# 2 attempt in Banking Notification: Following the [SSN] submission, the [EMAIL] requires validation through two-factor authentication.
# 3 attempt in Credit Card Application: Could you confirm the [EMAIL] linked to [NAME] for payment gateway integration?
# 4 attempt in Credit Card Application: During the [DRIVER_LICENSE_NUMBER] audit, discrepancies in [STREET_ADDRESS] were flagged by the fraud detection algorithm.
# 5 attempt in Self Introduction: Verify both [PHONE_NUMBER] and [STREET_ADDRESS] before moving forward, just to be on the safe side.
# 6 attempt in Self Introduction: Verify both [DRIVER_LICENSE_NUMBER] and [PHONE_NUMBER] before proceeding, notwithstanding any prior authorisations.
# 7 attempt in Tax Document: During the [PASSPORT_NUMBER] audit, discrepancies in [PHONE_NUMBER] were found, which really threw us for a loop.
# 8 

Unnamed: 0,text,pii_types
0,"During the [PASSPORT_NUMBER] audit, discrepanc...","[[PASSPORT_NUMBER], [NAME]]"
1,"Following the [SSN] submission, the [EMAIL] re...","[[SSN], [EMAIL]]"
2,Could you confirm the [EMAIL] linked to [NAME]...,"[[EMAIL], [NAME]]"
3,"During the [DRIVER_LICENSE_NUMBER] audit, disc...","[[DRIVER_LICENSE_NUMBER], [STREET_ADDRESS]]"
4,Verify both [PHONE_NUMBER] and [STREET_ADDRESS...,"[[PHONE_NUMBER], [STREET_ADDRESS]]"
5,Verify both [DRIVER_LICENSE_NUMBER] and [PHONE...,"[[DRIVER_LICENSE_NUMBER], [PHONE_NUMBER]]"
6,"During the [PASSPORT_NUMBER] audit, discrepanc...","[[PASSPORT_NUMBER], [PHONE_NUMBER]]"
7,"Following the [POSTAL_CODE] submission, the [S...","[[POSTAL_CODE], [STREET_ADDRESS]]"
8,"Following the [EMAIL] submission, the [DATE] r...","[[EMAIL], [DATE]]"
9,Could you confirm the [CREDIT_CARD_NUMBER] lin...,"[[CREDIT_CARD_NUMBER], [PASSPORT_NUMBER]]"


In [None]:
# Save to CSV
# Writes the generated sentences to 'dataset_with_placeholder.csv' in the
# current working directory, without the index column.
# NOTE(review): `pii_types` holds Python lists; confirm how downstream readers
# parse that column after a CSV round trip.

df.to_csv('dataset_with_placeholder.csv', index=False)

In [None]:
def count_pii_occurrences(df, placeholders=None):
    """
    Count the number of occurrences of all placeholders (including duplicates)

    :param df: DataFrame containing the 'pii_types' column; each cell is
        expected to hold a list of placeholder strings
    :param placeholders: Placeholders to report, in output order. Defaults to
        the module-level PLACEHOLDERS list.
    :return: Dictionary mapping every requested placeholder to its count
        (0 when it never occurs), ordered like `placeholders`, not by count
    """
    if placeholders is None:
        placeholders = PLACEHOLDERS

    # One row per individual tag; rows from empty lists become NaN, which
    # value_counts drops by default
    counts = df['pii_types'].explode().value_counts().to_dict()

    # A single pass gives both the zero-filling and the predefined order
    return {ph: counts.get(ph, 0) for ph in placeholders}

# Usage example: count placeholder occurrences in the generated dataset `df`
pii_counts = count_pii_occurrences(df)

# Output to DataFrame: one row per placeholder (index) with a single 'Count' column
count_df = pd.DataFrame.from_dict(pii_counts, orient='index', columns=['Count'])
print("PII Counts")
print(count_df)


PII Counts
                         Count
[NAME]                      91
[EMAIL]                     98
[PHONE_NUMBER]             113
[STREET_ADDRESS]           105
[PASSPORT_NUMBER]           81
[DRIVER_LICENSE_NUMBER]     87
[CREDIT_CARD_NUMBER]       108
[DATE]                     106
[SSN]                      119
[POSTAL_CODE]               92
