In [3]:
global_system_prompt = """You are a specialized compliance auditing assistant designed to analyze call center transcripts for privacy violations. Your task is to detect instances where agents share sensitive financial information without properly verifying the customer's identity.

COMPLIANCE REQUIREMENTS:
1. Agents MUST verify a customer's identity using at least one of these methods BEFORE sharing sensitive information:
   - Date of Birth (DOB) verification
   - Address verification
   - Social Security Number (partial or full) verification

2. Sensitive information includes:
   - Account balances
   - Account numbers
   - Transaction history
   - Credit limits
   - Loan details
   - Payment information

3. The verification and sensitive information sharing can happen anywhere in the conversation, but verification MUST occur before sensitive information is disclosed.

Analyze the entire call transcript carefully and determine:
- Whether identity verification occurred (Yes/No)
- What verification method was used (DOB/Address/SSN/None)
- Whether sensitive information was shared (Yes/No)
- What type of sensitive information was shared (be specific)
- Whether a compliance violation occurred (Yes/No)

Your analysis should be thorough, objective, and based solely on the content of the transcript."""

In [28]:
import os
import json
import pandas as pd
from groq import Groq

class PrivacyComplianceDetector:
    def __init__(self, api_key=None):
        """Initialize the compliance detector with Groq API"""
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable or pass it directly.")
        
        self.client = Groq(api_key=self.api_key)
        self.system_prompt = global_system_prompt
    
    def format_transcript(self, transcript_data):
        """Format the transcript data into a readable format for the model"""
        formatted_text = []
        for entry in transcript_data:
            speaker = entry.get("speaker", "Unknown")
            text = entry.get("text", "")
            time_start = entry.get("stime", "")
            time_end = entry.get("etime", "")
            formatted_text.append(f"{speaker} [{time_start}-{time_end}]: {text}")
        
        return "\n".join(formatted_text)
    
    def analyze_call_transcript(self, call_id, transcript_data):
        """Analyze a call transcript for privacy compliance violations"""
        formatted_transcript = self.format_transcript(transcript_data)
        
        # Prepare the user prompt with the transcript to analyze
        user_prompt = f"""Please analyze this call center transcript for privacy compliance violations:

CALL TRANSCRIPT (ID: {call_id}):
{formatted_transcript}

Provide your analysis in the following JSON format:
{{
    "verification_performed": true/false,
    "verification_method": "DOB/Address/SSN/Multiple/None",
    "sensitive_info_shared": true/false,
    "sensitive_info_type": "Description of information shared",
    "is_violation": true/false,
    "explanation": "Detailed explanation of your findings"
}}"""

        # Generate the full prompt
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        
        # Call the Groq API with Llama 3.3
        try:
            response = self.client.chat.completions.create(
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                messages=messages,
                temperature=0.2,  # Lower temperature for more deterministic responses
                max_tokens=1024,
                response_format={"type": "json_object"}
            )
            
            print(response)
            
            # Parse the model's response
            response_content = response.choices[0].message.content
            result = json.loads(response_content)
            
            # Add the call_id to the result
            result["call_id"] = call_id
            return result
            
        except Exception as e:
            print(f"Error analyzing call {call_id}: {e}")
            return {
                "call_id": call_id,
                "error": str(e),
                "is_violation": False,
                "verification_performed": False,
                "sensitive_info_shared": False
            }
    
    def batch_process_calls(self, call_data_list):
        """Process multiple call transcripts and identify violations"""
        results = []
        
        for call_data in call_data_list:
            call_id = call_data.get("call_id", "unknown")
            transcript = call_data.get("transcript", [])
            
            analysis = self.analyze_call_transcript(call_id, transcript)
            results.append(analysis)
            
            # If there's a violation, print it out
            if analysis.get("is_violation", False):
                print(f"⚠️ Violation detected in call {call_id}")
        
        # Convert results to DataFrame for easier analysis
        df_results = pd.DataFrame(results)
        
        # Filter for violations only
        # violations_df = df_results[df_results["is_violation"] == True]
        # violations_df.to_csv("violations.csv", index=False)
        
        return df_results
    
    def process_directory(self, directory_path, limit=-1, save_to_csv=True):
        """Process all JSON files in a directory"""
        call_data_list = []
        
        # List all JSON files in the directory
        json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
        print(f"Found {len(json_files)} JSON files to process")
        
        for idx, filename in enumerate(json_files):
            if limit > 0 and idx >= limit:
                break
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                call_data_list.append({
                    "call_id": filename,
                    "transcript": data
                })
                
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
        
        # Process all calls and get violations
        results_df = self.batch_process_calls(call_data_list)
        if save_to_csv:
            results_df.to_csv("compliance_violations.csv", index=False)
            print("Results saved to compliance_violations.csv")
        return self.batch_process_calls(call_data_list)

In [29]:
x = PrivacyComplianceDetector()

In [23]:
with open("../All_Conversations/00be25b0-458f-4cbf-ae86-ae2ec1f7fba4.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

x.format_transcript(data)

"Agent [0-7]: Hello, is this Mr. Johnson? This is Lisa calling from XYZ Collections. How are you today?\nCustomer [6.5-12]: I'm sorry, but I think you have the wrong person. My name is Sarah.\nAgent [11-19]: Oh, I apologize for the confusion, Sarah. I'm reaching out about a debt related to an outstanding balance with Definite Bank.\nCustomer [18-24]: I don't have any account with Definite Bank. You might want to check your records.\nAgent [23-30]: Thank you for letting me know. I will make a note to update our records.\nCustomer [29-34]: I appreciate that. Is there anything else I need to do?\nAgent [33-40]: No, that's all. I'm sorry for any inconvenience caused. Have a great day!\nCustomer [39-42]: Thank you, you too!"

In [30]:
p = x.process_directory("../All_Conversations", limit=10)

Found 250 JSON files to process
ChatCompletion(id='chatcmpl-4a41ad83-80a2-497c-9fc5-2b073b308834', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "verification_performed": false,\n  "verification_method": "None",\n  "sensitive_info_shared": false,\n  "sensitive_info_type": "None",\n  "is_violation": false,\n  "explanation": "The agent did not share any sensitive financial information, and there was no attempt to verify the customer\'s identity. The conversation was about adding the customer\'s number to the do not call list, which does not require identity verification or sharing sensitive information."\n}', role='assistant', executed_tools=None, function_call=None, reasoning=None, tool_calls=None))], created=1745057653, model='meta-llama/llama-4-scout-17b-16e-instruct', object='chat.completion', system_fingerprint='fp_37da608fc1', usage=CompletionUsage(completion_tokens=99, prompt_tokens=560, total_tokens=659, completion_tim