<a href="https://colab.research.google.com/github/TOM-BOHN/SFDC-User-Permissions-AI/blob/main/Notebooks/SFDC_User_Permission_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

### Install Repo and Packages


In [7]:
import os

repo_url = "https://github.com/TOM-BOHN/SFDC-User-Permissions-AI.git"
repo_dir = "SFDC-User-Permissions-AI"  # Directory name for the cloned repo

if os.path.exists(repo_dir):
  # If the directory exists, pull the latest changes
  print(f"Repository '{repo_dir}' already exists. Pulling latest changes...")
  !git pull

else:
  # If the directory doesn't exist, clone the repo
  print(f"Cloning repository '{repo_dir}'...")
  !git clone {repo_url}

Cloning repository 'SFDC-User-Permissions-AI'...
Cloning into 'SFDC-User-Permissions-AI'...
remote: Enumerating objects: 266, done.[K
remote: Counting objects: 100% (266/266), done.[K
remote: Compressing objects: 100% (193/193), done.[K
remote: Total 266 (delta 130), reused 178 (delta 65), pack-reused 0 (from 0)[K
Receiving objects: 100% (266/266), 981.30 KiB | 7.61 MiB/s, done.
Resolving deltas: 100% (130/130), done.


In [None]:
# Install the Python SDK for google gen ai
!pip install -Uq "google-genai==1.7.0"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/144.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m143.4/144.7 kB[0m [31m39.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### Add Required Libraries and Tools

In [5]:
from google import genai
from google.genai import types

from IPython.display import Markdown, display

genai.__version__

###################################

import sys
import os
import time
from datetime import datetime

import enum
import json

import pandas as pd

###################################

os.chdir('/content/SFDC-User-Permissions-AI')

###################################

# Import the processing functions
from src.processing import extract_json_fields
from src.utils.data_utils import save_data
from src.llms import (
    # Chat session management
    create_chat_session,

    # Category evaluation
    category_eval_summary,
    CategoryRating,
    CategoryLabel,
    classify_category,

    # Risk evaluation
    risk_eval_summary,
    RiskRating,
    classify_risk_rating
)

###################################

# Import the scraping functions
from src.scraping import (
    extract_permission_data,
    clean_permission_data,
    save_permission_data,
    scrape_permissions_from_file
)

### Setup the API key and Client

To run the following cell, your API key must be stored it in a Google secret named `GOOGLE_API_KEY`.

If you don't already have an API key, you can grab one from [AI Studio](https://aistudio.google.com/app/apikey). You can find [detailed instructions in the docs](https://ai.google.dev/gemini-api/docs/api-key).

In [None]:
# To run the following cell, your API key must be stored it in a [Google secret] named `GOOGLE_API_KEY`.
from google.colab import userdata
# Create the client
client = genai.Client(api_key=userdata.get('GOOGLE_API_KEY'))

### Automated Retry Functionality

In [None]:
# This codelab sends a lot of requests, so set up an automatic retry
# that ensures your requests are retried when per-minute quota is reached.
from google.api_core import retry

is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})

if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
  genai.models.Models.generate_content = retry.Retry(
      predicate=is_retriable)(genai.models.Models.generate_content)

# Scrape the Permissions

In [None]:
html_file_paths = [
    '/content/SFDC-User-Permissions-AI/data/raw/perm_sets_app_perms_salesforce_00DgK000001iK7J.mhtml',
    '/content/SFDC-User-Permissions-AI/data/raw/perm_sets_system_perms_salesforce_00DgK000001iK7J.mhtml',
    '/content/SFDC-User-Permissions-AI/data/raw/perm_sets_app_perms_salesforce_00DHu000002irdp.mhtml',
    '/content/SFDC-User-Permissions-AI/data/raw/perm_sets_system_perms_salesforce_00DHu000002irdp.mhtml',
]
df = scrape_permissions_from_file(
    html_file_paths  = html_file_paths
  , output_path = None
)

In [None]:
df.head(100)

# Import Source Data

In [None]:
# Load the input table from the source
url = "https://raw.githubusercontent.com/TOM-BOHN/SFDC-User-Permissions-AI/refs/heads/main/data/input/User_Permission_Reference_Data__Sample.csv"
perm_list_df = pd.read_csv(url)
perm_list_df.head()

Unnamed: 0,Permission Name,API Name,Description
0,Access Data Cloud Data Explorer,AccessCdpDataExplorer,Allows user access Data Cloud Data Explorer.
1,Administer territory operations,ManageTerritories,Prerequisite user permission for a user to man...
2,Allow sending of List Emails,ListEmailSend,"Allow users to create, edit and send List Emails"
3,Api Only User,ApiUserOnly,Access Salesforce.com only through a Salesforc...
4,Author Apex,AuthorApex,Create Apex classes and triggers.


# Permission Risk Rating

In [None]:
# Load the promt from the source file
with open('/content/SFDC-User-Permissions-AI/src/prompts/templates/prompt_user_perm_risk_rating.md', 'r') as f:
    PROMPT_USER_PERM_RISK_RATING = f.read()
# Display the prompt
print(PROMPT_USER_PERM_RISK_RATING)

# Permission Risk Evaluation Prompt Template  
# --------------------------------------------------
# This template can be imported and formatted with the specific
# `permission_name` and `permission_api_name` and `permission_description` variables to create
# a concrete evaluation prompt for any Salesforce permission.
# --------------------------------------------------

# Instruction
You are a **Salesforce security risk assessor**.
Your task is to evaluate the **inherent risk level** of a Salesforce permission (or capability) when granted to a user.
We will provide you with the permission name and a short description of what it allows.
Analyze the permission against the **Evaluation Criteria** below and assign one of the five **Risk Levels** defined in the Rating Rubric.
Give step‑by‑step reasoning for your decision, citing the specific criteria that most influenced your rating.

# Evaluation

## Metric Definition
**Permission Risk** [aka weighted_score] measures the potential negati

In [None]:
words = len(PROMPT_USER_PERM_RISK_RATING.split())
chars = len(PROMPT_USER_PERM_RISK_RATING)
print(f"\nPrompt Template Statistics:")
print(f"Total words: {words}")
print(f"Total characters: {chars}")

In [None]:
# Create a new chat session
risk_chat_session = create_chat_session(client = client, model_name='gemini-2.0-flash')

# Evaluate a single permission
risk_text_eval, risk_struct_eval = risk_eval_summary(
    prompt=PROMPT_USER_PERM_RISK_RATING,
    name=perm_list_df['Permission Name'][0],
    api_name=perm_list_df['API Name'][0],
    description=perm_list_df['Description'][0],
    model_name='gemini-2.0-flash',
    client=client,
    chat_session=risk_chat_session  # Reuse the same session
)

# Display the result
print(risk_text_eval)
print(f"Risk Rating: {risk_struct_eval.name} ({risk_struct_eval.value})")

Risk Rating: CONTROLLED (2)


In [None]:
# Execute a batch of classifying the inputs using the template
risk_results_df = classify_risk_rating(
      input_df = perm_list_df
    , prompt = PROMPT_USER_PERM_RISK_RATING
    , chat_session = risk_chat_session
    , total_records = None
    , checkin_interval = 120
    , checkpoint_interval = 20
    , debug = True
    , verbose = False
  )
risk_results_df

Starting job to process 2 records.
####################

Analyzing Permission 1 of 2...
Name:        Access Data Cloud Data Explorer
API Name:    AccessCdpDataExplorer
Description: Allows user access Data Cloud Data Explorer.
--------------------
Risk Rating: RiskRating.CONTROLLED
####################

Analyzing Permission 2 of 2...
Name:        Administer territory operations
API Name:    ManageTerritories
Description: Prerequisite user permission for a user to manage a territory branch.
--------------------
Risk Rating: RiskRating.SENSITIVE
####################


####################
Total time taken: 4.93 seconds to process 2 records.
Average time per record: 2.46 seconds

Sample Output of Results:
                   Permission Name               API Name  \
0  Access Data Cloud Data Explorer  AccessCdpDataExplorer   
1  Administer territory operations      ManageTerritories   

                                         Description            Risk Rating  \
0       Allows user access

Unnamed: 0,Permission Name,API Name,Description,Risk Rating,Evaluation,Processing Time
0,Access Data Cloud Data Explorer,AccessCdpDataExplorer,Allows user access Data Cloud Data Explorer.,RiskRating.CONTROLLED,"```json\n{\n ""risk_tier"": ""Controlled"",\n ""r...",2.423239
1,Administer territory operations,ManageTerritories,Prerequisite user permission for a user to man...,RiskRating.SENSITIVE,"```json\n{\n ""risk_tier"": ""Sensitive"",\n ""ri...",2.500572


In [None]:
# Preview a record from the raw output
risk_results_df['Evaluation'][1]

'```json\n{\n  "risk_tier": "Sensitive",\n  "risk_rating": "3",\n  "weighted_score": 2.6,\n  "scores": {\n    "Data_Sensitivity": 3,\n    "Scope_of_Impact": 3,\n    "Configurational_Authority": 3,\n    "External_Data_Exposure": 1,\n    "Regulatory_Obligation": 2,\n    "Segregation_of_Duties": 2,\n    "Auditability": 3,\n    "Reversibility": 2\n  },\n  "rationale": "The ability to administer territory operations involves managing sales territories, potentially impacting sales data and team assignments. Data sensitivity is a concern (score of 3) because territories often contain customer and revenue information. The scope of impact is moderate (score of 3) due to the potential to affect multiple users and opportunities. As such, this permission falls into the \'Sensitive\' category, requiring careful oversight and monitoring.",\n  "confidence": "High"\n}\n```'

In [None]:
# Convert the JSON text output to columns
risk_rating_df = extract_json_fields(
    risk_results_df
  , json_column='Evaluation'
  , debug = True
)


First 5 rows of processed data:


Unnamed: 0,Permission Name,API Name,Description,Risk Rating,Evaluation,Processing Time,Risk Tier,Weighted Score,Scores,Rationale,Confidence
0,Access Data Cloud Data Explorer,AccessCdpDataExplorer,Allows user access Data Cloud Data Explorer.,2,"{ ""risk_tier"": ""Controlled"", ""risk_rating"": ...",2.423239,Controlled,2.1,"{'Data_Sensitivity': 3, 'Scope_of_Impact': 2, ...",Access to Data Cloud Data Explorer allows view...,High
1,Administer territory operations,ManageTerritories,Prerequisite user permission for a user to man...,3,"{ ""risk_tier"": ""Sensitive"", ""risk_rating"": ""...",2.500572,Sensitive,2.6,"{'Data_Sensitivity': 3, 'Scope_of_Impact': 3, ...",The ability to administer territory operations...,High



Columns added: ['Risk Tier', 'Risk Rating', 'Weighted Score', 'Scores', 'Rationale', 'Confidence']


In [None]:
# Preview a record from the parsed output table
risk_rating_df.iloc[0].to_dict()

{'Permission Name': 'Access Data Cloud Data Explorer',
 'API Name': 'AccessCdpDataExplorer',
 'Description': 'Allows user access Data Cloud Data Explorer.',
 'Risk Rating': '2',
 'Evaluation': '{  "risk_tier": "Controlled",  "risk_rating": "2",  "weighted_score": 2.1,  "scores": {    "Data_Sensitivity": 3,    "Scope_of_Impact": 2,    "Configurational_Authority": 1,    "External_Data_Exposure": 2,    "Regulatory_Obligation": 2,    "Segregation_of_Duties": 2,    "Auditability": 2,    "Reversibility": 1  },  "rationale": "Access to Data Cloud Data Explorer allows viewing data that might be sensitive, leading to a Data_Sensitivity score of 3. While primarily for exploration, the potential for extracting or misusing the exposed data exists (External_Data_Exposure = 2). Given these factors, a \'Controlled\' risk tier is appropriate, requiring monitoring to prevent unauthorized data handling.",  "confidence": "High"}',
 'Processing Time': 2.42323899269104,
 'Risk Tier': 'Controlled',
 'Weight

In [None]:
# Save the results DataFrame
save_data(
    data=risk_rating_df,
    filename='results',
    data_type='risk_rating_output_'+str(datetime.now().strftime("%Y%m%d_%H%M%S")),  # This will save to data/output/
    format='csv',
    index=False
)



'data/output/results.csv'

# Permission Category Classification

In [None]:
# Load the promt from the source file
with open('/content/SFDC-User-Permissions-AI/src/prompts/templates/prompt_user_perm_category.md', 'r') as f:
    PROMPT_USER_PERM_CATEGORY = f.read()
# Display the prompt
print(PROMPT_USER_PERM_CATEGORY)

In [None]:
# Create a new chat session
category_chat_session = create_chat_session(client = client, model_name='gemini-2.0-flash')

# Evaluate a single permission
cat_text_eval, structured_cat_rating, structured_cat_label = category_eval_summary(
    prompt=PROMPT_USER_PERM_CATEGORY,
    name=perm_list_df['Permission Name'][0],
    api_name=perm_list_df['API Name'][0],
    description=perm_list_df['Description'][0],
    model_name='gemini-2.0-flash',
    client=client,
    chat_session=category_chat_session  # Reuse the same session
)

# Display the result
print(cat_text_eval)
print(f"Category Rating: {structured_cat_rating.name} ({structured_cat_rating.value})")
print(f"Category Label: {structured_cat_label.name} ({structured_cat_label.value})")

In [None]:
# Execute a batch of classifying the inputs using the template
category_results_df = classify_category(
      input_df = perm_list_df
    , prompt = PROMPT_USER_PERM_CATEGORY
    , chat_session = category_chat_session
    , total_records = 5
    , checkin_interval = 60
    , debug = True
  )
category_results_df

In [None]:
# Preview a record from the raw output
category_results_df['Evaluation'][1]

In [None]:
# Convert the JSON text output to columns
category_df = extract_json_fields(
    category_results_df
  , json_column='Evaluation'
  , fields = {
      'permission_category_label': 'Permission Category Label',
      'permission_category_order': 'Permission Category Order',
      'match_rating_tier': 'Match Rating Tier',
      'match_rating_score': 'Match Rating Score',
      'weighted_match_score': 'Weighted Match Score',
      'scores': 'Scores',
      'rationale': 'Rationale',
      'confidence': 'Confidence'
  }
  , debug = True
)

In [None]:
# Preview a record from the parsed output table
category_df.iloc[0].to_dict()

In [None]:
# Save the results DataFrame
save_data(
    data=category_df,
    filename='category_results'+str(datetime.now().strftime("%Y%m%d_%H%M%S")),
    data_type='output',  # This will save to data/output/
    format='csv',
    index=False
)