## ACTIVATING SESSION 

In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import snowflake.snowpark as snowpark

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Session handle used by the later cells to read tables and to write the results back.
session = get_active_session()


##  LOADING PII TABLE

In [None]:


# Sample of PII_DATA: only the DOCUMENT and TEXT columns, capped at 10 rows.
# NOTE(review): the limit has no explicit ordering, so presumably the rows that
# are picked are not guaranteed to be the same on every evaluation — confirm.
df = (
    session.table("PII_DATA")
    .select("DOCUMENT", "TEXT")
    .limit(10)
)

# Preview the sampled rows
df.show()


In [None]:
# Convert the Snowpark DataFrame to a pandas DataFrame for easier manipulation.
# NOTE(review): `data` is not referenced by any later cell visible in this
# notebook (the masking cell calls df.to_pandas() itself) — confirm it is still needed.
data = df.to_pandas()


## MASKING USING PROTECTO AUTO DETECT

In [None]:
from snowflake.snowpark.functions import col, call_udf, array_agg, flatten
from snowflake.cortex import Complete, ExtractAnswer, Sentiment, Summarize, Translate

# Mask the sampled TEXT values with the PROTECTO_MASK UDF and pair every original
# text with its masked version.
# Requires the Snowflake session `session` and the Snowpark DataFrame `df`
# defined in the cells above.

# Materialise the sample exactly once. `df` ends in an unordered limit(), so
# evaluating it once for the UDF input and a second time for the "original"
# column could pair a text with the mask of a different row. NULL texts are
# dropped so the array sent to the UDF lines up one-to-one with these rows.
text_pdf = df.to_pandas()[["TEXT"]].dropna(subset=["TEXT"]).reset_index(drop=True)

# Send the texts as one single-row ARRAY column, in the same order as text_pdf
aggregated_df = session.create_dataframe(
    [[text_pdf["TEXT"].tolist()]], schema=["text_array"]
)

# Call the PROTECTO_MASK UDF
protecto_mask = call_udf("PROTECTO.VAULT.PROTECTO_MASK", col("text_array"))

# Flatten the UDF result; keep INDEX (position inside the array) next to VALUE
auto_df = aggregated_df.select(flatten(protecto_mask)).select(col("INDEX"), col("VALUE"))

# Restore array order on the client so that masked row i belongs to text row i.
# Assumes PROTECTO_MASK returns one element per input element, in input order
# (the positional pairing below has always relied on that) — TODO confirm.
masked_values = auto_df.to_pandas().sort_values("INDEX").reset_index(drop=True)[["VALUE"]]

# Combine the original texts with the masked data (first 5 rows only)
masked_df = pd.concat([text_pdf, masked_values], axis=1).head(5)
masked_df = masked_df.rename(columns={'VALUE': 'MASKED_TEXT'})
masked_df.head()

## SUMMARIZING WITH MASKED DATA

In [None]:
def create_prompt(text):
    """Build the two-message chat prompt used for summarization.

    Args:
        text: The text to summarize; it is passed through unchanged as the
            content of the user message.

    Returns:
        A list with two dicts, each holding 'role' and 'content': first the
        fixed system instruction, then the user message carrying `text`.
    """
    system_message = dict(
        role='system',
        content='You are a helpful AI assistant. You will be summarising the given text and while summarizing make sure you maintain the structure when it mentioned as <PER> and other relevant ones.Dont skip PII info include that in the summary.',
    )
    user_message = dict(role='user', content=text)
    return [system_message, user_message]

def summarize_text(text):
    """Summarize `text` with the "snowflake-arctic" model via Complete.

    Args:
        text: The text to summarize; it is wrapped by create_prompt().

    Returns:
        The model reply. If Complete returns a JSON document of the form
        {"choices": [{"messages": ...}, ...]}, the "messages" value of the
        first choice is returned; otherwise the value returned by Complete is
        passed through unchanged.
    """
    import json

    response = Complete("snowflake-arctic", create_prompt(text))

    # NOTE(review): whether Complete hands back the raw JSON envelope or the
    # already-extracted reply text for a message-list prompt appears to depend
    # on the installed snowflake-ml-python version — confirm. Accept both so a
    # plain-text reply does not fail JSON parsing.
    try:
        payload = json.loads(response)
    except (TypeError, ValueError):
        # Not a JSON string: treat it as the reply itself.
        return response

    if isinstance(payload, dict) and "choices" in payload:
        return payload["choices"][0]["messages"]
    # Valid JSON but not the expected envelope (e.g. a reply that is just a number).
    return response
    

# Summarize every masked text and store the results in a new SUMMARIZED_TEXT column
summaries = masked_df['MASKED_TEXT'].apply(summarize_text)
masked_df['SUMMARIZED_TEXT'] = summaries
masked_df.head()

In [None]:
from IPython.display import display

# Print the masked text of the row labelled 0
first_masked = masked_df.loc[0, 'MASKED_TEXT']
print("**Masked Text:** \n", first_masked)

print("\n\n**Summarized Text:** \n")
# Show the summary of the same row piece by piece (split on '.') to avoid scrolling
first_summary = masked_df.loc[0, 'SUMMARIZED_TEXT']
for sentence in first_summary.split('.'):
    display(sentence)

In [None]:
# Turn the pandas frame (original, masked and summarized text) into a Snowpark DataFrame
df_snowpark = session.create_dataframe(masked_df)

# Persist it as PII_SUMMARIZED_TEXT in "overwrite" mode
table_writer = df_snowpark.write.mode("overwrite")
table_writer.save_as_table("PII_SUMMARIZED_TEXT")

## UNMASKING RESULTS 

In [None]:

# Unmask the MASKED_TEXT column of the persisted table with the PROTECTO_UNMASK UDF.
masked_table = session.table("PII_SUMMARIZED_TEXT")

# Read the table once and build the UDF input from exactly these rows. Pairing a
# separate aggregate query with a separate to_pandas() by position relies on both
# returning rows in the same order, which is not guaranteed. NULL values are
# dropped so the array lines up one-to-one with the rows kept here.
masked_pdf = (
    masked_table.to_pandas()[["MASKED_TEXT"]]
    .dropna(subset=["MASKED_TEXT"])
    .reset_index(drop=True)
)

# Send the MASKED_TEXT values as one single-row ARRAY column
aggregated_df = session.create_dataframe(
    [[masked_pdf["MASKED_TEXT"].tolist()]], schema=["mask_array"]
)

# Call the UNMASK UDF
unmask_udf_result = call_udf("PROTECTO.VAULT.PROTECTO_UNMASK", col("mask_array"))

# Flatten the UDF result; keep INDEX (position inside the array) next to VALUE
auto_df = aggregated_df.select(flatten(unmask_udf_result)).select(col("INDEX"), col("VALUE"))

# Restore array order on the client so that unmasked row i belongs to masked row i.
# Assumes PROTECTO_UNMASK returns one element per input element, in input order — TODO confirm.
unmasked_values = auto_df.to_pandas().sort_values("INDEX").reset_index(drop=True)[["VALUE"]]

unmasked_df = pd.concat([masked_pdf, unmasked_values], axis=1).head(5)
unmasked_df = unmasked_df.rename(columns={'VALUE': 'UNMASKED_TEXT'})
unmasked_df.head()

## UNMASKING AFTER SUMMARIZATION

In [None]:

# Unmask the SUMMARIZED_TEXT column of the persisted table with the PROTECTO_UNMASK UDF.
masked_table = session.table("PII_SUMMARIZED_TEXT")

# Read the table once and build the UDF input from exactly these rows. Pairing a
# separate aggregate query with a separate to_pandas() by position relies on both
# returning rows in the same order, which is not guaranteed. Rows without a
# summary are dropped so the array lines up one-to-one with the rows kept here.
summarized_pdf = (
    masked_table.to_pandas()[["MASKED_TEXT", "SUMMARIZED_TEXT"]]
    .dropna(subset=["SUMMARIZED_TEXT"])
    .reset_index(drop=True)
)

# Send the SUMMARIZED_TEXT values as one single-row ARRAY column
aggregated_df = session.create_dataframe(
    [[summarized_pdf["SUMMARIZED_TEXT"].tolist()]], schema=["summarized_array"]
)

# Call the UNMASK UDF
unmask_udf_result = call_udf("PROTECTO.VAULT.PROTECTO_UNMASK", col("summarized_array"))

# Flatten the UDF result; keep INDEX (position inside the array) next to VALUE
auto_df = aggregated_df.select(flatten(unmask_udf_result)).select(col("INDEX"), col("VALUE"))

# Restore array order on the client so that unmasked row i belongs to summary row i.
# Assumes PROTECTO_UNMASK returns one element per input element, in input order — TODO confirm.
unmasked_values = auto_df.to_pandas().sort_values("INDEX").reset_index(drop=True)[["VALUE"]]

summary = pd.concat([summarized_pdf, unmasked_values], axis=1).head(5)
summary = summary.rename(columns={'VALUE': 'UNMASKED_SUMMARY'})
summary.head()
