## Imports

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except those excluded with %aimport) before each cell is executed, so edits
# to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# With no arguments, %aimport lists the modules autoreload will reload or skip.
%aimport

In [None]:
import json
import pandas as pd
import numpy as np
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from dotenv import find_dotenv
from pathlib import Path
from datetime import datetime, timedelta
from faker import Faker
import sys

# Locate the project root (the directory containing the .env file found by
# find_dotenv) and put it on sys.path so project modules such as `constants`
# can be imported from this notebook.
project_home = Path(find_dotenv()).parent

# sys.path entries must be strings, and re-running this cell should not keep
# appending the same entry.
if str(project_home) not in sys.path:
    sys.path.append(str(project_home))

# Show the resulting search path. The previous code printed the return value of
# sys.path.append(...), which is always None, and appended a second entry.
print(sys.path)
from constants import *


## Data Pipeline

### Snowpark Session

In [4]:
# Read the Snowflake connection parameters from creds.json in the project root
# and open a Snowpark session with them.
creds_path = project_home / 'creds.json'
with creds_path.open('r') as creds_file:
    conn_param = json.load(creds_file)

session = Session.builder.configs(conn_param).create()


### Database, Schema & resource creation

In [None]:
# COMMENT clause appended to every object this notebook creates.
comment_tag = """comment='{"origin":"sf_sit","name":"user_license_rationalization","version":{"major":1, "minor":0},"attributes":{"component":"streamlit"}}'"""

# NOTE: "create or replace" recreates the database, discarding an existing one
# with the same name.
session.sql(
    f"create or replace database {LICENSING_DB} {comment_tag}"
).collect()

# Schema that holds every table, stage, function and procedure created below.
session.sql(
    f"create schema if not exists {LICENSING_DB}.{LICENSING_SCHEMA} {comment_tag}"
).collect()

# One stage each for data, dependencies, models and registered objects.
session.sql(
    f"create stage if not exists {LICENSING_DB}.{LICENSING_SCHEMA}.{DATA_STAGE} {comment_tag}"
).collect()
session.sql(
    f"create stage if not exists {LICENSING_DB}.{LICENSING_SCHEMA}.{DEPS_STAGE} {comment_tag}"
).collect()
session.sql(
    f"create stage if not exists {LICENSING_DB}.{LICENSING_SCHEMA}.{MODELS_STAGE} {comment_tag}"
).collect()
session.sql(
    f"create stage if not exists {LICENSING_DB}.{LICENSING_SCHEMA}.{OBJECT_STAGE} {comment_tag}"
).collect()

In [None]:


# Make the licensing schema the session default, then echo the active database
# as a quick sanity check.
session.sql(f'use schema {LICENSING_DB}.{LICENSING_SCHEMA}').collect()
res = session.sql('select current_database()').collect()
print(res)


### Generate Sample Data

In [7]:
import random

# Fixed reference date, so the generated data does not depend on the day the
# notebook is run.
today = pd.Timestamp('2030-12-31').date()


def generate_app_logs(n_users, n_days_ago, random_state=0, faker_state=0):
    """Generate synthetic employee metadata and application login events.

    Parameters
    ----------
    n_users : int
        Number of users to generate.
    n_days_ago : int
        Length, in days, of the history window that ends at the module-level
        ``today``.
    random_state : int
        Seed for numpy's global generator and for the private generator used
        to pick application ids.
    faker_state : int
        Seed passed to ``Faker.seed``.

    Returns
    -------
    (df_app, df_employee) : tuple of pandas.DataFrame
        ``df_app`` has one row per login event with columns
        ``APP_ID, SESSION_USER, SNAPSHOT_DATETIME``.
        ``df_employee`` has one row per user with columns
        ``APP_ID, SESSION_USER, DEPARTMENT, DIVISION, TITLE, LAST_DAY_OF_WORK``
        (``LAST_DAY_OF_WORK`` is stored as a string). The employee ``APP_ID``
        is the app of the user's last generated login, or a randomly chosen
        app when the user has no logins.
    """
    np.random.seed(random_state)
    # random.choice on the module-level generator is not affected by
    # np.random.seed; use a private seeded generator so APP_ID is reproducible.
    app_rng = random.Random(random_state)
    fake = Faker()
    Faker.seed(faker_state)

    employee_rows = []
    login_rows = []

    # range(n_users) so that exactly n_users users are produced
    # (range(1, n_users) yielded one fewer).
    for _ in range(n_users):
        user_email = fake.ascii_company_email()
        department = fake.company()
        division = fake.job()
        title = fake.job()

        # Roughly 5% of the users have already left the company.
        if np.random.uniform(0, 100) < 5:
            last_day_work_days = np.random.uniform(0, n_days_ago)
            # Use this user's own offset; the former code read `day`, which is
            # undefined (or left over from another user) at this point.
            field_last_work_day = today - timedelta(days=last_day_work_days)
        else:
            last_day_work_days = 0
            # Still employed: placeholder date one year in the future.
            field_last_work_day = today + timedelta(days=365)

        # Anywhere between never logging in and logging in every day.
        n_logins = int(np.random.uniform(0, n_days_ago - last_day_work_days))

        # Default app for users without any login, so the employee row never
        # reuses another user's app id or hits an undefined name.
        app_id = app_rng.choice([1, 2])

        for _login in range(n_logins):
            day = np.random.uniform(0, n_days_ago)  # random day offset

            # About 25% of the events are pushed 50 days further back, which
            # creates users with no recent logins.
            if np.random.uniform(0, 100) < 25:
                login_day = today - timedelta(days=day + last_day_work_days + 50)
            else:
                login_day = today - timedelta(days=day + last_day_work_days)

            app_id = app_rng.choice([1, 2])
            login_rows.append([app_id, user_email, login_day])

        employee_rows.append(
            [app_id, user_email, department, division, title, str(field_last_work_day)]
        )

    df_employee = pd.DataFrame(
        employee_rows,
        columns=['APP_ID', 'SESSION_USER', 'DEPARTMENT', 'DIVISION', 'TITLE', 'LAST_DAY_OF_WORK'],
    )
    df_app = pd.DataFrame(login_rows, columns=['APP_ID', 'SESSION_USER', 'SNAPSHOT_DATETIME'])

    return df_app, df_employee

## write the sample employee_metadata and okta_logs directly to the tables in the DB

In [None]:
# Generate the synthetic data, load the employee metadata and the login events
# into their tables, and tag each table once it has been written.
df_app_1, df_employee = generate_app_logs(
    n_users=5000, n_days_ago=3650, random_state=6, faker_state=4
)

session.write_pandas(
    df=df_employee,
    table_name=TBL_EMPLOYEE_METADATA,
    database=LICENSING_DB,
    schema=LICENSING_SCHEMA,
    chunk_size=100000,
    compression='snappy',
    parallel=4,
    auto_create_table=True,
    overwrite=True,
)
session.sql(
    f"ALTER TABLE {LICENSING_DB}.{LICENSING_SCHEMA}.{TBL_EMPLOYEE_METADATA} SET {comment_tag}"
).collect()

session.write_pandas(
    df=df_app_1,
    table_name=TBL_OKTA_USERS,
    database=LICENSING_DB,
    schema=LICENSING_SCHEMA,
    chunk_size=100000,
    compression='snappy',
    parallel=4,
    auto_create_table=True,
    overwrite=True,
)
session.sql(
    f"ALTER TABLE {LICENSING_DB}.{LICENSING_SCHEMA}.{TBL_OKTA_USERS} SET {comment_tag}"
).collect()

## write the sample app_logs data directly to the table in the DB

In [None]:
# Generate a second set of login events with a different numpy seed for the
# app-logs table. The employee frame returned by this call is not needed.
df_app_2, _unused_employee = generate_app_logs(
    n_users=5000, n_days_ago=3650, random_state=19, faker_state=4
)

# Write df_app_2: the earlier code wrote df_app_1 here, so the app-logs table
# was a copy of the Okta table and the frame generated above was never used.
session.write_pandas(
    df=df_app_2,
    table_name=TBL_APP_LOGS,
    database=LICENSING_DB,
    schema=LICENSING_SCHEMA,
    chunk_size=100000,
    compression='snappy',
    parallel=4,
    auto_create_table=True,
    overwrite=True,
)
session.sql(f"ALTER TABLE {LICENSING_DB}.{LICENSING_SCHEMA}.{TBL_APP_LOGS} SET {comment_tag}").collect()

In [10]:

def generate_working_days(n_days_ago=365, random_state=None):
    """Build a synthetic calendar flagging each day as working or non-working.

    Parameters
    ----------
    n_days_ago : int
        Number of consecutive days to generate, counting backwards from
        2030-12-31 (inclusive).
    random_state : int, optional
        When given, numpy's global generator is seeded with it first so the
        calendar is reproducible. When ``None`` (default) the current global
        numpy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``SNAPSHOT_DATETIME`` (``datetime.date``) and ``WORK_DAY``
        (bool), newest day first.
    """
    if random_state is not None:
        np.random.seed(random_state)

    today = pd.Timestamp('2030-12-31').date()

    calendar_rows = []
    for offset in range(n_days_ago):
        calendar_day = today - timedelta(days=offset)
        # A draw below 20 (about 20% of the days) marks a holiday.
        is_working_day = bool(np.random.uniform(0, 100) >= 20)
        calendar_rows.append([calendar_day, is_working_day])

    return pd.DataFrame(calendar_rows, columns=['SNAPSHOT_DATETIME', 'WORK_DAY'])

## write the sample work_days data directly to the table in the DB 

In [None]:
# Build a ten-year (3650-day) calendar, load it into the work-days table and
# tag the table.
df_cal = generate_working_days(n_days_ago=3650)
session.write_pandas(
    df=df_cal,
    table_name=TBL_WORK_DAYS,
    database=LICENSING_DB,
    schema=LICENSING_SCHEMA,
    chunk_size=100000,
    compression='snappy',
    parallel=4,
    auto_create_table=True,
    overwrite=True,
)
session.sql(
    f"ALTER TABLE {LICENSING_DB}.{LICENSING_SCHEMA}.{TBL_WORK_DAYS} SET {comment_tag}"
).collect()


## Populate the MONITORED_APPS and WHITELISTED_USERS tables

In [None]:
# Monitored applications: display name plus the numeric id used in the
# generated log data.
apps_df = pd.DataFrame(
    [['APP-1', 1], ['APP-2', 2]],
    columns=['APP_NAME', 'APP_ID'],
)
session.write_pandas(
    df=apps_df,
    table_name=TBL_MONITORED_APPS,
    database=LICENSING_DB,
    schema=LICENSING_SCHEMA,
    chunk_size=100000,
    compression='snappy',
    parallel=4,
    auto_create_table=True,
    overwrite=True,
)
session.sql(
    f"ALTER TABLE {LICENSING_DB}.{LICENSING_SCHEMA}.{TBL_MONITORED_APPS} SET {comment_tag}"
).collect()

# Whitelisted users: a single sample e-mail address.
whitelist_df = pd.DataFrame(['abc.vip@organization.com'], columns=['EMAIL'])
session.write_pandas(
    df=whitelist_df,
    table_name=TBL_WHITELISTED_USERS,
    database=LICENSING_DB,
    schema=LICENSING_SCHEMA,
    chunk_size=100000,
    compression='snappy',
    parallel=4,
    auto_create_table=True,
    overwrite=True,
)
session.sql(
    f"ALTER TABLE {LICENSING_DB}.{LICENSING_SCHEMA}.{TBL_WHITELISTED_USERS} SET {comment_tag}"
).collect()

### upload solution artifacts to the DEPENDENCY stage

In [None]:

# Upload the two local Python modules, uncompressed, to the dependency stage.
# The file paths are relative to the kernel's working directory.
deps_stage_ref = f"@{LICENSING_DB}.{LICENSING_SCHEMA}.{DEPS_STAGE}"
session.sql(
    f"PUT file://train.py {deps_stage_ref} overwrite=True auto_compress=False"
).collect()
session.sql(
    f"PUT file://constants.py {deps_stage_ref} overwrite=True auto_compress=False"
).collect()

### Local utility functions

In [None]:
# Register `contains_anyof` from the local train.py as a permanent UDF, with
# constants.py from the dependency stage as an import, then tag the function.
udf_fqn = f"{LICENSING_DB}.{LICENSING_SCHEMA}.udf_contains_anyof"

session.udf.register_from_file(
    file_path="train.py",
    func_name="contains_anyof",
    name=udf_fqn,
    is_permanent=True,
    packages=["snowflake-snowpark-python"],
    imports=[f"@{LICENSING_DB}.{LICENSING_SCHEMA}.{DEPS_STAGE}/constants.py"],
    stage_location=f"{LICENSING_DB}.{LICENSING_SCHEMA}.{OBJECT_STAGE}",
    replace=True,
)
session.sql(f"ALTER FUNCTION {udf_fqn}(STRING,ARRAY) SET {comment_tag}").collect()

### Model Training

In [None]:
# Register `run_model_today` from the staged train.py as a permanent stored
# procedure, then tag it. The argument types below must stay in step with the
# signature used in the ALTER PROCEDURE statement.
sproc_fqn = f"{LICENSING_DB}.{LICENSING_SCHEMA}.run_model_today"
sproc_input_types = [
    T.IntegerType(),
    T.IntegerType(),
    T.FloatType(),
    T.BooleanType(),
    T.BooleanType(),
    T.BooleanType(),
    T.BooleanType(),
]

session.sproc.register_from_file(
    file_path=f"@{LICENSING_DB}.{LICENSING_SCHEMA}.{DEPS_STAGE}/train.py",
    func_name="run_model_today",
    name=sproc_fqn,
    input_types=sproc_input_types,
    return_type=T.VariantType(),
    is_permanent=True,
    replace=True,
    stage_location=f"@{LICENSING_DB}.{LICENSING_SCHEMA}.{OBJECT_STAGE}",
    packages=['snowflake-snowpark-python', 'pandas', 'scikit-learn==1.2.1', 'joblib==1.1.1', 'numpy'],
    imports=[f"@{LICENSING_DB}.{LICENSING_SCHEMA}.{DEPS_STAGE}/constants.py"],
)
session.sql(
    f"ALTER PROCEDURE {sproc_fqn}(INT, INT, FLOAT, BOOLEAN, BOOLEAN, BOOLEAN, BOOLEAN) SET {comment_tag}"
).collect()

## License usage probability prediction with revocation decision
Local trigger

In [None]:
# %%time
from train import run_model_today
results = session.call('run_model_today', 1,120,0.5,False,False,False,False)
display(results)

## Run the section below if you want to run the application as Streamlit in Snowflake (SiS)

In [None]:
# Deploy the Streamlit app: create its stage, upload the app files, then create
# the STREAMLIT object. All local paths are relative to the kernel's working
# directory.
import os
import shutil

# Create stage for Streamlit app files
session.sql(f"create stage if not exists {LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE {comment_tag}").collect()

# Upload Streamlit app files to the stage (excluding system files/directories)
print("📁 Uploading app files to stage...")

# Main app file and environment
session.sql(f"PUT file://../app.py @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE overwrite=True auto_compress=False").collect()

# Copy sis_environment.yml to environment.yml temporarily for upload. The
# try/finally guarantees the temporary copy is removed even if the PUT fails.
shutil.copy('sis_environment.yml', 'environment.yml')
try:
    session.sql(f"PUT file://environment.yml @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE overwrite=True auto_compress=False").collect()
finally:
    os.remove('environment.yml')
print("  ✅ app.py and environment.yml uploaded")

# appPages Python files only (exclude __pycache__, .DS_Store, etc.)
session.sql(f"PUT file://../appPages/*.py @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE/appPages overwrite=True auto_compress=False").collect()
session.sql(f"PUT file://../appPages/*.md @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE/appPages overwrite=True auto_compress=False").collect()
print("  ✅ appPages files uploaded")

# appUtil Python files only (exclude notebooks, __pycache__, etc.)
session.sql(f"PUT file://constants.py @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE/appUtil overwrite=True auto_compress=False").collect()
session.sql(f"PUT file://train.py @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE/appUtil overwrite=True auto_compress=False").collect()
print("  ✅ appUtil files uploaded")

# Image files (PNG only)
session.sql(f"PUT file://../img/*.png @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE/img overwrite=True auto_compress=False").collect()
print("  ✅ img files uploaded")

# Streamlit config file (optional). Check for the file explicitly instead of
# using a bare `except:`, so a missing file and a failed upload are reported
# differently and KeyboardInterrupt/SystemExit are not swallowed.
if os.path.isfile('../.streamlit/config.toml'):
    try:
        session.sql(f"PUT file://../.streamlit/config.toml @{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE/.streamlit overwrite=True auto_compress=False").collect()
        print("  ✅ .streamlit config uploaded")
    except Exception as exc:
        # Still best-effort (the config is optional), but surface the cause.
        print(f"  ⚠️  .streamlit config upload failed (optional): {exc}")
else:
    print("  ⚠️  No .streamlit config found (optional)")

# Create Streamlit app using session.sql()
# NOTE(review): the query warehouse name 'APP_WH' is hard-coded below; confirm
# it exists in the target account.
streamlit_sql = f"""
CREATE OR REPLACE STREAMLIT {LICENSING_DB}.{LICENSING_SCHEMA}.LICENSING_APP
ROOT_LOCATION = '@{LICENSING_DB}.{LICENSING_SCHEMA}.STREAMLIT_STAGE'
MAIN_FILE = '/app.py'
QUERY_WAREHOUSE = 'APP_WH'
{comment_tag}
"""

session.sql(streamlit_sql).collect()

print("🎉 Streamlit app 'LICENSING_APP' created successfully!")