In [None]:
# file wrapper-concept.ipynb
# author Jim Smith Jan 2022
# notebook to explore and illustrate the basic concept of a wrapper class

## Some basic examples to explore what the wrapper class could look like and how it could be implemented

### Let's start by making some data with one disclosive case
We'll do this by adding an example to the iris data and giving it a new class to make things really obvious

In [None]:
import numpy as np
from sklearn import datasets 

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Print the min and max values of each feature to help hand-craft the disclosive point.
for feature in range(X.shape[1]):
    print(f'feature {feature} min {np.min(X[:,feature])}, max {np.max(X[:,feature])}')

# Now add a single disclosive point with features [7, 2, 4.5, 1] and label 3.
# The stock iris targets are 0, 1 and 2, so label 3 is a new class that
# contains exactly this one sample.
X = np.vstack([X, (7, 2.0, 4.5, 1)])
y = np.append(y, 3)

### and import some basic libraries to show our point

In [None]:
from sklearn.tree import  plot_tree
import matplotlib.pyplot as plt

## Here's the raw version
-  note I am setting random_state=1 to make it deterministic, just so you get the same results as me
 - the general point is not that someone always will, but that they could
 - in practice I ran 10 times not setting random state and got the same tree 4/5 times

In [None]:
# example code with no safety

from sklearn.tree import DecisionTreeClassifier

# Unconstrained tree: a leaf may hold a single sample; random_state pins the result.
rawDT = DecisionTreeClassifier(
    criterion="gini",
    min_samples_leaf=1,
    random_state=1,
)
rawDT.fit(X, y)

print(f'Training set accuracy in this naive case is {rawDT.score(X,y)}')

# Draw the fitted tree on a single large axes so the node text stays readable.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 10))
output = plot_tree(rawDT, ax=ax, filled=True, fontsize=11)




### As we can see we have several disclosive nodes, one of which is our added point
The exact values cannot be inferred, but if we (reasonably) assume all features are non-negative we can get upper and lower bounds for the attribute values on that node:  
> (6.95,inf),  \[0,2.6\],  (0,4.95\],  (0.8, 1.65\]

so this is disclosive to a certain degree.

- In this case I spent 5 minutes manually tuning the values of the added point so that the tree included at least one decision node for each feature

- We can take it as read that it would be fairly trivial to write something like a GA to automatically tune the feature values of the added point, minimising the difference between the upper and lower bounds for each feature.

- But that is not really the point of this exercise, which was to show that allowing the user to set inappropriate values for a single parameter could produce a disclosive tree. 


### Diligent user realises problem, and changes their code to enforce at least n samples in each leaf
We'll use n=5 

In [None]:
# Same tree as before, but every leaf must now contain at least 5 training samples.
manualDT = DecisionTreeClassifier(min_samples_leaf=5, criterion="gini", random_state=1)
manualDT.fit(X, y)

# Report the threshold actually used; this is no longer the "naive" configuration.
print(f'Training set accuracy with min_samples_leaf={manualDT.min_samples_leaf} is {manualDT.score(X,y)}')

fig2, ax2 = plt.subplots(1, 1, figsize=(15, 10))
output = plot_tree(manualDT, filled=True, ax=ax2, fontsize=11)



### output is now non-disclosive (at least according to the threshold rule)
- easily see we don't get a node for the new class 3

## So let's define a new class SafeDecisionTree 
### after starting by removing the import of DecisionTreeClassifier

In [None]:
# Remove the unwrapped classifier from the notebook namespace, then confirm
# that it can no longer be instantiated by name.
del DecisionTreeClassifier
try:
    shouldFail = DecisionTreeClassifier()
except NameError:
    # Only the "name is gone" failure counts as success; any other error
    # should surface instead of being reported as a correct failure.
    print('call failed correctly')

In [None]:
# Import the module object first so that it can be reloaded: reload re-executes
# the module, so the names imported below come from the freshly executed code.
import importlib, SafeModel
importlib.reload(SafeModel)




# NOTE: this rebinds the name `SafeModel` from the module to the class of the
# same name; the `import` line above binds it back to the module on a re-run.
from SafeModel import SafeModel, SafeDecisionTree

 

In [None]:

# Instantiate the bare superclass and try to print it.
noNameModel = SafeModel()

try:
    print(noNameModel.__str__())
except Exception:
    # Catch ordinary errors only, so KeyboardInterrupt / SystemExit still
    # propagate instead of being swallowed by a bare except.
    print("super class has no attributes to print")

In [None]:
# Build the wrapped tree without passing any arguments
# (an alternative to try: criterion="entropy").
safeDTModel = SafeDecisionTree()
print(safeDTModel.__str__())

In [None]:
# Fit through the wrapper; plot_tree is given the wrapped estimator held in `.model`.
safeDTModel.fit(X, y)

# This is the wrapped model, not the naive one, so label the score accordingly.
print(f'Training set accuracy with the safe wrapper is {safeDTModel.score(X,y)}')

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
output = plot_tree(safeDTModel.model, filled=True, ax=ax, fontsize=11)





## Now demonstrate the save and reporting functionality

In [None]:
# Save the fitted wrapper under the given name, then run its preliminary check.
safeDTModel.saveModel(name="testSave.pkl")
safeDTModel.preliminaryCheck()
# Release request left disabled here. NOTE(review): later cells call
# requestRelease(filename=...), so a filename is presumably needed - confirm.
#safeDTModel.requestRelease()

## Now let's try to attack this approach
### starting with listing the params then trying to set the params manually after init

In [None]:
# Attributes stored on the wrapper itself
print(vars(safeDTModel))
# The wrapped estimator, followed by its own attributes
print(safeDTModel.model)
print(vars(safeDTModel.model))

In [None]:
# Attack: reach through the wrapper and overwrite the setting on the wrapped estimator.
safeDTModel.model.min_samples_leaf = 1

# Refit with the overwritten setting in place
safeDTModel.fit(X, y)

print(f'Training set accuracy in this naive case is {safeDTModel.score(X,y)}')

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 10))
output = plot_tree(safeDTModel.model, ax=ax, filled=True, fontsize=11)



### This has let the user reset the params so that the model is now disclosive once again.

## Question: what do we do here:
1. Find a way of obfuscating the params so that they cannot be changed from outside the wrapper class
  - hard, not very Pythonic
  - also what if the user wants to increase min_samples_leaf, which would make the model less disclosive than the default values?
2. Put code into the makeReport() function that checks the param values and says if they have been changed.
 - this suggests a route I have thought about, but not put into code yet, which stores the "safe" param values in a read-only file  
   and reads that into a dict in the `__init__()` and `makeReport()` functions. 
    - The dict key is the name of the parameter; the value is a tuple of (operator, value)  
      where operator is one of ["min" |"equals" | "max"] and value is the threshold that the operator is applied to
    - the dict is read afresh in `__init__()` and in `makeReport()` to prevent users amending values 
 - that would have the benefit of allowing users to increase the min_samples_leaf but report when it was taken below our threshold
 
 - issue is how to deal with situations where the safety is built from a non-linear interaction between param values
   - that is a problem to deal with in the second stage of the project once we have proved the concept
 

In [None]:
def run_release_scenario(banner, filename, features, labels, min_samples_leaf=None):
    """Run one researcher scenario end to end and return the wrapped model.

    Creates a SafeDecisionTree, optionally overwrites ``min_samples_leaf`` on
    the wrapped estimator after construction, then fits, saves, runs the
    preliminary check and requests release, in that order.

    Parameters
    ----------
    banner : str
        Line printed before the scenario starts.
    filename : str
        File name passed to both ``saveModel`` and ``requestRelease``.
    features, labels : array-like
        Training data passed to ``fit``.
    min_samples_leaf : int or None
        Value written to ``model.model.min_samples_leaf``. ``None`` leaves
        whatever the wrapper set at construction untouched.

    Returns
    -------
    SafeDecisionTree
        The wrapper instance after all steps have run.
    """
    print(banner)
    scenario_model = SafeDecisionTree()
    if min_samples_leaf is not None:
        # Deliberately set on the wrapped estimator after __init__, which is
        # exactly the kind of post-construction change the checks must detect.
        scenario_model.model.min_samples_leaf = min_samples_leaf
    scenario_model.fit(features, labels)
    scenario_model.saveModel(name=filename)
    scenario_model.preliminaryCheck()
    scenario_model.requestRelease(filename=filename)
    return scenario_model


# create and fit using recommended params
safeDTModel2 = run_release_scenario(
    "***researcher doesn't change recomended params", "safe2.pkl", X, y
)

# change model params to recommended values
safeDTModel3 = run_release_scenario(
    "\n***researcher changes params safely", "safe3.pkl", X, y, min_samples_leaf=5
)

# change model params in a safe way
safeDTModel4 = run_release_scenario(
    "\n***researcher changes params safely", "safe4.pkl", X, y, min_samples_leaf=10
)

# change model params in an unsafe way
# (the original cell skipped fit() for this scenario only, so the saved
# "unsafe" model was never trained; it is now fitted like the others)
safeDTModel5 = run_release_scenario(
    "\n***researcher changes params unsafely", "unsafe.pkl", X, y, min_samples_leaf=1
)



In [None]:
# List the working directory, then print the contents of the check file.
# NOTE(review): the file is presumably produced by the check/release calls above,
# and the "j4-smith" prefix looks user-specific - confirm the name on other machines.
!ls; echo "contetns of checkfile are"; cat j4-smith_checkfile.txt
