# Application of Data Shapley on Knee Surgery Images
### Authors: Virtusa

## Python Libraries Used

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1" #model will be trained on GPU 1
import keras
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
from keras.layers import Input,MaxPooling2D,UpSampling2D, Flatten,Dense, Convolution2D,MaxPool2D
from keras.models import Model,Sequential
from keras.optimizers import RMSprop
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
import time
import pickle
import cv2

Using TensorFlow backend.


## Introduction

#### Aim:
    - To implement Data Shapley on knee_surgery model so as to identify noisy data(images) for our classification model. 
    - Compare performance of with and without Shapley deep learning CNN model. 
    
#### Methodology:
    Read knee images into a list of respective embedding. Pass the embedding list through DShap() to calculate Data Shapley values. Seggregate images based on high/low Data Shapley values to be used in deep learning CNN model.
#### Dataset Used:
    Knee images of before and after a month surgery

In [2]:
apath = '/vlife-data/vlife-ML/Shachi/image_embed/knee_images/new_knee_images/after/'
bpath = '/vlife-data/vlife-ML/Shachi/image_embed/knee_images/new_knee_images/before/'

## Image Preprocesssing

### Read after_month_surgery images into a list
- Images are read as pixel values and resized with 32x32 px values.
- Converted images are stored in a lost
- Images are labelled as 1

In [3]:
labels=[]
images=[]
names=[]
grayimages=[]
for file in sorted(os.listdir(apath)):
    filepath=os.path.join(apath,file)
    img=cv2.imread(filepath)
    img=cv2.resize(img,(32,32))
    img1=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    grayimg=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    images.append(img1)
    grayimages.append(grayimg)
    file=file.split('.jpg')[0]
    names.append(file)
images=(np.asarray(images))
grayimages=(np.asarray(grayimages))
grayimages=grayimages/255

In [4]:
grayimages.shape

(134, 32, 32)

In [5]:
labels=np.ones(134,dtype='int')

### Read before_month_surgery images into a list
- Images are read as pixel values and resized with 32x32 px values.
- Converted images are stored in a lost
- Images are labelled as 0

In [6]:
image=[]
name=[]
grayimage=[]
for file in sorted(os.listdir(bpath)):
    filepath=os.path.join(bpath,file)
    img=cv2.imread(filepath)
    img=cv2.resize(img,(32,32))
    img1=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)   
    grayimg=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    image.append(img1)
    grayimage.append(grayimg)    
    file=file.split('.jpg')[0]
    name.append(file)
grayimage=(np.asarray(grayimage))
image=(np.asarray(image))
grayimage=grayimage/255

In [7]:
grayimage.shape

(177, 32, 32)

In [8]:
label=np.zeros(177,dtype='int')

### Prepare knee_surgery image dataset to build a model

In [9]:
images= np.concatenate((images,image),axis=0)
grayimages= np.concatenate((grayimages,grayimage),axis=0)
labels= np.concatenate((labels,label),axis=0)
names=name+names

In [10]:
grayimages=np.expand_dims(grayimages,axis=3)

In [11]:
images.shape

(311, 32, 32, 3)

In [12]:
grayimages.shape

(311, 32, 32, 1)

## Generate Image Embeddings

Embedding is a converted low-dim vector from high-dim array which are semantically close to each other. In Neural Networks, can be said as an intermediate output.

Techniques of generating embeddings in Deep Neural Net:
* Auto-Encoder
* Pre-trained Model 

We are going for Pre-trained model technique where following tasks are implemented.
* Train an image classification model 
* Remove model's last layer of sigmoid activation function
* Generate vector of embeddings from the last-layer-removed model by passing image arrays

In [14]:
model=Sequential()
model.add(Convolution2D(32,2,2, input_shape=(32,32,1),activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Convolution2D(64,3,3,activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



  
  after removing the cwd from sys.path.


In [15]:
model.compile(optimizer='adam',metrics=['accuracy'],loss='binary_crossentropy')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 31, 31, 32)        160       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 15, 15, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 13, 13, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 6, 6, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2305      
Total params: 20,961
Trainable params: 20,961
Non-trainable params: 0
__________________________________________________

In [17]:
model.fit(grayimages,labels, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f9c66dc5910>

In [18]:
model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
model.layers.pop()
model1 = Model(inputs=model.inputs, outputs=model.layers[-1].output)
model1.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 32, 32, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 31, 31, 32)        160       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 15, 15, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 13, 13, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 6, 6, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)              0         
Total params: 18,656
Trainable params: 18,656
Non-trainable params: 0
_______________________________________________________

### Generate embeddings

In [19]:
vectors=model1.predict(grayimages)

In [20]:
vectors.shape

(311, 2304)

In [21]:
vectors[0].shape[0]

2304

### Build dataframe of embedded vector of images

In [22]:
col=[]
for i in range(vectors[0].shape[0]):
    col.append('x'+str(i))

In [23]:
df1=pd.DataFrame(vectors,columns=col)

In [24]:
df1.head(2)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x2294,x2295,x2296,x2297,x2298,x2299,x2300,x2301,x2302,x2303
0,0.0,0.583658,0.0,0.278999,0.422945,0.240002,0.429372,0.095949,0.162866,1.06246,...,0.544727,0.679212,0.506755,0.19081,0.109557,0.418965,0.306553,0.0,0.0,0.707116
1,0.0,0.787829,0.035811,0.341216,0.597704,0.276381,0.4161,0.322096,0.137851,1.475991,...,0.107091,0.716279,0.285379,0.456655,0.364303,0.276428,0.155343,0.0,0.0,0.388306


In [26]:
df2=pd.DataFrame(columns=['CLASS'])

In [27]:
df2['CLASS']=labels

In [29]:
df=pd.concat([df1,df2],axis=1)

In [31]:
df3=pd.DataFrame(columns=['img_name'])
df3['img_name']=names

In [32]:
df=pd.concat([df3,df],axis=1)

In [33]:
df

Unnamed: 0,img_name,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x2295,x2296,x2297,x2298,x2299,x2300,x2301,x2302,x2303,CLASS
0,0_before,0.0,0.583658,0.000000,0.278999,0.422945,0.240002,0.429372,0.095949,0.162866,...,0.679212,0.506755,0.190810,0.109557,0.418965,0.306553,0.0,0.0,0.707116,1
1,100_before,0.0,0.787829,0.035811,0.341216,0.597704,0.276381,0.416100,0.322096,0.137851,...,0.716279,0.285379,0.456655,0.364303,0.276428,0.155343,0.0,0.0,0.388306,1
2,101_before,0.0,0.490048,0.028478,0.311732,0.595897,0.322091,0.462107,0.085859,0.220919,...,0.558683,0.397504,0.082787,0.076551,0.295827,0.228750,0.0,0.0,0.622904,1
3,102_before,0.0,0.898348,0.000000,0.529872,0.439545,0.060917,0.238451,0.208466,0.097565,...,0.483302,0.358719,0.247918,0.136771,0.294839,0.224553,0.0,0.0,0.444664,1
4,103_before,0.0,0.899669,0.000000,0.531076,0.445491,0.063092,0.236066,0.212391,0.100149,...,0.487024,0.364249,0.252098,0.135872,0.297459,0.226406,0.0,0.0,0.439750,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,96_after,0.0,0.519422,0.039701,0.437030,0.565162,0.242445,0.422828,0.030521,0.222343,...,0.305267,0.190781,0.148461,0.111052,0.144582,0.112914,0.0,0.0,0.202290,0
307,97_after,0.0,0.810818,0.000000,0.072176,0.145295,0.054518,0.047322,0.189544,0.043516,...,0.589333,0.397108,0.233496,0.174989,0.350880,0.301557,0.0,0.0,0.575434,0
308,98_after,0.0,0.513886,0.038142,0.435579,0.590671,0.251901,0.425681,0.052741,0.220616,...,0.302051,0.199302,0.146744,0.107106,0.154021,0.116740,0.0,0.0,0.207460,0
309,99_after,0.0,0.536693,0.033060,0.449456,0.557479,0.241109,0.405025,0.034209,0.222100,...,0.291468,0.198582,0.150877,0.110008,0.148939,0.107374,0.0,0.0,0.184939,0


In [34]:
#Randomize data records
df=df.sample(frac=1,random_state=0)

#### Dataframe of embedded vector of images

In [35]:
df

Unnamed: 0,img_name,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x2295,x2296,x2297,x2298,x2299,x2300,x2301,x2302,x2303,CLASS
212,131_after,0.000000,1.228871,0.000000,0.106893,0.378725,0.113637,0.000000,0.325422,0.006754,...,0.753120,0.651524,0.110024,0.148169,0.474146,0.354438,0.0,0.0,0.920092,0
146,72_before,0.000000,0.575476,0.000000,0.155242,0.268377,0.100185,0.165688,0.202334,0.045886,...,0.365571,0.213823,0.333159,0.165871,0.175932,0.134899,0.0,0.0,0.283085,0
225,22_after,0.000000,0.575308,0.000000,0.153417,0.266900,0.097767,0.164801,0.204356,0.047476,...,0.362826,0.210589,0.332663,0.164258,0.173274,0.134092,0.0,0.0,0.280454,0
129,57_before,0.064903,0.203333,0.053989,0.483411,0.771733,0.344765,0.595645,0.187687,0.323053,...,0.539960,0.275502,0.050918,0.035403,0.267898,0.231771,0.0,0.0,0.556198,1
89,20_before,0.067074,0.195816,0.058180,0.483589,0.757361,0.341903,0.600656,0.180184,0.326846,...,0.541061,0.280669,0.048102,0.033316,0.267276,0.230436,0.0,0.0,0.557565,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,46_after,0.000000,0.579465,0.000000,0.153379,0.268222,0.098768,0.167883,0.207206,0.047215,...,0.365722,0.212347,0.332824,0.165113,0.177396,0.134560,0.0,0.0,0.282950,0
192,113_after,0.000000,0.645952,0.000000,0.214214,0.579669,0.294877,0.453265,0.223428,0.189683,...,0.437469,0.236357,0.137048,0.095436,0.224110,0.161896,0.0,0.0,0.336993,0
117,46_before,0.061635,0.206741,0.052556,0.484535,0.760611,0.336274,0.599860,0.185466,0.326512,...,0.536913,0.281877,0.045808,0.035034,0.267436,0.229992,0.0,0.0,0.550937,1
47,142_before,0.000000,0.578623,0.006492,0.288660,0.444134,0.246010,0.433019,0.113606,0.154036,...,0.650414,0.493058,0.197702,0.127854,0.402234,0.285287,0.0,0.0,0.685036,1


# Data Shapley

- Data Shapley value for a particular contributor, evolved from Shapley Value for cooperative game theory, can be considered as a quantifiable value given to it based on their individual contribution to the final outcome. 
- This theory aims at calculating and quantifying the contributory value of each data point, wherein each data point refers to a record of the dataset. 
- This algorithm finds out how much each record contributes to the final evaluation metric of the model such as accuracy, F1 score, etc.

In [40]:
from Shapley import ShapNN
from DShap import *
from shap_utils import *

### Data Preparation For Calculating Shapley values

In [41]:
x = df.iloc[:,1:-1]
y = df.iloc[:,[-1]]

In [42]:
y = y.astype(int)

In [43]:
df.shape

(311, 2306)

In [44]:
df.head(249)

Unnamed: 0,img_name,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x2295,x2296,x2297,x2298,x2299,x2300,x2301,x2302,x2303,CLASS
212,131_after,0.000000,1.228871,0.000000,0.106893,0.378725,0.113637,0.000000,0.325422,0.006754,...,0.753120,0.651524,0.110024,0.148169,0.474146,0.354438,0.0,0.0,0.920092,0
146,72_before,0.000000,0.575476,0.000000,0.155242,0.268377,0.100185,0.165688,0.202334,0.045886,...,0.365571,0.213823,0.333159,0.165871,0.175932,0.134899,0.0,0.0,0.283085,0
225,22_after,0.000000,0.575308,0.000000,0.153417,0.266900,0.097767,0.164801,0.204356,0.047476,...,0.362826,0.210589,0.332663,0.164258,0.173274,0.134092,0.0,0.0,0.280454,0
129,57_before,0.064903,0.203333,0.053989,0.483411,0.771733,0.344765,0.595645,0.187687,0.323053,...,0.539960,0.275502,0.050918,0.035403,0.267898,0.231771,0.0,0.0,0.556198,1
89,20_before,0.067074,0.195816,0.058180,0.483589,0.757361,0.341903,0.600656,0.180184,0.326846,...,0.541061,0.280669,0.048102,0.033316,0.267276,0.230436,0.0,0.0,0.557565,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,117_after,0.000000,0.602964,0.000000,0.142713,0.250961,0.090995,0.156461,0.217935,0.039057,...,0.372479,0.216908,0.332648,0.165916,0.178303,0.138481,0.0,0.0,0.287858,0
6,105_before,0.086887,0.000000,0.202608,0.455540,0.600294,0.300128,0.462706,0.040631,0.243717,...,0.343463,0.190030,0.202804,0.186609,0.149572,0.104736,0.0,0.0,0.152989,1
68,161_before,0.000000,0.796945,0.036396,0.340470,0.593390,0.278328,0.417004,0.305991,0.139854,...,0.718166,0.253682,0.432112,0.358564,0.277990,0.150024,0.0,0.0,0.377479,1
203,123_after,0.067970,0.395416,0.222730,0.638634,0.431447,0.334274,0.314200,0.000000,0.400629,...,0.238757,0.289729,0.342533,0.340138,0.115667,0.045735,0.0,0.0,0.158571,0


#### Performing train-test split
Dataset is split into two sets using train_test_split() scikit function where 80% and 20% are of train and test respectively.

In [45]:
x_train=x.head(249)
y_train=y.head(249)
x_test=x.tail(62)
y_test=y.tail(62)
x_train = x_train.values
y_train = y_train.values

##### Convert x_test and y_test to an array

In [46]:
x_test = x_test.to_numpy()
y_test = y_test.to_numpy()

#### Convert list of lists to a single list

In [47]:
Y_train = []
for i in y_train:
    Y_train.extend(i)

In [48]:
Y_test = []
for i in y_test:
    Y_test.extend(i)

#### Convert to an array type for running Shapley

In [49]:
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

In [50]:
print(type(x_train))
print(type(Y_train))
print(type(x_test))
print(type(Y_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


##### Mention the model to be used e.g logistic, NN etc.

In [51]:
model = 'logistic'
problem = 'classification'
num_test = 2304
directory = './tempLog_for_ImageShap1'

In [52]:
import shutil
shutil.rmtree(directory, ignore_errors=True)

### Main Function Call for Calculating Shapley Values

- This is the main method that invokes all shapley functions and returns Data Shapley values.
- This procedure is done thrice with different seed values so that different permutations of data points are considered.

#### Seed : 0
- dshap.run() calculates the shapley values
- Set g_run and loo_run as False to only calculate TMC_Shapley values

In [53]:
%%time
dshap = DShap(x_train, Y_train, x_test, Y_test, num_test, sources=None, model_family=model, metric='accuracy',
              directory=directory, seed=0)
dshap.run(100, 0.1,g_run=False, loo_run=False)


LOO values calculated!
10 out of 100 TMC_Shapley iterations.
20 out of 100 TMC_Shapley iterations.
30 out of 100 TMC_Shapley iterations.
40 out of 100 TMC_Shapley iterations.
50 out of 100 TMC_Shapley iterations.
60 out of 100 TMC_Shapley iterations.
70 out of 100 TMC_Shapley iterations.
80 out of 100 TMC_Shapley iterations.
90 out of 100 TMC_Shapley iterations.
100 out of 100 TMC_Shapley iterations.
10 out of 100 TMC_Shapley iterations.
20 out of 100 TMC_Shapley iterations.
30 out of 100 TMC_Shapley iterations.
40 out of 100 TMC_Shapley iterations.
50 out of 100 TMC_Shapley iterations.
60 out of 100 TMC_Shapley iterations.
70 out of 100 TMC_Shapley iterations.
80 out of 100 TMC_Shapley iterations.
90 out of 100 TMC_Shapley iterations.
100 out of 100 TMC_Shapley iterations.
10 out of 100 TMC_Shapley iterations.
20 out of 100 TMC_Shapley iterations.
30 out of 100 TMC_Shapley iterations.
40 out of 100 TMC_Shapley iterations.
50 out of 100 TMC_Shapley iterations.
60 out of 100 TMC_Shaple

### Merging The Results and Visualizing them

The results obtained from DShap( ) after running it for different seeds are merged to obtain a unique averaged value for the data points

In [54]:
dshap.merge_results()

entered merge
entered merge
SKIPPED: 0000


### Negative TMC-Shapley Values
- The data points that have negative (low)TMC-Shapley values are identified and removed from the original dataset

In [55]:
dshap.values_tmc

array([ 2.55517827e-03,  2.86078098e-03,  2.92020374e-03,  3.28522920e-03,
        1.85059423e-03, -3.64176570e-03,  7.13073005e-04,  3.03056027e-03,
       -9.33786078e-05,  8.65874363e-04,  3.59932088e-03,  1.58743633e-03,
        6.62139219e-04,  2.03735144e-03,  3.60780985e-03,  3.42105263e-03,
        1.71477080e-03,  1.01867572e-03,  8.14940577e-04,  4.01528014e-03,
       -3.36162988e-03,  3.56536503e-04,  1.85059423e-03,  2.32597623e-03,
        5.68760611e-04,  2.97113752e-04,  1.02716469e-03,  3.37860781e-03,
        2.00339559e-03,  2.06281834e-03,  2.40237691e-03,  1.26485569e-03,
        3.54838710e-03,  2.43633277e-03,  3.47198642e-03,  1.49405772e-03,
        1.09507640e-03,  2.41935484e-03,  3.65025467e-04,  1.99490662e-03,
        1.48556876e-03,  2.23259762e-03,  9.42275042e-04,  2.46179966e-04,
        8.23429542e-04,  1.77419355e-03, -2.17317487e-03,  1.48556876e-03,
       -7.31748727e-03,  2.30050934e-03,  1.21392190e-03,  1.92699491e-03,
        3.39558574e-03,  

In [57]:
print(min(dshap.values_tmc), max(dshap.values_tmc))

-0.01079796264855679 0.013191850594227339


#### Listing out the tmc-shapley values of the given dataset

In [58]:
dataTMC = pd.DataFrame({'TMC_Score': dshap.vals_tmc[:]})

In [60]:
dataTMC

Unnamed: 0,TMC_Score
0,0.002555
1,0.002861
2,0.002920
3,0.003285
4,0.001851
...,...
244,0.001647
245,-0.010798
246,0.001452
247,0.001282


##### Sorting out the tmc-shapley values that fall below '0', i.e Low data shapley values

In [62]:
dataNEG_TMC = dataTMC[(dataTMC['TMC_Score']<=0)]

##### Creating a list out of the indices of the records having low tmc-shapley values

In [63]:
drop_list_TMC = list(dataNEG_TMC.index.values) 

In [64]:
len(drop_list_TMC)

15

In [65]:
df.shape

(311, 2306)

#### Dataframe of only positively-scored images

In [66]:
modDf_TMC = df.drop(drop_list_TMC)

In [67]:
modDf_TMC.shape

(296, 2306)

In [68]:
df_TMC=pd.concat([df,dataTMC],axis=1)

In [69]:
df_TMC.shape

(311, 2307)

#### Map each shapley value with the corresponding image

In [70]:
df_TMC.head()

Unnamed: 0,img_name,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x2296,x2297,x2298,x2299,x2300,x2301,x2302,x2303,CLASS,TMC_Score
0,0_before,0.0,0.583658,0.0,0.278999,0.422945,0.240002,0.429372,0.095949,0.162866,...,0.506755,0.19081,0.109557,0.418965,0.306553,0.0,0.0,0.707116,1,0.002555
1,100_before,0.0,0.787829,0.035811,0.341216,0.597704,0.276381,0.4161,0.322096,0.137851,...,0.285379,0.456655,0.364303,0.276428,0.155343,0.0,0.0,0.388306,1,0.002861
2,101_before,0.0,0.490048,0.028478,0.311732,0.595897,0.322091,0.462107,0.085859,0.220919,...,0.397504,0.082787,0.076551,0.295827,0.22875,0.0,0.0,0.622904,1,0.00292
3,102_before,0.0,0.898348,0.0,0.529872,0.439545,0.060917,0.238451,0.208466,0.097565,...,0.358719,0.247918,0.136771,0.294839,0.224553,0.0,0.0,0.444664,1,0.003285
4,103_before,0.0,0.899669,0.0,0.531076,0.445491,0.063092,0.236066,0.212391,0.100149,...,0.364249,0.252098,0.135872,0.297459,0.226406,0.0,0.0,0.43975,1,0.001851


In [71]:
df_TMC=df_TMC.sort_values(by='TMC_Score')

In [72]:
df_TMC

Unnamed: 0,img_name,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x2296,x2297,x2298,x2299,x2300,x2301,x2302,x2303,CLASS,TMC_Score
245,40_after,0.000000,0.520624,0.046866,0.440244,0.596480,0.255171,0.427030,0.039237,0.217873,...,0.197020,0.147968,0.111078,0.151058,0.114604,0.0,0.0,0.197470,0,-0.010798
48,143_before,0.000000,0.480234,0.025560,0.304019,0.602367,0.326866,0.466311,0.088336,0.223159,...,0.386089,0.069192,0.075423,0.291007,0.228329,0.0,0.0,0.627075,1,-0.007317
163,88_before,0.000000,1.228899,0.000000,0.113848,0.398141,0.120975,0.000000,0.327981,0.006742,...,0.682844,0.171369,0.207391,0.496507,0.346075,0.0,0.0,0.920577,0,-0.006800
179,101_after,0.242088,0.000000,0.261432,0.547186,0.726428,0.430509,0.594869,0.001482,0.293618,...,0.172629,0.269613,0.242494,0.143757,0.129684,0.0,0.0,0.127782,0,-0.005806
168,92_before,0.000000,1.031794,0.000000,0.152664,0.123510,0.049361,0.059183,0.313152,0.070317,...,0.409735,0.006893,0.034800,0.348812,0.314608,0.0,0.0,0.807750,0,-0.004015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,96_after,0.000000,0.519422,0.039701,0.437030,0.565162,0.242445,0.422828,0.030521,0.222343,...,0.190781,0.148461,0.111052,0.144582,0.112914,0.0,0.0,0.202290,0,
307,97_after,0.000000,0.810818,0.000000,0.072176,0.145295,0.054518,0.047322,0.189544,0.043516,...,0.397108,0.233496,0.174989,0.350880,0.301557,0.0,0.0,0.575434,0,
308,98_after,0.000000,0.513886,0.038142,0.435579,0.590671,0.251901,0.425681,0.052741,0.220616,...,0.199302,0.146744,0.107106,0.154021,0.116740,0.0,0.0,0.207460,0,
309,99_after,0.000000,0.536693,0.033060,0.449456,0.557479,0.241109,0.405025,0.034209,0.222100,...,0.198582,0.150877,0.110008,0.148939,0.107374,0.0,0.0,0.184939,0,


### Pickle list of positively-scored images

In [105]:
df_TMC[df_TMC['TMC_Score'] <0 ]['img_name']

245      40_after
48     143_before
163     88_before
179     101_after
168     92_before
5      104_before
20     118_before
167     91_before
63     157_before
84     176_before
46     141_before
60     154_before
133     60_before
8      107_before
239      35_after
Name: img_name, dtype: object

In [98]:
with open('positive_images.pkl', 'wb') as f:
    pickle.dump((df_TMC[df_TMC['TMC_Score'] <0 ]['img_name']).tolist(), f)

# CRUX
- Data Shapley algorithm trained on 311 knee surgery images
- 15 images came out as negatively scored Data Shapley value
- 296 images turned out to be good for our model performance

# END