In [8]:
import pandas as pd
import argparse
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset
from sklearn.model_selection import train_test_split, KFold
import shap
import matplotlib.pyplot as plt

from model.train import train_and_test, cross_validation
from model.code.load_data import preprocess_data
from model.code.models import NN, LSTM

In [9]:
parser = argparse.ArgumentParser()

# Model params
parser.add_argument('--model', type=str, default='NN',
                help='Which model to use for training: NN or LSTM')
parser.add_argument('--num_features', type=int, default=2834,
                help='Length of an input sequence/ amount of features each sample contains')
parser.add_argument('--input_size', type=int, default=1,
                help='Size of an input sequence')
parser.add_argument('--LSTM_hidden_size', type=int, default=128,
                help='Number of units in each LSTM layer')
parser.add_argument('--LSTM_num_layers', type=int, default=2,
                help='Number of hidden layers in the LSTM')
parser.add_argument('--NN_hidden', type=list, default=[128,128,128],
                help='List of which the length is the number of hidden layers and the values are the layer sizes in the NN')
parser.add_argument('--num_classes', type=int, default=3,
                help='Number of classes the model needs to be able to predict')

# Training params
parser.add_argument('--batch_size', type=int, default=7,
                help='Number of examples to process in a batch')
parser.add_argument('--learning_rate', type=float, default=0.001,
                help='Learning rate')
parser.add_argument('--num_epochs', type=int, default=50,
                help='Amount of epochs used in training')
parser.add_argument('--test_size', type=float, default=0.3,
                help='Amount of data to use for testing (leave zero to use all data for training')
parser.add_argument('--device', type=str, default='cuda',
                help='Device to use (cpu or gpu)')

args, unknown = parser.parse_known_args()


# Device configuration
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the data innto PyTorch
train_call = pd.read_csv('full_data.csv', delimiter=',' )
train_clin = pd.read_csv('Train_clinical.txt', delimiter='\t' )
train_arr, labels, new_df = preprocess_data(train_call, train_clin)

args.num_features = train_arr.shape[1]
print("Number of features that will be used: ", args.num_features)
if args.model == 'NN':
    model = NN(args).to(args.device)
elif args.model == 'LSTM':
    model = LSTM(args).to(args.device)

torch.manual_seed(42)

X_train, X_test, y_train, y_test = train_test_split(train_arr, labels, test_size=args.test_size)
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

train_loader = torch.utils.data.DataLoader(train_data, 
                                    batch_size=args.batch_size,
                                    shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, 
                                    batch_size=args.batch_size, 
                                    shuffle=False)

train_and_test(args, model, train_loader, test_loader)


Number of features that will be used:  918
Epoch [50/50], Loss: 0.0000, Accuracy: 100.0
Completed training!

25 correct of  30
Accuracy of the network on the test Array sequences: 83.33333333333333 %


([41.42857142857143,
  78.57142857142857,
  97.14285714285714,
  97.14285714285714,
  98.57142857142857,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0,
  100.0],
 [tensor(1.0783, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.8390, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.3078, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0921, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0373, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0140, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0043, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0035, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0024, 

In [29]:
# cross_validation(args, model, train_arr, labels)
# batch = next(iter(train_loader))
# batch = batch
# data, _ = batch
features = (new_df["ID_no"]).tolist()
gfeatures = (new_df["Gene_IDs"]).tolist()
# print(features)
# print(len(features))
# f = [x for x in features]
# # df["fruit"].astype("|S")
# print(len(features))
# f = []
sm = torch.load("model/NN.pth").to(args.device)
# dataset =  TensorDataset(torch.from_numpy(train_arr), torch.from_numpy(labels))
X_full = np.vstack((X_train, X_test))
e = shap.DeepExplainer(sm, torch.from_numpy(X_train).to(args.device))
shap_values = e.shap_values(torch.from_numpy(X_full).to(args.device))


class1 = shap_values[0]
class2 = shap_values[1]
class3 = shap_values[2]


# shap.summary_plot(shap_values, features=torch.from_numpy(X_train).to(args.device), feature_names = features, show=False)
# plt.savefig("summary_plot.png")

m = (np.mean(np.abs(shap_values), axis=0))
# print(m.shape)
mm = np.mean(m, axis=0)
# print(mm.shape)
new = np.c_[features, mm ]   
print(new.shape)
df = pd.DataFrame(new, columns=['feature_no', 'feature_importance'])
# df = pd.DataFrame({
#     "mean_0": np.mean(np.abs(class1), axis=0), 
#     "std_0": np.std(np.abs(class1), axis=0),
#     "id":features
# }, index=[2])

# df.columns = ['feature_importance']
# df = df.rename(index={0: "feature_no", 1: "feature_importance"})
# df.head()
# # print(df)
# df.to_csv('final.csv')
df['gene_ids'] = np.array(gfeatures)    
# print(df)
df = df.sort_values(by=['feature_importance'], ascending=False)
df = df.astype({'feature_no': 'int32'})
df = df.reset_index(drop=True)
df
# df.to_csv('final.csv')


(918, 2)


Unnamed: 0,feature_no,feature_importance,gene_ids
702,2184.0,0.330739,"['ENSG00000113492', 'ENSG00000064999', 'ENSG00..."
768,2379.0,0.106000,"['ENSG00000142892', 'ENSG00000176204', 'ENSG00..."
66,230.0,0.101475,['ENSG00000118200']
278,855.0,0.100935,"['ENSG00000113658', 'ENSG00000123453']"
719,2213.0,0.094339,"['ENSG00000117013', 'ENSG00000064042', 'ENSG00..."
...,...,...,...
1,25.0,0.003993,"['ENSG00000041988', 'ENSG00000146576', 'ENSG00..."
173,554.0,0.003124,"['ENSG00000162620', 'ENSG00000188687', 'ENSG00..."
814,2537.0,0.002973,"['ENSG00000215114', 'ENSG00000139734', 'ENSG00..."
860,2671.0,0.002570,"['ENSG00000133216', 'ENSG00000173535', 'ENSG00..."
