# This is the modified generative model which utilizes the protein interaction information for generating gene clusters. The main generative model is implemented using [Snorkel](https://github.com/HazyResearch/snorkel)

In [None]:
from scipy import sparse
import numpy as np
from snorkel.learning import GenerativeModel
import ast
import os
import json

## Loading the MOO-based solutions as weak supervision solutions

In [None]:
# Name of the dataset; it is used both as a directory component and as a
# file-name prefix below.
# NOTE(review): 'prostrate' may be a misspelling of 'prostate'. It is kept
# as-is because it has to match the directory/file names on disk.
dataset_type = 'prostrate'

# Directory holding every input and output file of this notebook.
path_prefix = ''.join(['data/', dataset_type, '_panther/'])

# File containing the MOO-based (weak supervision) solutions.
filepath = ''.join([path_prefix, dataset_type, '_NDS_labels_50.txt'])

## Loading the protein interaction information as the weights of the MOO-based solutions

In [None]:
# Suffix appended to the output file names written at the end of the notebook.
ppitext = '_withPPI'

# File containing the weight list for the MOO-based solutions.
weightpath = ''.join([path_prefix, dataset_type, '_weight_list.txt'])

# Base name of the labels file without directory and extension; it is reused
# when building the output file names.
filename = os.path.basename(filepath)
filename = os.path.splitext(filename)[0]

## Accessing MOO-based solutions along with their PPI-based weights

In [None]:
# Read the weight list and the MOO-based solutions from disk.
# Context managers guarantee that both handles are closed, even if reading
# fails; `w` and `f` stay bound (to the closed files) as in the original.
with open(weightpath, 'r') as w:
    weights = w.read()
with open(filepath, 'r') as f:
    x = f.readlines()

# Only the last line of the labels file is used, and its first 7 characters
# are dropped before parsing (presumably a fixed textual prefix -- TODO
# confirm against the file format). An empty file raises IndexError here.
lis_0 = x[-1]
lis_0 = lis_0[7:]

# Both texts are parsed as Python literals; literal_eval only accepts
# literal structures and does not evaluate arbitrary expressions.
lis_0 = ast.literal_eval(lis_0)
weights = ast.literal_eval(weights)

<b>lis_0</b> is a list of lists containing the MOO-based solutions; their corresponding weights are stored in <b>weights</b>.

## Loading of three GO-based solutions

These three types of GO-based solutions are obtained from the [PANTHER](http://www.pantherdb.org/) classification system. They are based on the biological process (BP), molecular function (MF) and cellular component (CC) of the genes.

In [None]:
# Load the three GO-based solutions, stored as JSON, one file per GO aspect.

# Biological process (BP) labels.
with open(path_prefix + 'panther_labels_bp.txt') as bp_file:
    bio_labels1 = json.loads(bp_file.read())

# Molecular function (MF) labels.
with open(path_prefix + 'panther_labels_mf.txt') as mf_file:
    bio_labels2 = json.loads(mf_file.read())

# Cellular component (CC) labels.
with open(path_prefix + 'panther_labels_cc.txt') as cc_file:
    bio_labels3 = json.loads(cc_file.read())

## Appending all MOO-based solutions along with the GO-based solutions in a list 

In [None]:
# Shift every label of every MOO-based solution up by one.
# NOTE(review): presumably this converts 0-based cluster ids to 1-based so
# that 0 is not used as a label -- confirm against the generative model's
# label convention.
lis_1 = [[label + 1 for label in solution] for solution in lis_0]

# The GO-based solutions are appended unchanged, after the MOO-based ones.
lis_1.extend([bio_labels1, bio_labels2, bio_labels3])

Converting solution list(<b>lis_1</b>) into a sparse matrix

In [None]:
# Stack the solutions into a dense array, one solution per row.
# NOTE(review): assumes every solution has the same length so that the array
# is 2-D -- confirm.
labels = np.array(lis_1)

# Transpose so that each solution becomes a column, then store the result in
# CSR sparse format.
sparse_labels = sparse.csr_matrix(labels.T)

## Initializing and training the generative model

Here, <b><i>LF_acc_prior_weights</i></b> is the argument of <b><i>gen_model.train</i></b> that accepts the protein interaction information as the weights.
Set <b><i>LF_acc_prior_weights</i></b> = None if you do not want to use any explicitly defined weights.

In [None]:
# Create the Snorkel generative model with its default constructor arguments.
gen_model = GenerativeModel()

# Fit the model on the sparse label matrix; the PPI-based weights are handed
# over through the LF_acc_prior_weights keyword.
gen_model.train(sparse_labels, LF_acc_prior_weights=weights)

# Query the trained model for the marginals of the same label matrix.
train_marginals = gen_model.marginals(sparse_labels)

## Storing the final labels and marginals in the file.  

In [None]:
# Write the marginals to disk, one entry of `train_marginals` per line.
with open(path_prefix + 'Panther_Marginals_' + filename + ppitext + '.txt', 'w') as f:
    f.writelines("%s\n" % item for item in train_marginals)

# The final label of each row is the index of its largest marginal.
# np.argmax returns the first index on ties, matching list.index(max(...));
# int() keeps the stored labels as plain Python integers.
# NOTE(review): assumes each entry of `train_marginals` is a 1-D sequence of
# marginals -- confirm for the model's cardinality.
final_labels = [int(np.argmax(row)) for row in train_marginals]

# Write the final labels to disk, one label per line.
with open(path_prefix + 'Panther_gen_' + filename + ppitext + '.txt', 'w') as f:
    f.writelines("%s\n" % item for item in final_labels)