# Notebook to create E2up or P4up QSAR models or apply them
This notebook follows every step used to create the E2up and P4up QSAR models and to apply them to the mammary carcinogens (MC) list.<br>
It also includes data analysis of each set of chemicals (clustering, summary, ...).

In [2]:
# load modules for the analysis
from os import path
import MCcrossref_data
import steroidogenesis_data
import merge_MCcrossWithStereo
import pathFolder
import buildQSAR
import applyQSAR
import pubmedSearch

## 1. Initialisation of the project
### 1.1. Define folder paths and files used as inputs of the analysis  

In [3]:
# --- Project folders -------------------------------------------------------
# The root is resolved two levels above the current working directory. The
# trailing "/" is kept because every path below is built by appending a
# relative suffix to it.
PR_ROOT = f'{path.abspath("../../")}/'
PR_RESULTS = pathFolder.createFolder(f"{PR_ROOT}results/")
PR_DATA = f"{PR_ROOT}data/"

# --- Input files (all located in PR_DATA) ----------------------------------
# Cross-reference Excel workbook listing every dataset to process
# (built from the EHP supplement and Jenny's list, per the original note).
p_listChem = f"{PR_DATA}cross_lists_for_analysis_090921.xlsx"
# Exposure sources by chemical.
p_exposure = f"{PR_DATA}BCRelExposureSources_P65_051221.csv"
# Hormones given as SMILES.
# NOTE(review): the original comment is cut off ("that can be considered for
# the ..."), so the intended use of this list is not stated here.
p_hormones = f"{PR_DATA}hormones.csv"
# Original author's note: 270 chemicals as input.

### 1.2. Calibrate dataset variables

In [4]:
# Dataset preparation values #
##############################
# Pearson correlation limit used for descriptor selection.
COR_VAL = 0.9
# Share (%) of the chemical set allowed to have exactly the same value;
# 0 deactivates this filter.
# NOTE(review): presumably applied per descriptor -- confirm in the modules
# that receive MAX_QUANTILE (MCcrossref_data, buildQSAR).
MAX_QUANTILE = 0

# The input file paths (p_listChem, p_exposure, p_hormones) are defined once,
# in the "Define dataset" part of section 1.1. The verbatim copy that used to
# follow here was removed so the two definitions cannot drift apart.

## 2. Start analysis and loading data
### 2.1. Load Steroidogenesis data from Karmaus2016
Will create folder with drawn response curves

In [None]:
# Instantiate the steroidogenesis data handler with the data and results
# folders, then call its main() entry point.
c_Stereo = steroidogenesis_data.Steroidogenesis_data(
    PR_DATA,
    PR_RESULTS,
)
c_Stereo.main()

### 2.2. Load and analyze the chemical datasets
Processes the chemical lists and computes the descriptors.

In [None]:
# Instantiate the cross-reference handler with the three input files, the
# dataset-preparation settings, the "comptox/" folder under the project root
# and the project root itself, then call its main() entry point.
c_MCcrossref = MCcrossref_data.MCcrossref(
    p_listChem,
    p_exposure,
    p_hormones,
    COR_VAL,
    MAX_QUANTILE,
    f"{PR_ROOT}comptox/",
    PR_ROOT,
)
c_MCcrossref.main()

### 2.3. Mix information from chemicals lists with steroidogenesis pathways
Will integrate analysis from chemicals lists and from assay results.

In [None]:
# Output folder for the combined analysis, obtained from pathFolder.createFolder.
pr_out_mix = pathFolder.createFolder(PR_RESULTS + "crossRef_Stereo/")

# Combine the chemical-list object with the steroidogenesis object and call
# the merger's main() entry point.
c_MCcrossref_stereo = merge_MCcrossWithStereo.merge_MCcrossWithStereo(
    c_MCcrossref,
    c_Stereo,
    pr_out_mix,
)
c_MCcrossref_stereo.main()

## 3. Develop QSAR models
### 3.1. E2up - with no undersampling

In [None]:
# E2up / H295R models; per the section title this run uses no undersampling
# (runQSARs() is called without arguments).
name_QSAR = "QSAR_E2_H295R_nosampling_nosingledosecheck_noborderline"
c_QSAR_E2up = buildQSAR.buildQSAR(
    name_QSAR,
    "E2up",
    "H295R",
    c_MCcrossref,
    PR_RESULTS,
    COR_VAL,
    MAX_QUANTILE,
)
c_QSAR_E2up.buildDataset(c_Stereo, borderline=0)
c_QSAR_E2up.buildDescSet(["rdkit", "OPERA", "toxprint"])
c_QSAR_E2up.prepDesc()
c_QSAR_E2up.computeSimMatrix()  # similarity matrix used for the AD
c_QSAR_E2up.runQSARs()  # no sampling argument passed

# Index of the run kept as the best model (chosen manually); section 4.1
# uses it to build the model folder path.
n_E2best_run = 2

### 3.2. E2up - undersampling with variable active rate

In [None]:
# E2up / H295R models with undersampling. Rebinding c_QSAR_E2up replaces the
# object created in section 3.1.
# NOTE(review): the folder name says "singledosecheck", yet buildDataset is
# called with the same arguments as in the "nosingledosecheck" run -- confirm
# whether a single-dose check is actually applied here.
name_QSAR = "QSAR_E2_H295R_variable-sampling_singledosecheck_noborderline"
c_QSAR_E2up = buildQSAR.buildQSAR(
    name_QSAR,
    "E2up",
    "H295R",
    c_MCcrossref,
    PR_RESULTS,
    COR_VAL,
    MAX_QUANTILE,
)
c_QSAR_E2up.buildDataset(c_Stereo, borderline=0)
c_QSAR_E2up.buildDescSet(["rdkit", "OPERA", "toxprint"])
c_QSAR_E2up.prepDesc()
c_QSAR_E2up.computeSimMatrix()  # similarity matrix used for the AD
# Presumably the bounds of the variable active rate named in the section
# title -- confirm against buildQSAR.runQSARs.
c_QSAR_E2up.runQSARs([0.10, 0.9])

### 3.3. P4up - no undersampling

In [None]:
# P4up / H295R models; per the section title this run uses no undersampling
# (runQSARs() is called without arguments).
name_QSAR = "QSAR_P4_H295R_nosampling_nosingledosecheck_noborderline"
c_QSAR_P4up = buildQSAR.buildQSAR(
    name_QSAR,
    "P4up",
    "H295R",
    c_MCcrossref,
    PR_RESULTS,
    COR_VAL,
    MAX_QUANTILE,
)
c_QSAR_P4up.buildDataset(c_Stereo, borderline=0)
c_QSAR_P4up.buildDescSet(["rdkit", "OPERA", "toxprint"])
c_QSAR_P4up.prepDesc()
c_QSAR_P4up.computeSimMatrix()  # similarity matrix used for the AD
c_QSAR_P4up.runQSARs()  # no sampling argument passed

# Index of the run kept as the best model (chosen manually). Section 4.2
# reads n_P4best_run to build the model folder path, so it must be defined;
# it was previously commented out, which made that cell fail with NameError.
# NOTE(review): 3 is the value from the commented-out line -- confirm it is
# still the selected run.
n_P4best_run = 3

### 3.4. P4up - undersampling with variable active rate

In [None]:
name_QSAR = "QSAR_P4_H295R_variable-sampling_singledosecheck_noborderline"
c_QSAR_P4up = buildQSAR.buildQSAR(name_QSAR, "P4up", "H295R", c_MCcrossref, PR_RESULTS, COR_VAL, MAX_QUANTILE)
c_QSAR_P4up.buildDataset(c_Stereo, borderline=0)
c_QSAR_P4up.buildDescSet(["rdkit", "OPERA", "toxprint"])
c_QSAR_P4up.prepDesc()
c_QSAR_P4up.computeSimMatrix()# similarity matrix for the AD
c_QSAR_P4up.runQSARs([0.10, 0.9])

## 4. Apply QSAR models to MC list
### 4.1. Apply QSAR E2up

In [None]:
# Folder of the manually selected E2up model (run index n_E2best_run) inside
# the no-sampling QSAR results.
pr_model_E2up = (
    PR_RESULTS
    + "QSAR_E2_H295R_nosampling_nosingledosecheck_noborderline/rdkit-OPERA-toxprint_0.9-0/classQSAR/"
    + str(n_E2best_run)
    + "/"
)
pr_E2MC_pred = pathFolder.createFolder(PR_RESULTS + "predMC_E2/")

c_applyQSARE2 = applyQSAR.applyQSAR(
    c_MCcrossref,
    pr_model_E2up,
    pr_E2MC_pred,
)
c_applyQSARE2.loadDataFromCrossRef("MC", ["E2up", "H295R"], 1)
c_applyQSARE2.buildDescSet(["rdkit", "OPERA", "toxprint"])
c_applyQSARE2.applyQSARModels()
c_applyQSARE2.computeAD()
c_applyQSARE2.applyToxPrintSignifcant(PR_RESULTS)
# Merge QSAR predictions with the ToxPrint results using the cutoffs below.
c_applyQSARE2.mergePredToxPrintQSAR(
    AD_cutoff=0.75,
    nb_significant_toxPrint=3,
    QSAR_prob=0.5,
)
c_applyQSARE2.extractStructure()

### 4.2. Apply QSAR P4up

In [None]:
# Folder of the manually selected P4up model inside the no-sampling QSAR
# results. The folder name must match the name_QSAR used to build the P4up
# models in section 3.3 ("QSAR_P4_..."); it previously pointed at the E2
# folder, so the E2up model would have been applied under the P4up label.
# n_P4best_run is the index of the manually selected P4up run and must be
# set before running this cell.
pr_model_P4up = (
    PR_RESULTS
    + "QSAR_P4_H295R_nosampling_nosingledosecheck_noborderline/rdkit-OPERA-toxprint_0.9-0/classQSAR/"
    + str(n_P4best_run)
    + "/"
)
pr_P4MC_pred = pathFolder.createFolder(PR_RESULTS + "predMC_P4/")

c_applyQSARP4 = applyQSAR.applyQSAR(
    c_MCcrossref,
    pr_model_P4up,
    pr_P4MC_pred,
)
c_applyQSARP4.loadDataFromCrossRef("MC", ["P4up", "H295R"], 1)
c_applyQSARP4.buildDescSet(["rdkit", "OPERA", "toxprint"])
c_applyQSARP4.applyQSARModels()
c_applyQSARP4.computeAD()
c_applyQSARP4.applyToxPrintSignifcant(PR_RESULTS)
# Merge QSAR predictions with the ToxPrint results using the cutoffs below.
c_applyQSARP4.mergePredToxPrintQSAR(
    AD_cutoff=0.75,
    nb_significant_toxPrint=3,
    QSAR_prob=0.5,
)
c_applyQSARP4.extractStructure()

## 5. Literature search to find evidence of steroidogenesis for predicted chemicals E2up or P4up
### 5.1. For E2up

In [None]:
# Contact e-mail handed to pubmedSearch.pubmedSearch(...) in the literature
# search cells of this section.
email = "borrel@silentspring.org"

In [None]:
# Workbook with the E2up predictions, read from the data folder.
p_pred_E2up_xlx = f"{PR_DATA}pred_MC/ToxPrint_QSAR_E2up.xlsx"
# Search terms; each one carries its own literal double quotes.
l_term_to_screen_E2 = [
    '"estradiol"',
    '"steroidogenesis"',
    '"aromatase"',
    '"reproductive"',
    '"breast"',
    '"mammary"',
]

pr_litt_E2 = pathFolder.createFolder(PR_RESULTS + "LittMC_E2/")
c_litt_E2 = pubmedSearch.pubmedSearch(
    p_pred_E2up_xlx,
    l_term_to_screen_E2,
    pr_litt_E2,
    email,
)
# The same term list is passed again to do_search.
c_litt_E2.do_search(l_term_to_screen_E2)


### 5.2. For P4up

In [None]:
# Workbook with the P4up predictions, read from the data folder.
p_pred_P4up_xlx = f"{PR_DATA}pred_MC/ToxPrint_QSAR_P4up.xlsx"
# Search terms; each one carries its own literal double quotes.
l_term_to_screen_P4 = [
    '"steroidogenesis"',
    '"progesterone"',
    '"aromatase"',
    '"reproductive"',
    '"breast"',
    '"mammary"',
]

pr_litt_P4 = pathFolder.createFolder(PR_RESULTS + "LittMC_P4/")
c_litt_P4 = pubmedSearch.pubmedSearch(
    p_pred_P4up_xlx,
    l_term_to_screen_P4,
    pr_litt_P4,
    email,
)
# The same term list is passed again to do_search.
c_litt_P4.do_search(l_term_to_screen_P4)