# Disjointification Example

- Demonstrates feature selection through disjointification
- Data is an example of gene expression on patients

## defs/imports/loads

In [31]:
import disjointification
from disjointification import load_gene_expression_data, Disjointification
from pathlib import Path
from pprint import pprint
import pandas as pd
import numpy as np

## Survey the dataset & decide on model parameters
### Survey the data

In [32]:
if 'labels_df' not in locals() or 'features_df' not in locals():
        print(f"Dataframes not loaded. Loading.")
        ge_data = load_gene_expression_data()
        features_df = ge_data["features"]
        labels_df = ge_data["labels"]
        print(f"features_df loaded with shape {features_df.shape}")
        print(f"labels_df loaded with shape {labels_df.shape}")
print(f"labels df shape: {labels_df.shape}")
print(f"features df shape: {features_df.shape}")

labels df shape: (3069, 7)
features df shape: (3069, 9260)


### Set Model paramers
- load_last_save_point, last_save_point - enables loading of a previous model and provide a path where it was saved, in .pkl format
- min_num_features - disjointification will stop after the best N features found
- correlation_threshold - disjointification will only select a feature less correlated to the previous ones than this
- select_num_features select_num_instance - allows shrinking the dataset to a given size (int) or fraction (fraction), primarily for debugging
- alert selection, debug print - printout when a feature has been selected via disjointification and when various actions are taken, for debugging
- model_save_folder - root path under which different models are saved

In [33]:
load_last_save_point = False
last_save_point = r"model\06_24_2023__10_58_52\06_24_2023__10_59_03_(3069, 9260).pkl"

# shrink the dataset for debugging
select_num_features = 1.0
select_num_instances = 1.0
alert_selection = True
debug_print = False
model_save_folder = r"\model"
min_num_features = 800
correlation_threshold = 0.2

### Create model

In [34]:
if load_last_save_point:
    print(f"loading model from last save point {last_save_point}")
    test = disjointification.from_file(last_save_point)
else:
    test = Disjointification(features_file_path=None, labels_file_path=None, features_df=features_df, 
                             labels_df=labels_df, select_num_features=select_num_features, select_num_instances=select_num_instances, 
                             root_save_folder=model_save_folder, do_set=False, alert_selection=alert_selection, 
                             correlation_threshold=correlation_threshold, min_num_features=min_num_features)
    test.set()
test.describe()

saving model...
saved model to C:\model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl
Disjointification Test Description
features data: (3069, 9260)
labels data: (3069, 2)
regression label: Lympho
classification label: ER
correlation method regression: pearson
correlation method regression: kendall
min num of features to keep in disjointification: 800
correlation threshold: 0.2
last save point: \model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl
number of features kept in disjointification: lin 0, log 0


### Create a save point

In [35]:
last_save_point = test.last_save_point_file
print('last save point:')
print(last_save_point)
test = disjointification.from_file(last_save_point)

last save point:
\model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl


In [36]:
test.describe()

Disjointification Test Description
features data: (3069, 9260)
labels data: (3069, 2)
regression label: Lympho
classification label: ER
correlation method regression: pearson
correlation method regression: kendall
min num of features to keep in disjointification: 800
correlation threshold: 0.2
last save point: \model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl
number of features kept in disjointification: lin 0, log 0


### Run Disjointification

In [37]:
start_time = disjointification.utils.get_dt_in_fmt()
print(f"{start_time} Running Disjointificatioin")
test.run_disjointification()

08_08_2023__20_07_59 Running Disjointificatioin
saving model...
saved model to C:\model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl
saving model...
saved model to C:\model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl


In [38]:
pprint(f"features selected in disjointification lin: \n{test.features_selected_in_disjointification_lin}")

('features selected in disjointification lin: \n'
 "['LCK', 'NFKB1', 'CNR1', 'RAB1B', 'PTPRF', 'ZNF358', 'PRCP', 'SULT1B1', "
 "'LEPROTL1', 'PSTPIP2', 'SYNJ2', 'MYL2', 'ABCB11', 'NUDCD3', 'GRM8', 'SNTA1', "
 "'C19orf73', 'SSR4', 'RHOH', 'ZNF394', 'IL17A', 'NTRK1', 'GNB3', 'CFHR3', "
 "'GRK4', 'SYT2', 'FAR2', 'SMYD3', 'TMSB15B', 'MMP20', 'IL20RA', 'UGGT2', "
 "'CDV3', 'DEFA4', 'QRSL1', 'OR3A3', 'DDX43', 'ARL14', 'OMG', 'GATC', "
 "'IL20RB', 'FHL2', 'CYP2A13', 'ATF3', 'APC2', 'CHD7', 'SUPT3H', 'RGPD1', "
 "'ACTN3', 'FLOT2', 'OTUB2', 'GNG13', 'FAM153A', 'BTNL2', 'RETN', 'CYP4F2', "
 "'DNAJC8', 'CCDC15', 'KIR3DL3', 'CPNE6', 'CSPG5', 'CPPED1', 'CISD1', 'ATP4A', "
 "'YOD1', 'C3orf14', 'PYGL', 'HK2', 'PHYH', 'MC4R', 'CLEC4M', 'GLUD2', 'AAK1', "
 "'SLC12A5', 'VRTN', 'RPGRIP1', 'MAN1B1', 'CRYBA4', 'RPSA', 'CASP7', 'PADI3', "
 "'ANXA10', 'TPPP3', 'RRH', 'HIST1H2BI', 'IMPG2', 'PEPD', 'SLC5A5', 'FADS1', "
 "'WWP2', 'GPC5', 'DNAJB1', 'EIF3C', 'MYH14', 'MLC1', 'KIF25', 'CALM1', "
 "'BCAN', 'AURKC', 

In [39]:
pprint(f"features selected in disjointification log: \n{test.features_selected_in_disjointification_log}")

('features selected in disjointification log: \n'
 "['ESR1', 'SLC26A3', 'SLC24A2', 'AHSG', 'AMHR2', 'GH2', 'PIP4K2B', 'DOCK3', "
 "'HRG', 'GLRX5', 'GLRA3', 'CRISP2', 'ISOC1', 'KRT34', 'TEC', 'MTNR1A', "
 "'MYH6', 'HSPB3', 'ANO2', 'HRH3', 'VPS35', 'TNNI3', 'DNAH2', 'IL17A', "
 "'R3HCC1', 'CTBP1', 'IFNA10', 'HSD17B1', 'ARC', 'SARDH', 'GDF3', 'PGM3', "
 "'SLC39A8', 'CDC37L1', 'SLC28A3', 'DTNB', 'NKX2-1', 'MCF2', 'FGF14', "
 "'ZNF549', 'SERPINC1', 'KRT20', 'SLC22A1', 'CALCA', 'PRDM9', 'GPC5', "
 "'ALDH3B1', 'RNLS', 'GH1', 'S100G', 'SCN1A', 'RBFOX1', 'DSCAM', 'ATP2B3', "
 "'NR5A1', 'CAMK2G', 'SGK2', 'RBPJL', 'TUBA3C', 'CASP7', 'HAND1', 'ATP12A', "
 "'SLC25A4', 'STMN4', 'MNS1', 'TMPRSS15', 'NCR2', 'PRKACA', 'NR1I3', 'KIF25', "
 "'EDF1', 'RSF1', 'CYP2F1', 'DSCR4', 'CYP51A1', 'GNPDA1', 'CCZ1', 'TGM3', "
 "'FAM50B', 'CALML4', 'TDRD3', 'DNTT', 'HAUS4', 'RFPL2', 'DNAH3', 'TRDMT1', "
 "'LRRC36', 'HIST1H3I', 'FOLR3', 'SLC34A1', 'CFHR2', 'BTNL3', 'PEX5', 'CDH15', "
 "'HIST1H4C', 'IL13', 'MYOG', 'KLH

In [40]:
print(f"Last saved \n{test.last_save_point_file} \nat \n{test.last_save_time}")

Last saved 
\model\08_08_2023__20_07_47\08_08_2023__20_07_47.pkl 
at 
08_09_2023__06_05_46
