# Disjointification Example

- Demonstrates feature selection through disjointification
- Data is an example of gene expression on patients

## defs/imports/loads

In [4]:
import disjointification
from disjointification import load_gene_expression_data, Disjointification
from pathlib import Path
from pprint import pprint
import pandas as pd
import numpy as np

## Survey the dataset & decide on model parameters
### Survey the data

In [8]:
if 'labels_df' not in locals() or 'features_df' not in locals():
        print(f"Dataframes not loaded. Loading.")
        ge_data = load_gene_expression_data()
        features_df = ge_data["features"]
        labels_df = ge_data["labels"]
        print(f"features_df loaded with shape {features_df.shape}")
        print(f"labels_df loaded with shape {labels_df.shape}")
print(f"labels df shape: {labels_df.shape}")
print(f"features df shape: {features_df.shape}")

labels df shape: (3069, 8)
features df shape: (3069, 9266)


### Set Model paramers
- load_last_save_point, last_save_point - enables loading of a previous model and provide a path where it was saved, in .pkl format
- min_num_features - disjointification will stop after the best N features found
- correlation_threshold - disjointification will only select a feature less correlated to the previous ones than this
- select_num_features select_num_instance - allows shrinking the dataset to a given size (int) or fraction (fraction), primarily for debugging
- alert selection, debug print - printout when a feature has been selected via disjointification and when various actions are taken, for debugging
- model_save_folder - root path under which different models are saved

In [9]:
load_last_save_point = False
last_save_point = r"model\06_24_2023__10_58_52\06_24_2023__10_59_03_(3069, 9260).pkl"

# shrink the dataset for debugging
select_num_features = 1.0
select_num_instances = 1.0
alert_selection = True
debug_print = False
model_save_folder = r"\model"
min_num_features = 2500
correlation_threshold = 0.7

### Create model

In [13]:
if load_last_save_point:
    print(f"loading model from last save point {last_save_point}")
    test = disjointification.from_file(last_save_point)
else:
    test = Disjointification(features_file_path=None, labels_file_path=None, features_df=features_df, 
                             labels_df=labels_df, select_num_features=select_num_features, select_num_instances=select_num_instances, 
                             root_save_folder=model_save_folder, do_set=False, alert_selection=alert_selection, 
                             correlation_threshold=correlation_threshold, min_num_features=min_num_features)
    test.set()
test.describe()

saving model...
saved model to C:\model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl
Disjointification Test Description
features data: (3069, 9260)
labels data: (3069, 2)
min num of features to keep in disjointification: 2500
correlation threshold: 0.7
last save point: \model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl


### Create a save point

In [14]:
last_save_point = test.last_save_point_file
print('last save point:')
print(last_save_point)
test = disjointification.from_file(last_save_point)

last save point:
\model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl


In [15]:
test.describe()

Disjointification Test Description
features data: (3069, 9260)
labels data: (3069, 2)
min num of features to keep in disjointification: 2500
correlation threshold: 0.7
last save point: \model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl


### Run Disjointification

In [16]:
start_time = disjointification.utils.get_dt_in_fmt()
print(f"{start_time} Running Disjointificatioin")
test.run_disjointification()

07_09_2023__21_36_30 Running Disjointificatioin
saving model...
saved model to C:\model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl
saving model...
saved model to C:\model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl


In [17]:
pprint(f"features selected in disjointification lin: \n{test.features_selected_in_disjointification_lin}")

('features selected in disjointification lin: \n'
 "['LCK', 'PLCG2', 'SLA', 'GIMAP6', 'FCRL2', 'B2M', 'HLA-DOA', 'ZBTB32', "
 "'TNFRSF17', 'FCER2', 'KLRC1', 'VNN2', 'LAIR2', 'BCL2A1', 'HSD11B1', 'FXYD2', "
 "'CD1B', 'STK17B', 'P2RX5', 'TCF7', 'STX11', 'GP1BA', 'RASSF2', 'SLCO5A1', "
 "'FNBP1', 'IL15', 'DEF6', 'VCAM1', 'GPR25', 'CASP4', 'VAMP5', 'CAMK4', "
 "'LAP3', 'CR1', 'APOBEC3C', 'SYK', 'DUSP2', 'CCL4', 'BTN3A2', 'GBP2', "
 "'SIPA1', 'SLC12A3', 'LY96', 'FOXN2', 'CSK', 'RAB8B', 'DOCK10', 'PLA1A', "
 "'TMEM140', 'STK4', 'CCL17', 'CXCL13', 'CRLF3', 'PIP4K2A', 'CCL13', 'PTPN6', "
 "'MAP3K14', 'IL7', 'CXCR4', 'NMI', 'WNT1', 'BTN2A2', 'P2RY6', 'TCN2', "
 "'CYTH1', 'MATK', 'CCL22', 'ARAP2', 'NECAP2', 'MSL3', 'CD101', 'TRIM21', "
 "'GMIP', 'MX2', 'ITM2C', 'PPP2R2B', 'HLA-DQB1', 'FXYD7', 'BATF3', 'PRDM8', "
 "'ACHE', 'ARRB2', 'RASGRP3', 'ADA', 'CEACAM21', 'SOD2', 'NFKB2', 'BCL2L14', "
 "'IL21', 'ASGR2', 'ZNF80', 'ISG20', 'F5', 'IL1R2', 'C3', 'DAPP1', 'C2', "
 "'IL10', 'LBR', 'ITIH1', 'SLC15

In [18]:
pprint(f"features selected in disjointification log: \n{test.features_selected_in_disjointification_log}")

('features selected in disjointification log: \n'
 "['A2M', 'PRKACB', 'PPP3CC', 'PPP3R1', 'PPP4R1', 'PPP4R4', 'PPP5C', 'PPP6C', "
 "'PPP6R1', 'PPRC1', 'PPT1', 'PPT2', 'PPWD1', 'PPY', 'PQLC1', 'PQLC3', "
 "'PRAF2', 'PPP3CB', 'PPP3CA', 'PPP2R5E', 'PPP2R2B', 'PPP1R9A', 'PPP2CA', "
 "'PPP2CB', 'PPP2R1A', 'PPP2R1B', 'PPP2R2A', 'PPP2R2D', 'PPP2R5D', 'PPP2R3A', "
 "'PPP2R3B', 'PPP2R3C', 'PPP2R5A', 'PPP2R5B', 'PPP2R5C', 'PRAME', 'PRAMEF10', "
 "'PRB2', 'PRG2', 'PRDX4', 'PRDX6', 'PREB', 'PREP', 'PREPL', 'PRF1', 'PRG3', "
 "'PRDX2', 'PRH2', 'PRIM1', 'PRIM2', 'PRKAA2', 'PRKAB1', 'PRKAB2', 'PRDX3', "
 "'PRDX1', 'PRB3', 'PRDM12', 'PRC1', 'PRCC', 'PRCP', 'PRDM1', 'PRDM10', "
 "'PRDM11', 'PRDM13', 'PRDM9', 'PRDM14', 'PRDM16', 'PRDM2', 'PRDM4', 'PRDM5', "
 "'PRDM8', 'PPP1R8', 'PPP1R7', 'PPP1R3C', 'PPBP', 'PPA1', 'PPARA', 'PPARD', "
 "'PPARG', 'PPARGC1A', 'PPAT', 'PPCDC', 'POU6F1', 'PPCS', 'PPDPF', 'PPEF1', "
 "'PPEF2', 'PPFIA1', 'PPFIA2', 'POU6F2', 'POU5F1', 'PPFIA4', 'POR', 'POMZP3', "
 "'PON2', 'PON

In [20]:
print(f"Last saved {test.last_save_point_file} at {test.last_save_time}")

Last saved \model\07_09_2023__21_36_14\07_09_2023__21_36_14.pkl at 07_10_2023__01_16_21
