<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Small-data-set-demo" data-toc-modified-id="Small-data-set-demo-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Small data set demo</a></span><ul class="toc-item"><li><span><a href="#Import-packages" data-toc-modified-id="Import-packages-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import packages</a></span></li><li><span><a href="#Simulate-data-set" data-toc-modified-id="Simulate-data-set-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Simulate data set</a></span></li><li><span><a href="#Covert-to-positive-unlabeled-data-set" data-toc-modified-id="Covert-to-positive-unlabeled-data-set-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Covert to positive-unlabeled data set</a></span></li><li><span><a href="#Train-PURF-model" data-toc-modified-id="Train-PURF-model-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Train PURF model</a></span></li><li><span><a href="#Examine-the-model" data-toc-modified-id="Examine-the-model-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Examine the model</a></span></li><li><span><a href="#Session-info" data-toc-modified-id="Session-info-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Session info</a></span></li></ul></li></ul></div>

# Small data set demo

## Import packages

In [1]:
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
from purf.pu_ensemble import PURandomForestClassifier
import pickle
import time

## Simulate data set

In [2]:
# Settings for the synthetic binary problem: 5000 samples, 300 features
# (250 informative, 40 redundant, 10 repeated), one cluster per class.
simulation_settings = dict(
    n_samples=5000,
    n_features=300,
    n_informative=250,
    n_redundant=40,
    n_repeated=10,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2,
    random_state=1,
)
X, y = make_classification(**simulation_settings)

# Wrap the feature matrix in a DataFrame; later cells rely on X.index / X.columns.
X = pd.DataFrame(X)

n_points, n_features = X.shape
print('%d data points and %d features' % (n_points, n_features))
print('%d positive out of %d total' % (y.sum(), y.size))

5000 data points and 300 features
2495 positive out of 5000 total


## Convert to positive-unlabeled data set

In [3]:
# Keep the fully labeled ground truth before any positives are hidden.
y_orig = y.copy()
res_ = pd.DataFrame({'label': y})

# Reference random forest fitted on the true labels; each tree draws as many
# samples as the smaller class contains. Its out-of-bag probability for
# class 1 is stored next to the labels.
smaller_class_size = min((y == 0).sum(), (y == 1).sum())
rf = RandomForestClassifier(
    n_estimators=1000,
    max_samples=smaller_class_size,
    oob_score=True,
    n_jobs=-1,
    random_state=30,
)
rf.fit(X, y)
res_['rf'] = rf.oob_decision_function_[:, 1]

# Keep 50 randomly chosen true positives labeled; every other sample
# (99% of the data set) becomes unlabeled (0).
np.random.seed(0)
positive_positions = np.where(y_orig == 1)[0]
labeled_positions = np.random.choice(positive_positions, replace=False, size=50)
y[:] = 0
y[labeled_positions] = 1
res_['pu_label'] = y

print('%d positive out of %d total' % (y.sum(), y.size))
res_.to_csv('./simulation_labels.csv')

50 positive out of 5000 total


## Train PURF model

In [4]:
# Define function
def train_purf(features, outcome, res_path, pickle_path='./tmp.pkl', pos_level=0.5, save_model=True):
    """Train a PURandomForestClassifier and write its out-of-bag scores to CSV.

    Missing feature values are median-imputed, a 1000-tree PU random forest is
    fitted, and a results table is written to ``res_path``. The table is
    indexed by ``protein_id`` (the index of ``features``) and holds the
    per-id mean of ``antigen_label`` and ``OOB score`` (the out-of-bag
    decision value of class column 1), followed by the imputed features.

    Args:
        features: DataFrame of predictors; its index and columns are kept
            on the imputed copy.
        outcome: Labels passed to ``fit`` and stored as ``antigen_label``.
        res_path: Destination of the results CSV.
        pickle_path: Destination of the pickled model (only used when
            ``save_model`` is truthy).
        pos_level: Forwarded unchanged to ``PURandomForestClassifier``.
        save_model: When truthy, pickle the fitted model to ``pickle_path``.

    Returns:
        None. The results are written to disk.
    """
    # Imputation: fill missing values with the per-column median, keeping labels.
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(
        imputer.fit_transform(features),
        index=features.index,
        columns=features.columns,
    )
    y = outcome
    # Training PURF
    purf = PURandomForestClassifier(
        n_estimators = 1000,
        oob_score = True,
        n_jobs = -1,
        random_state = 42,
        pos_level = pos_level
    )
    purf.fit(X, y)
    # Storing results: one row per protein_id, then the imputed features.
    res = pd.DataFrame({'protein_id': X.index, 'antigen_label' : y})
    res['OOB score'] = purf.oob_decision_function_[:,1]
    res = res.groupby('protein_id').mean().merge(X, left_index=True, right_index=True)
    res.to_csv(res_path)
    # Truthiness test rather than `is True`, so flags such as 1 or
    # numpy.bool_(True) are honoured instead of silently skipping the save.
    if save_model:
        with open(pickle_path, 'wb') as out:
            pickle.dump(purf, out, pickle.HIGHEST_PROTOCOL)

# Train model
# Time the run with perf_counter(): unlike time.time(), it is monotonic and
# unaffected by system clock adjustments, so the difference is a valid duration.
st = time.perf_counter()
train_purf(X, y, res_path='./simulation_res.csv')
et = time.perf_counter()

print('Execution time:', round(et - st, 2), 'seconds')

Execution time: 96.82 seconds


## Examine the model

In [5]:
# Reload the model from train_purf's default pickle path. The context manager
# closes the file handle, which the bare open() call left dangling.
# NOTE: unpickling can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open('./tmp.pkl', 'rb') as model_file:
    purf_model = pickle.load(model_file)
print(purf_model)
print(purf_model.__dict__.keys())

PURandomForestClassifier(n_estimators=1000, n_jobs=-1, oob_score=True,
                         random_state=42)
dict_keys(['base_estimator', 'n_estimators', 'estimator_params', 'bootstrap', 'oob_score', 'n_jobs', 'random_state', 'verbose', 'warm_start', 'class_weight', 'max_samples', 'pos_level', 'criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'ccp_alpha', 'n_features_in_', 'n_features_', 'n_outputs_', 'classes_', 'n_classes_', 'base_estimator_', 'estimators_', 'oob_decision_function_', 'oob_score_'])


## Session info

In [6]:
# Display the session report for this notebook run.
# session_info is imported here, at its only point of use.
# NOTE(review): presumably lists package and interpreter versions; confirm against the session_info docs.
import session_info

session_info.show()