In [3]:
import numpy as np
import pandas as pd

from sklearn.utils import check_random_state
from sklearn.datasets import fetch_openml 

In [4]:
#https://www.openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=MagicTelescope&id=1120
data = fetch_openml("MagicTelescope", version=1, parser='auto').frame
data["class:"] = data["class:"].cat.codes # categorical class to numerical
iclass=10

In [5]:
data.describe()

Unnamed: 0,fLength:,fWidth:,fSize:,fConc:,fConc1:,fAsym:,fM3Long:,fM3Trans:,fAlpha:,fDist:,class:
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0
mean,53.250154,22.180966,2.825017,0.380327,0.214657,-4.331745,10.545545,0.249726,27.645707,193.818026,0.35163
std,42.364855,18.346056,0.472599,0.182813,0.110511,59.206062,51.000118,20.827439,26.103621,74.731787,0.477492
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826,0.0
25%,24.336,11.8638,2.4771,0.2358,0.128475,-20.58655,-12.842775,-10.849375,5.547925,142.49225,0.0
50%,37.1477,17.1399,2.7396,0.35415,0.1965,4.01305,15.3141,0.6662,17.6795,191.85145,0.0
75%,70.122175,24.739475,3.1016,0.5037,0.285225,24.0637,35.8378,10.946425,45.88355,240.563825,1.0
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561,1.0


Create semi-supervised data using (stratified) random selection (MCAR):

In [12]:
from sklearn.model_selection import train_test_split

def make_unlabeled_random_strafied(data, iclass, proportion, random_state=42):
    if not (0 < proportion <= 1):
        raise ValueError("The proportion of labeled samples must be a float between 0 and 1.")
    
    # Split the dataset into labeled and unlabeled subsets
    labeled_data, unlabeled_data = train_test_split(
        data,
        stratify=data.iloc[:,iclass],
        train_size=proportion,
        random_state=random_state
    )
    
    # Replace the class values of the unlabeled data with NaN
    unlabeled_data = unlabeled_data.copy()
    # Convert to float to allow NaN values
    unlabeled_data = unlabeled_data.astype({unlabeled_data.columns[iclass]: float})
    unlabeled_data.iloc[:,iclass] = np.nan
    
    # Combine the labeled and unlabeled data back
    ss_data = pd.concat([labeled_data, unlabeled_data]).sort_index()

    return ss_data

In [13]:
ss_data = make_unlabeled_random_strafied(data, iclass, 0.1, random_state=32)
ss_data.describe()

Unnamed: 0,fLength:,fWidth:,fSize:,fConc:,fConc1:,fAsym:,fM3Long:,fM3Trans:,fAlpha:,fDist:,class:
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,1902.0
mean,53.250154,22.180966,2.825017,0.380327,0.214657,-4.331745,10.545545,0.249726,27.645707,193.818026,0.351735
std,42.364855,18.346056,0.472599,0.182813,0.110511,59.206062,51.000118,20.827439,26.103621,74.731787,0.477637
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826,0.0
25%,24.336,11.8638,2.4771,0.2358,0.128475,-20.58655,-12.842775,-10.849375,5.547925,142.49225,0.0
50%,37.1477,17.1399,2.7396,0.35415,0.1965,4.01305,15.3141,0.6662,17.6795,191.85145,0.0
75%,70.122175,24.739475,3.1016,0.5037,0.285225,24.0637,35.8378,10.946425,45.88355,240.563825,1.0
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561,1.0


Create semi-supervised data using per-label sampling with different proportions (MAR):

In [16]:
def make_unlabeled_random_per_label(data, iclass, proportions, random_state=42):
    if not all(0 < p <= 1 for p in proportions.values()):
        raise ValueError("The proportion of labeled samples must always be a float between 0 and 1.")

    ss_subsets = []
    rng = check_random_state(random_state)

    for label, prop in proportions.items():
        # Keep only samples with `label`
        data_with_label = data[data.iloc[:,iclass] == label]
        
        # Stratified split for the current class
        labeled_data, unlabeled_data = train_test_split(
            data_with_label,
            train_size=prop,
            random_state=rng
        )
        
        # Replace the class values of the unlabeled data with NaN
        unlabeled_data = unlabeled_data.copy()
        unlabeled_data = unlabeled_data.astype({unlabeled_data.columns[iclass]: float})
        unlabeled_data.iloc[:,iclass] = np.nan
        
        # Combine labeled and unlabeled data for this subset
        ss_subsets.append(pd.concat([labeled_data, unlabeled_data]))
    
    # Combine the subsets from each label back
    ss_data = pd.concat(ss_subsets).sort_index()
    return ss_data

In [17]:
ss_data = make_unlabeled_random_per_label(data, iclass, {0:0.08,1:0.3}, random_state=42)
ss_data.describe()

Unnamed: 0,fLength:,fWidth:,fSize:,fConc:,fConc1:,fAsym:,fM3Long:,fM3Trans:,fAlpha:,fDist:,class:
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,2992.0
mean,53.250154,22.180966,2.825017,0.380327,0.214657,-4.331745,10.545545,0.249726,27.645707,193.818026,0.670455
std,42.364855,18.346056,0.472599,0.182813,0.110511,59.206062,51.000118,20.827439,26.103621,74.731787,0.470127
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826,0.0
25%,24.336,11.8638,2.4771,0.2358,0.128475,-20.58655,-12.842775,-10.849375,5.547925,142.49225,0.0
50%,37.1477,17.1399,2.7396,0.35415,0.1965,4.01305,15.3141,0.6662,17.6795,191.85145,1.0
75%,70.122175,24.739475,3.1016,0.5037,0.285225,24.0637,35.8378,10.946425,45.88355,240.563825,1.0
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561,1.0


Create semi-supervised data when the reason for a missing value is encoded somehow by the features (MNAR):

In [18]:
def make_unlabeled_random_feature_dependent(data, iclass, proportion, labeling_policy, random_state):
    # Validate the labeling policy
    if not callable(labeling_policy):
        raise ValueError("labeling_policy must be a callable function.")
    
    # Apply the labeling policy: get a (sort of) probability per sample
    probabilities = data.apply(lambda x: labeling_policy(x), axis=1)

    print(probabilities.mean())
    scaling_factor = proportion / probabilities.mean()
    probabilities = np.clip(probabilities * scaling_factor, 0, 1)
    
    # Determines samples that remain labeled
    rng = check_random_state(random_state)
    is_labeled = rng.random(len(data)) < probabilities
    
    # Set class value for unlabeled samples to NaN
    ss_data = data.copy()
    ss_data.iloc[~is_labeled, iclass] = np.nan
    
    return ss_data

In [19]:
from sklearn.linear_model import LogisticRegression
rng = check_random_state(31)

# we will be using an LR model trained on a subset of features
prop_features = 0.4 # proportion of features to keep (randomly chosen)
lr_model = LogisticRegression(random_state=rng)
fss = list(rng.choice([col for col in data.columns if col != iclass], 
                      size=int((len(data.columns)-1)*prop_features), 
                      replace=False))
lr_model.fit(data[fss].values, data.iloc[:,iclass])

# The labeling policy as the probability of the positive class predicted by the model
labeling_policy = lambda x: lr_model.predict_proba(x[fss].values.reshape(1, -1))[0, 1]

ss_data = make_unlabeled_random_feature_dependent(data, iclass, 0.1, labeling_policy, random_state=rng)
ss_data.describe()

0.3516297745325064


Unnamed: 0,fLength:,fWidth:,fSize:,fConc:,fConc1:,fAsym:,fM3Long:,fM3Trans:,fAlpha:,fDist:,class:
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,1859.0
mean,53.250154,22.180966,2.825017,0.380327,0.214657,-4.331745,10.545545,0.249726,27.645707,193.818026,0.563744
std,42.364855,18.346056,0.472599,0.182813,0.110511,59.206062,51.000118,20.827439,26.103621,74.731787,0.496054
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826,0.0
25%,24.336,11.8638,2.4771,0.2358,0.128475,-20.58655,-12.842775,-10.849375,5.547925,142.49225,0.0
50%,37.1477,17.1399,2.7396,0.35415,0.1965,4.01305,15.3141,0.6662,17.6795,191.85145,1.0
75%,70.122175,24.739475,3.1016,0.5037,0.285225,24.0637,35.8378,10.946425,45.88355,240.563825,1.0
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561,1.0
