In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openml

In [109]:
def getNumMissingPatterns(X):
    """Count the distinct row-wise missingness patterns in X.

    A pattern is the boolean vector telling which columns of a row are null.
    The all-observed pattern (no missing value in the row) is not counted.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose null entries define the patterns.

    Returns
    -------
    int
        Number of distinct patterns containing at least one missing entry.
        Returns 0 for a frame with no rows or no columns.
    """
    mask = X.isnull().to_numpy()
    # Hash the raw bytes of each boolean row instead of building a binary
    # string and parsing it into a big integer; rows without any missing
    # entry are skipped, which also covers frames that have no columns.
    patterns = {row.tobytes() for row in mask if row.any()}
    return len(patterns)
    
def cat2Dummies(df, cat_cols):
    """One-hot encode the columns listed in cat_cols and return the new frame.

    Each encoded column gets an extra indicator for missing values
    (dummy_na=True), its first level is dropped (drop_first=True), and the
    indicator columns are stored as float64.
    """
    encode_opts = dict(dummy_na=True, drop_first=True, dtype=np.float64)
    return pd.get_dummies(df, columns=cat_cols, **encode_opts)

def dropColumns(df, cols):
    """Return a copy of df without the columns named in cols (df is not modified)."""
    return df.drop(labels=cols, axis='columns')

def dropUninformative(df):
    """Keep only the columns of df whose standard deviation is strictly positive.

    Columns with zero spread are removed; columns whose standard deviation is
    NaN fail the `> 0` comparison and are removed as well.
    """
    has_spread = df.std().gt(0)
    return df.loc[:, has_spread]

def printMissingness(df):
    """Print three missingness summaries for df.

    Reports the share of missing cells, the share of rows with at least one
    missing cell, and the number of distinct missingness patterns compared
    with the 2**(number of columns) patterns that are possible.
    """
    na_mask = df.isna()
    n_rows, n_cols = df.shape

    frac_cells_missing = na_mask.sum().sum()/df.size
    print(f'Percent missing: {frac_cells_missing:.2%}')

    frac_rows_missing = na_mask.any(axis=1).sum()/n_rows
    print(f'Percent of observations with at least one missing feature: {frac_rows_missing:.2%}')

    n_patterns = getNumMissingPatterns(df)
    print(f'Number of unique missing patterns: {n_patterns} out of {2**n_cols:,} possible patterns.')

# R-DATASETS

### Chile dataset
- source: [CRAN](https://rdrr.io/cran/carData/man/Chile.html)
- data about voting intentions of Chileans in 1988 elections
- multi-class target of Yes, No, Undecided and Abstain transformed to YES vs. rest classification

In [393]:
# load data (forward slashes: portable across OSes and free of invalid
# backslash escape sequences such as '\c')
chile = pd.read_csv('datasets/chile.csv')

# drop rows with missing target
chile.dropna(subset=['vote'], inplace=True)

# process target to binary classification YES vs. REST
y = chile['vote']
y = y.apply(lambda x: 1.0 if x == 'Y' else 0.0)

# process categorical features
chile.drop(columns=['rownames', 'vote'], inplace=True)
cat_cols = ['region', 'sex', 'education']
chile = cat2Dummies(chile, cat_cols)
chile = chile.astype(np.float64)

# drop non-informative features
chile = dropUninformative(chile)

# append target (aligned on the row index) and save to csv
chile['y'] = y
chile.to_csv('processed_data/chile.csv', index=False)

# print missingness information
printMissingness(chile)


Percent missing: 0.29%
Percent of observations with at least one missing feature: 3.63%
Number of unique missing patterns: 4 out of 8,192 possible patterns.


### Schooling
- source: [Ecdat](https://rdrr.io/cran/Ecdat/man/Schooling.html)
- data from 1976 about wages and schooling of young men in US
- target is log wage, features are about the individual's education

In [394]:
# load data (forward slashes: portable across OSes and free of invalid
# backslash escape sequences such as '\s')
schooling = pd.read_csv('datasets/schooling.csv')

# process target
y = schooling['lwage76']

# process categorical features: remove the id column and both wage columns,
# then one-hot encode every object-typed column
drop_cols = ['rownames', 'wage76', 'lwage76']
schooling = dropColumns(schooling, drop_cols)

cat_cols = schooling.select_dtypes(include=['object']).columns
schooling = cat2Dummies(schooling, cat_cols)
schooling = schooling.astype(np.float64)

# drop non-informative features
schooling = dropUninformative(schooling)

# append target and save to csv
schooling['y'] = y
schooling.to_csv('processed_data/schooling.csv', index=False)

# print missingness information
printMissingness(schooling)

Percent missing: 1.00%
Percent of observations with at least one missing feature: 32.23%
Number of unique missing patterns: 3 out of 8,589,934,592 possible patterns.


### Schooling 2
- source: [Ecdat](https://rdrr.io/cran/Ecdat/man/RetSchool.html)
- another dataset on schooling and wages
- target is log wage

In [395]:
# read the raw table
retschool = pd.read_csv('datasets/retschool.csv')

# keep only rows with an observed target, then pull the target out
retschool = retschool.dropna(subset=['wage76'])
y = retschool['wage76']

# remove the id and target columns, then one-hot encode the categorical features
drop_cols = ['rownames', 'wage76']
cat_cols = ['black', 'south76', 'smsa76', 'region', 'smsa66', 'momdad14', 'sinmom14', 'nodaded', 'nomomed', 'famed', 'col4']
retschool = cat2Dummies(dropColumns(retschool, drop_cols), cat_cols)

# discard columns without positive standard deviation
retschool = dropUninformative(retschool)

# re-attach the target (aligned on the row index) and write the result
retschool['y'] = y
retschool.to_csv('processed_data/retschool.csv', index=False)

# report missingness of the processed frame
printMissingness(retschool)

Percent missing: 0.04%
Percent of observations with at least one missing feature: 0.62%
Number of unique missing patterns: 1 out of 2,147,483,648 possible patterns.


# UCI REPOSITORY

### Wiki
- source: [UCI dataset repository](https://archive.ics.uci.edu/dataset/334/wiki4he)
- survey of Spanish university employees regarding their use and perception of Wikipedia
- mix of categorical, continuous and 5-point likert scale variables
- the target is a binary *would/would not recommend using Wikipedia to their students*

In [117]:
# load data (forward slashes: portable across OSes and free of invalid
# backslash escape sequences such as '\w')
wiki = pd.read_csv('datasets/wiki4HE.csv', sep=';')

# process categorical features ('?' marks a missing answer in the raw file)
wiki.replace('?', np.nan, inplace=True)
cat_cols = ['GENDER', 'DOMAIN', 'PhD', 'UNIVERSITY', 'UOC_POSITION', 'OTHER_POSITION', 'OTHERSTATUS', 'USERWIKI']
wiki = cat2Dummies(wiki, cat_cols)
wiki = wiki.astype(np.float64)

# drop non-informative features
wiki = dropUninformative(wiki)

# process target (recommends Wiki to students vs. doesn't recommend):
# answers 4 and 5 on 'Use3' count as "recommends"
wiki.dropna(subset=['Use3'], inplace=True)
y = wiki['Use3']
y = y.apply(lambda x: 1.0 if x in [5, 4] else 0.0)
# all 'Use*' items are removed from the features, not only the target item
drop_cols = ['Use1', 'Use2', 'Use3', 'Use4', 'Use5']
wiki = dropColumns(wiki, drop_cols)

# append target and save to csv
wiki['y'] = y
wiki.to_csv('processed_data/yes_miss/wiki.csv', index=False)

# print missingness information
printMissingness(wiki)


Percent missing: 1.52%
Percent of observations with at least one missing feature: 31.31%
Number of unique missing patterns: 157 out of 147,573,952,589,676,412,928 possible patterns.


### Support2 Dataset
- source: [UCI Repository](https://archive.ics.uci.edu/dataset/880/support2)
- predict (binary) death from patient information

In [118]:
# load data
support = pd.read_excel('datasets/support.xls', engine='xlrd')

# process target
y = support['death']
y = y.astype(np.float64)

# process categorical features: income brackets are mapped to ordered integer
# codes; labels not present in the mapping become NaN
support['income'] = support['income'].map({'under $11k': 0, '$11-$25k': 1, '$25-$50k': 2, '>$50k': 3})
cat_cols = ['sex', 'dzgroup', 'dzclass', 'race', 'sfdm2']
drop_cols = ['hospdead', 'death']
support = dropColumns(support, drop_cols)
support = cat2Dummies(support, cat_cols)
support = support.astype(np.float64)

# drop non-informative features
support = dropUninformative(support)

# append target and save to csv (forward slashes: portable across OSes and
# free of invalid backslash escape sequences such as '\y')
support['y'] = y
support.to_csv('processed_data/yes_miss/support.csv', index=False)

# print missingness information
printMissingness(support)


*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
Percent missing: 9.30%
Percent of observations with at least one missing feature: 96.60%
Number of unique missing patterns: 310 out of 1,125,899,906,842,624 possible patterns.


# KAGGLE

### Housing Price dataset
- source: [Kaggle](https://www.kaggle.com/competitions/home-data-for-ml-course/data?select=train.csv)
- predict sales price based on features of the house

In [398]:
# load data
housing = pd.read_csv('datasets/housing.csv')

# process target (astype returns a new object, so the result must be assigned)
y = housing['SalePrice']
y = y.astype(np.float64)

# process categorical features
drop_cols = ['Id', 'SalePrice']
housing = dropColumns(housing, drop_cols)
cat_cols = housing.select_dtypes(include=['object']).columns
housing = cat2Dummies(housing, cat_cols)
# same reason as above: keep the converted frame instead of discarding it
housing = housing.astype(np.float64)

# drop non-informative features
housing = dropUninformative(housing)

# append target and save to csv
housing['y'] = y
housing.to_csv('processed_data/housing.csv', index=False)

# print missingness information
printMissingness(housing)

Percent missing: 0.09%
Percent of observations with at least one missing feature: 23.22%
Number of unique missing patterns: 5 out of 3,705,346,855,594,118,253,554,271,520,278,013,051,304,639,509,300,498,049,262,642,688,253,220,148,477,952 possible patterns.


### Identifying Age-Related Conditions
- source: [Kaggle](https://www.kaggle.com/competitions/icr-identify-age-related-conditions/data?select=train.csv)
- predict if patient has been identified with some condition or not (binary classification)

In [413]:
# read the raw table
icr = pd.read_csv('datasets/icr.csv')

# extract the target as float64
y = icr['Class'].astype(np.float64)

# remove the target and the id column from the features
icr = dropColumns(icr, ['Class', 'Id'])

# one-hot encode every object-typed column and cast the whole frame to float64
cat_cols = icr.select_dtypes(include=['object']).columns
icr = cat2Dummies(icr, cat_cols).astype(np.float64)

# discard columns without positive standard deviation
icr = dropUninformative(icr)

# re-attach the target and write the result
icr['y'] = y
icr.to_csv('processed_data/icr.csv', index=False)

# report missingness of the processed frame
printMissingness(icr)

Percent missing: 0.37%
Percent of observations with at least one missing feature: 11.18%
Number of unique missing patterns: 7 out of 144,115,188,075,855,872 possible patterns.


# OPEN-ML DATASETS

In [119]:
import openml
def loadDataset(dataset_id):
    """Fetch an OpenML dataset by id and split it into features and target.

    The dataset's default target attribute is used as the target and the data
    is requested in dataframe format.

    Returns the 4-tuple produced by `dataset.get_data`:
    (X, y, cat_indicator, feature_names).
    """
    dataset = openml.datasets.get_dataset(
        dataset_id,
        download_data=True,
        download_qualities=False,
        download_features_meta_data=False,
    )
    target_name = dataset.default_target_attribute
    X, y, cat_indicator, feature_names = dataset.get_data(
        target=target_name,
        dataset_format='dataframe',
    )
    return X, y, cat_indicator, feature_names

### Hypothyroid
- source: [OpenML id: 1000](https://www.openml.org/search?type=data&status=active&id=1000)
- binary classification of thyroid disease 

In [400]:
# fetch features and target from OpenML
hypothyroid, y, cat_indicator, feature_names = loadDataset(1000)

# binarize the target: label 'P' -> 1.0, anything else -> 0.0
y = y.apply(lambda label: 1.0 if label == 'P' else 0.0)

# one-hot encode the columns flagged as categorical, then discard columns
# without positive standard deviation
cat_cols = np.array(feature_names)[np.array(cat_indicator)]
hypothyroid = dropUninformative(cat2Dummies(hypothyroid, cat_cols))

# re-attach the target and write the result
hypothyroid['y'] = y
hypothyroid.to_csv('processed_data/hypothyroid.csv', index=False)

# report missingness of the processed frame
printMissingness(hypothyroid)

Percent missing: 1.77%
Percent of observations with at least one missing feature: 27.07%
Number of unique missing patterns: 16 out of 4,294,967,296 possible patterns.


### Salaries
-  source: [OpenML id: 488](https://www.openml.org/search?type=data&status=active&id=488)
- binary prediction of two types of universities based on the salaries of their staff

In [401]:
# fetch features and target from OpenML
salaries, y, cat_indicator, feature_names = loadDataset(488)

# binarize the target: labels 'I' and 'IIA' -> 1.0, anything else -> 0.0
positive_labels = ['I', 'IIA']
y = y.apply(lambda label: 1.0 if label in positive_labels else 0.0)

# the 'State' column is removed rather than encoded
salaries = dropColumns(salaries, ['State'])

# re-attach the target and write the result
salaries['y'] = y
salaries.to_csv('processed_data/salaries.csv', index=False)

# report missingness of the processed frame
printMissingness(salaries)

Percent missing: 1.57%
Percent of observations with at least one missing feature: 7.49%
Number of unique missing patterns: 7 out of 16,384 possible patterns.


### Cirrhosis
- source: [OpenML id: 802](https://www.openml.org/search?type=data&status=active&id=802)
- predict liver cirrhosis from patient data

In [120]:
# load data
cirrhosis, y, cat_indicator, feature_names = loadDataset(802)

# process target
y = y.apply(lambda x: 1.0 if x == 'P' else 0.0)

# process categorical features: 'day' is converted to numeric (unparseable
# entries become NaN) and therefore excluded from the one-hot encoded columns
cirrhosis['day'] = pd.to_numeric(cirrhosis['day'], errors='coerce')
cat_cols = np.array(feature_names)[np.array(cat_indicator)]
cat_cols = np.delete(cat_cols, np.where(cat_cols == 'day'))
cirrhosis = cat2Dummies(cirrhosis, cat_cols)

# drop non-informative features
cirrhosis = dropUninformative(cirrhosis)

# append target and save to csv (forward slashes: portable across OSes and
# free of invalid backslash escape sequences such as '\y')
cirrhosis['y'] = y
cirrhosis.to_csv('processed_data/yes_miss/cirrhosis.csv', index=False)

# print missingness information
printMissingness(cirrhosis)

Percent missing: 2.96%
Percent of observations with at least one missing feature: 57.02%
Number of unique missing patterns: 9 out of 4,194,304 possible patterns.


### Ottawa Real Estate
- source: [OpenML id: 43417](https://www.openml.org/search?type=data&status=active&id=43417)
- predict price of Ottawa's real estate

In [442]:
# fetch the table from OpenML
ottawa, y, cat_indicator, feature_names = loadDataset(43417)

# build the target from the 'price' column: drop rows without a price, strip
# the thousands separators and parse to float (unparseable values become NaN)
ottawa = ottawa.dropna(subset=['price'])
price_text = ottawa['price'].str.replace(',', '')
y = pd.to_numeric(price_text, errors='coerce').astype(float)

# remove the target from the features and one-hot encode the categoricals
drop_cols = ['price']
ottawa = dropColumns(ottawa, drop_cols)
cat_cols = ['propertyType', 'style']
ottawa = cat2Dummies(ottawa, cat_cols)

# discard columns without positive standard deviation
ottawa = dropUninformative(ottawa)

# re-attach the target and write the result
ottawa['y'] = y
ottawa.to_csv('processed_data/ottawa.csv', index=False)

# report missingness of the processed frame
printMissingness(ottawa)

Percent missing: 5.78%
Percent of observations with at least one missing feature: 59.53%
Number of unique missing patterns: 21 out of 1,073,741,824 possible patterns.


### Polish Bankruptcy
- source: [OpenML id: 42987](https://www.openml.org/search?type=data&status=active&id=42987)
- predict bankruptcy of Polish firms from numerical variables only

In [404]:
# fetch the table from OpenML
bankruptcy, y, cat_indicator, feature_names = loadDataset(42987)

# the target is taken from the 'class' column of the feature frame (this
# replaces the y returned by the loader) and then removed from the features
y = bankruptcy['class'].astype(np.float64)
bankruptcy = dropColumns(bankruptcy, ['class'])

# re-attach the target and write the result
bankruptcy['y'] = y
bankruptcy.to_csv('processed_data/bankruptcy.csv', index=False)

# report missingness of the processed frame
printMissingness(bankruptcy)

Percent missing: 1.21%
Percent of observations with at least one missing feature: 48.71%
Number of unique missing patterns: 83 out of 36,893,488,147,419,103,232 possible patterns.


### Home Equity Default
- source: [OpenML id: 43337](https://www.openml.org/search?type=data&status=active&id=43337)
- binary prediction of defaulting on home equity

In [121]:
# load data
equity, y, cat_indicator, feature_names = loadDataset(43337)

# process target (taken from the 'BAD' column of the feature frame)
y = equity.BAD.astype(np.float64)

# process categorical features
equity = dropColumns(equity, ['BAD'])
cat_cols = ['REASON', 'JOB']
equity = cat2Dummies(equity, cat_cols)
equity = equity.astype(np.float64)

# drop non-informative features
equity = dropUninformative(equity)

# append target and save to csv (forward slashes: portable across OSes and
# free of invalid backslash escape sequences such as '\y' and '\e')
equity['y'] = y
equity.to_csv('processed_data/yes_miss/equity.csv', index=False)

# print missingness information
printMissingness(equity)

Percent missing: 4.19%
Percent of observations with at least one missing feature: 41.02%
Number of unique missing patterns: 74 out of 524,288 possible patterns.


### Fico Credit Scoring
- source: [Open ML id: 45554](https://www.openml.org/search?type=data&status=active&id=45554&sort=runs)
- predict GOOD/BAD credit score

In [123]:
# fetch features and target from OpenML
fico, y, cat_indicator, feature_names = loadDataset(45554)

# binarize the target: label 'Good' -> 1.0, anything else -> 0.0
y = y.apply(lambda label: 1.0 if label == 'Good' else 0.0)

# one-hot encode the columns flagged as categorical and cast to float64
cat_cols = np.array(feature_names)[np.array(cat_indicator)]
fico = cat2Dummies(fico, cat_cols).astype(np.float64)

# discard columns without positive standard deviation
fico = dropUninformative(fico)

# re-attach the target and write the result
fico['y'] = y
fico.to_csv('processed_data/yes_miss/fico.csv', index=False)

# report missingness of the processed frame
printMissingness(fico)

Percent missing: 3.37%
Percent of observations with at least one missing feature: 74.65%
Number of unique missing patterns: 74 out of 274,877,906,944 possible patterns.


### Mice Protein Expression
- source: [OpenML id: 43445](https://www.openml.org/search?type=data&status=active&id=43445)
- expression of 77 proteins in mice' cerebral cortex used for binary prediction of control vs. treatment

In [407]:
# fetch the table from OpenML
mice, y, cat_indicator, feature_names = loadDataset(43445)

# binarize the target from the 'class' column: 1.0 when characters 2-3 of the
# label (zero-based slice [2:4]) equal 'CS', otherwise 0.0
y = mice['class'].apply(lambda label: 1.0 if label[2:4] == 'CS' else 0.0)

# remove the class label and the three descriptor columns from the features
mice = dropColumns(mice, ['Genotype', 'Treatment', 'Behavior', 'class'])

# re-attach the target and write the result
mice['y'] = y
mice.to_csv('processed_data/mice.csv', index=False)

# report missingness of the processed frame
printMissingness(mice)

Percent missing: 1.66%
Percent of observations with at least one missing feature: 48.89%
Number of unique missing patterns: 26 out of 302,231,454,903,657,293,676,544 possible patterns.


# Data descriptive statistics

In [128]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

directory = 'processed_data/yes_miss'
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries; keep only CSV files and sort them so the summary is reproducible
datasets = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
columns=['dataset', 'n', 'd', 'd/n', 'frac_missing', 'frac_n_with_missing', 'missing_patterns', 'frac_missing_patterns']
descriptive_stats = []

# one summary row per processed dataset; n and d are the row and column counts
# of the saved file, so d includes every column written to the csv
for dataset in datasets:
    df = pd.read_csv(f'{directory}/{dataset}')
    n, d = df.shape
    d_miss = df.isna().sum().sum()/df.size            # share of missing cells
    d_obs_miss = df.isna().any(axis=1).sum()/n        # share of rows with a missing cell
    miss_patterns = getNumMissingPatterns(df)
    frac_miss_patterns = miss_patterns/(2**d)         # observed vs. possible patterns
    # strip the extension properly instead of slicing off four characters
    dataset_name = os.path.splitext(dataset)[0]
    descriptive_stats.append([dataset_name, n, d, d/n, d_miss, d_obs_miss, miss_patterns, frac_miss_patterns])

In [136]:
descriptive_stats = pd.DataFrame(descriptive_stats, columns=columns)
# single explicit multi-key sort: frac_missing first, d/n as tie-breaker
# (two consecutive sort_values calls do not guarantee the first ordering
# survives, because the default sort algorithm is not stable)
descriptive_stats = descriptive_stats.sort_values(by=['frac_missing', 'd/n'], ascending=False)
# forward slashes: portable across OSes and free of invalid backslash escapes
descriptive_stats.to_csv('results/csvs/descriptive_statistics_yes_miss_datasets.csv', index=False)
descriptive_stats

Unnamed: 0,dataset,n,d,d/n,frac_missing,frac_n_with_missing,missing_patterns,frac_missing_patterns
3,support,1000,50,0.05,0.093,0.966,310,2.753353e-13
1,equity,5960,19,0.003188,0.041858,0.410235,74,0.0001411438
2,fico,9871,38,0.00385,0.033706,0.74653,74,2.692104e-10
0,cirrhosis,1945,22,0.011311,0.029586,0.57018,9,2.145767e-06
4,wiki,904,67,0.074115,0.015157,0.313053,157,1.063873e-18


# Data W/O Missing Values

In [138]:
# OpenML dataset ids for the datasets used in the cells below, keyed by dataset name
openML_ids = dict(
    philippine=41145,
    christine=41142,
    phoneme=1489,
    wine_quality=287,
    airfoil=43919,
)

### Philippine

In [162]:
# fetch features and target from OpenML
X, y, cat_indicator, feature_names = loadDataset(openML_ids['philippine'])

# target as float64
y = y.astype(np.float64)

# --- MCAR: a random 33% of the rows each lose a random 33% of the columns ---
np.random.seed(101010)
n, p = X.shape
X_missing_MCAR = X.copy()
n_miss_rows, n_miss_cols = int(0.33*n), int(0.33*p)
miss_instances = np.random.choice(X.index, size=n_miss_rows, replace=False)
for instance in miss_instances:
    # a fresh set of columns is drawn for every affected row
    miss_features = np.random.choice(X.columns, size=n_miss_cols, replace=False)
    X_missing_MCAR.loc[instance, miss_features] = np.nan

# --- MNAR: a random 20% of the columns lose the values outside their
# --- interquartile range (below the 25% or above the 75% quantile)
np.random.seed(101010)
n, p = X.shape
X_missing_MNAR = X.copy()
miss_features = np.random.choice(X.columns, size=int(0.2*p), replace=False)
for feature in miss_features:
    lower, upper = X[feature].quantile([0.25, 0.75])
    outside_iqr = (X[feature] < lower) | (X[feature] > upper)
    miss_instances = X.index[outside_iqr.to_numpy()]
    X_missing_MNAR.loc[miss_instances, feature] = np.nan

# attach the target to the complete frame and to both masked variants
for frame in (X, X_missing_MCAR, X_missing_MNAR):
    frame['y'] = y

# write all three variants
X.to_csv('processed_data/no_miss/philippine.csv', index=False)
X_missing_MCAR.to_csv('processed_data/no_miss/philippine_MCAR.csv', index=False)
X_missing_MNAR.to_csv('processed_data/no_miss/philippine_MNAR.csv', index=False)


### Christine

In [152]:
# fetch features and target from OpenML
X, y, cat_indicator, feature_names = loadDataset(openML_ids['christine'])

# target as float64
y = y.astype(np.float64)

# columns not flagged as categorical; only these are ever masked
numeric_cols = X.columns[~np.array(cat_indicator)]

# --- MCAR: a random 33% of the rows each lose a random set of numeric columns
# NOTE(review): the number of masked columns is int(0.33*p) where p counts ALL
# columns, while the draw is restricted to numeric columns - confirm intended.
np.random.seed(101010)
n, p = X.shape
X_missing_MCAR = X.copy()
n_miss_rows, n_miss_cols = int(0.33*n), int(0.33*p)
miss_instances = np.random.choice(X.index, size=n_miss_rows, replace=False)
for instance in miss_instances:
    # a fresh set of columns is drawn for every affected row
    miss_features = np.random.choice(numeric_cols, size=n_miss_cols, replace=False)
    X_missing_MCAR.loc[instance, miss_features] = np.nan

# --- MNAR: randomly chosen numeric columns lose the values outside their
# --- interquartile range (below the 25% or above the 75% quantile)
# NOTE(review): as above, the number of columns is int(0.2*p) with p counting
# all columns, not only the numeric ones - confirm intended.
np.random.seed(101010)
n, p = X.shape
X_missing_MNAR = X.copy()
miss_features = np.random.choice(numeric_cols, size=int(0.2*p), replace=False)
for feature in miss_features:
    lower, upper = X[feature].quantile([0.25, 0.75])
    outside_iqr = (X[feature] < lower) | (X[feature] > upper)
    miss_instances = X.index[outside_iqr.to_numpy()]
    X_missing_MNAR.loc[miss_instances, feature] = np.nan

# attach the target to the complete frame and to both masked variants
for frame in (X, X_missing_MCAR, X_missing_MNAR):
    frame['y'] = y

# write all three variants
X.to_csv('processed_data/no_miss/christine.csv', index=False)
X_missing_MCAR.to_csv('processed_data/no_miss/christine_MCAR.csv', index=False)
X_missing_MNAR.to_csv('processed_data/no_miss/christine_MNAR.csv', index=False)


printMissingness(X_missing_MNAR)

Percent missing: 9.26%
Percent of observations with at least one missing feature: 100.00%
Number of unique missing patterns: 5418 out of 6,110,867,989,448,014,552,298,782,828,208,316,193,809,952,885,676,369,117,472,133,309,891,328,341,254,636,644,354,714,157,002,810,478,547,402,909,205,659,254,601,798,041,839,816,652,081,811,650,677,690,905,917,381,732,887,612,552,813,375,176,895,148,575,684,957,849,535,635,099,785,528,429,764,712,843,434,636,874,594,921,661,022,703,043,781,268,266,332,365,112,500,876,034,108,471,429,975,847,694,022,676,972,136,745,059,017,077,493,834,603,591,563,046,655,596,486,594,803,473,706,869,542,019,927,630,658,780,868,329,467,859,090,412,022,547,341,819,128,775,067,876,977,713,139,580,394,765,119,898,285,824,332,940,707,170,586,319,251,168,872,337,843,945,472 possible patterns.


### Phoneme

In [153]:
# fetch features and target from OpenML
X, y, cat_indicator, feature_names = loadDataset(openML_ids['phoneme'])

# binarize the target: label '1' -> 1.0, anything else -> 0.0, stored as float64
y = y.map(lambda label: 1.0 if label == '1' else 0.0).astype(np.float64)

# --- MCAR: a random 33% of the rows each lose a random 33% of the columns ---
np.random.seed(101010)
n, p = X.shape
X_missing_MCAR = X.copy()
n_miss_rows, n_miss_cols = int(0.33*n), int(0.33*p)
miss_instances = np.random.choice(X.index, size=n_miss_rows, replace=False)
for instance in miss_instances:
    # a fresh set of columns is drawn for every affected row
    miss_features = np.random.choice(X.columns, size=n_miss_cols, replace=False)
    X_missing_MCAR.loc[instance, miss_features] = np.nan

# --- MNAR: a random 20% of the columns lose the values outside their
# --- interquartile range (below the 25% or above the 75% quantile)
np.random.seed(101010)
n, p = X.shape
X_missing_MNAR = X.copy()
miss_features = np.random.choice(X.columns, size=int(0.2*p), replace=False)
for feature in miss_features:
    lower, upper = X[feature].quantile([0.25, 0.75])
    outside_iqr = (X[feature] < lower) | (X[feature] > upper)
    miss_instances = X.index[outside_iqr.to_numpy()]
    X_missing_MNAR.loc[miss_instances, feature] = np.nan

# attach the target to the complete frame and to both masked variants
for frame in (X, X_missing_MCAR, X_missing_MNAR):
    frame['y'] = y

# write all three variants
X.to_csv('processed_data/no_miss/phoneme.csv', index=False)
X_missing_MCAR.to_csv('processed_data/no_miss/phoneme_MCAR.csv', index=False)
X_missing_MNAR.to_csv('processed_data/no_miss/phoneme_MNAR.csv', index=False)

### wine_quality

In [154]:
# fetch features and target from OpenML
X, y, cat_indicator, feature_names = loadDataset(openML_ids['wine_quality'])

# target as float64
y = y.astype(np.float64)

# --- MCAR: a random 33% of the rows each lose a random 33% of the columns ---
np.random.seed(101010)
n, p = X.shape
X_missing_MCAR = X.copy()
n_miss_rows, n_miss_cols = int(0.33*n), int(0.33*p)
miss_instances = np.random.choice(X.index, size=n_miss_rows, replace=False)
for instance in miss_instances:
    # a fresh set of columns is drawn for every affected row
    miss_features = np.random.choice(X.columns, size=n_miss_cols, replace=False)
    X_missing_MCAR.loc[instance, miss_features] = np.nan

# --- MNAR: a random 20% of the columns lose the values outside their
# --- interquartile range (below the 25% or above the 75% quantile)
np.random.seed(101010)
n, p = X.shape
X_missing_MNAR = X.copy()
miss_features = np.random.choice(X.columns, size=int(0.2*p), replace=False)
for feature in miss_features:
    lower, upper = X[feature].quantile([0.25, 0.75])
    outside_iqr = (X[feature] < lower) | (X[feature] > upper)
    miss_instances = X.index[outside_iqr.to_numpy()]
    X_missing_MNAR.loc[miss_instances, feature] = np.nan

# attach the target to the complete frame and to both masked variants
for frame in (X, X_missing_MCAR, X_missing_MNAR):
    frame['y'] = y

# write all three variants
X.to_csv('processed_data/no_miss/wine_quality.csv', index=False)
X_missing_MCAR.to_csv('processed_data/no_miss/wine_quality_MCAR.csv', index=False)
X_missing_MNAR.to_csv('processed_data/no_miss/wine_quality_MNAR.csv', index=False)

### airfoil

In [155]:
# load data
X, y, cat_indicator, feature_names = loadDataset(openML_ids['airfoil'])

# process target: cast to float64 like the other dataset cells do
# (the original bare `y` expression here had no effect)
y = y.astype(np.float64)

# MCAR: 33% of instances have 33% missing features
np.random.seed(101010)
n, p = X.shape
X_missing_MCAR = X.copy()
miss_instances = np.random.choice(X.index, size=int(0.33*n), replace=False)
for instance in miss_instances:
    # a fresh set of columns is drawn for every affected row
    miss_features = np.random.choice(X.columns, size=int(0.33*p), replace=False)
    X_missing_MCAR.loc[instance, miss_features] = np.nan

# MNAR: 20% of features have upper and lower quartile missing
np.random.seed(101010)
n, p = X.shape
X_missing_MNAR = X.copy()
miss_features = np.random.choice(X.columns, size=int(0.2*p), replace=False)
for feature in miss_features:
    # values strictly below the 25% or strictly above the 75% quantile are removed
    lower, upper = X[feature].quantile([0.25, 0.75])
    miss_instances = X.loc[(X[feature] < lower) | (X[feature] > upper)].index
    X_missing_MNAR.loc[miss_instances, feature] = np.nan

# combine features and target
X['y'] = y
X_missing_MCAR['y'] = y
X_missing_MNAR['y'] = y

# save to csv
X.to_csv('processed_data/no_miss/airfoil.csv', index=False)
X_missing_MCAR.to_csv('processed_data/no_miss/airfoil_MCAR.csv', index=False)
X_missing_MNAR.to_csv('processed_data/no_miss/airfoil_MNAR.csv', index=False)

### Descriptive stats

In [156]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

directory = 'processed_data/no_miss'
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries; keep only CSV files and sort them so the summary is reproducible
datasets = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
columns=['dataset', 'n', 'd', 'd/n', 'frac_missing', 'frac_n_with_missing', 'missing_patterns', 'frac_missing_patterns', 'type_of_missingness']
descriptive_stats = []

# one summary row per processed dataset; n and d are the row and column counts
# of the saved file, so d includes every column written to the csv
for dataset in datasets:
    df = pd.read_csv(f'{directory}/{dataset}')
    n, d = df.shape
    d_miss = df.isna().sum().sum()/df.size            # share of missing cells
    d_obs_miss = df.isna().any(axis=1).sum()/n        # share of rows with a missing cell
    miss_patterns = getNumMissingPatterns(df)
    frac_miss_patterns = miss_patterns/(2**d)         # observed vs. possible patterns
    # the missingness mechanism is read off the file name
    miss_type = 'MCAR' if 'MCAR' in dataset else 'MNAR' if 'MNAR' in dataset else 'none'
    # strip the extension properly instead of slicing off four characters
    dataset_name = os.path.splitext(dataset)[0]
    descriptive_stats.append([dataset_name, n, d, d/n, d_miss, d_obs_miss, miss_patterns, frac_miss_patterns, miss_type])

In [161]:
descriptive_stats = pd.DataFrame(descriptive_stats, columns=columns)
# single explicit multi-key sort: group by missingness type, and within each
# type order by d/n descending (two consecutive sort_values calls do not
# guarantee this, because the default sort algorithm is not stable)
descriptive_stats = descriptive_stats.sort_values(by=['type_of_missingness', 'd/n'], ascending=[True, False])
# forward slashes: portable across OSes and free of invalid backslash escapes
descriptive_stats.to_csv('results/csvs/descriptive_statistics_no_miss_datasets.csv', index=False)

descriptive_stats

Unnamed: 0,dataset,n,d,d/n,frac_missing,frac_n_with_missing,missing_patterns,frac_missing_patterns,type_of_missingness
4,christine_MCAR,5418,1637,0.302141,0.108599,0.329827,1787,0.0,MCAR
7,philippine_MCAR,5832,309,0.052984,0.107833,0.329904,1924,1.844745e-90,MCAR
1,airfoil_MCAR,1503,6,0.003992,0.05489,0.329341,5,0.078125,MCAR
13,wine_quality_MCAR,6497,12,0.001847,0.0825,0.329998,165,0.0402832,MCAR
10,phoneme_MCAR,5404,6,0.00111,0.05499,0.329941,5,0.078125,MCAR
5,christine_MNAR,5418,1637,0.302141,0.092605,1.0,5418,0.0,MNAR
8,philippine_MNAR,5832,309,0.052984,0.097573,1.0,5832,5.591764e-90,MNAR
2,airfoil_MNAR,1503,6,0.003992,0.082834,0.497006,1,0.015625,MNAR
14,wine_quality_MNAR,6497,12,0.001847,0.080101,0.725258,3,0.0007324219,MNAR
11,phoneme_MNAR,5404,6,0.00111,0.083333,0.5,1,0.015625,MNAR
