In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [2]:
def read_uci(dataset, stats=False):
    """Load ``{dataset}.txt`` and return shuffled, min-max scaled features and labels.

    The file is read from the current working directory as a header-less,
    whitespace-separated table. Column 0 is used as the class label and all
    remaining columns as features.

    Parameters
    ----------
    dataset : str
        File name without the ``.txt`` extension.
    stats : bool, default False
        When True, print the dataset name, the feature-matrix shape, the
        number of distinct labels, the min/max class-frequency ratio and the
        per-class counts.

    Returns
    -------
    X, Y
        Feature matrix and int32 label vector, shuffled together with a fixed
        ``random_state=42`` so repeated calls give the same order.

    Raises
    ------
    Whatever ``pd.read_csv`` / ``astype('float64')`` raise for a missing file
    or a non-numeric column; nothing is caught here.
    """
    path = f'{dataset}.txt'
    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True and parses the files identically.
    df = pd.read_csv(path, sep=r'\s+', header=None)
    df = df.astype('float64')
    data = df.values
    X, Y = data[:, 1:], data[:, 0].astype('int32')
    # Files whose labels start at 1 are shifted so that labels start at 0.
    if Y.min() == 1:
        Y -= 1
    # NOTE(review): the scaler is fitted on the full dataset; if X is later
    # split for cross-validation the held-out folds have influenced the
    # scaling -- confirm this is acceptable for the experiments.
    X = MinMaxScaler().fit_transform(X)
    if stats:
        labels, freq = np.unique(Y, return_counts=True)
        print(dataset, X.shape, len(labels), freq.min()/freq.max(), freq)
    return shuffle(X, Y, random_state=42)

In [3]:
# Scan the working directory for dataset files and list the roughly balanced
# ones (imbalance ratio <= 2) as: file name, ratio, number of classes, X.shape.
for f in sorted(os.listdir()):
    if not f.endswith('.txt'):
        continue
    try:
        # splitext strips only the final extension, so a name that contains
        # extra dots still maps back to the file that was actually listed.
        X, Y = read_uci(os.path.splitext(f)[0], False)
    except Exception as e:
        # Best effort: report the unreadable file and keep scanning.
        print("ERROR:", f, e)
        continue

    # Count only the labels that actually occur. np.bincount would add zero
    # counts for unused integers below max(Y) (making the ratio infinite and
    # inflating the class count) and raises on negative labels.
    _, freq = np.unique(Y, return_counts=True)
    ir = freq.max()/freq.min()
    if ir<=2:
        print(f"{f}, {ir:.3f}, {len(freq)}, {X.shape}")

BreastTissue.txt, 1.571, 6, (106, 9)
arrhythmia.txt, 1.295, 2, (420, 278)
australian.txt, 1.248, 2, (690, 14)
breast.txt, 1.684, 2, (569, 30)
bupa.txt, 1.379, 2, (345, 6)
electricity.txt, 1.355, 2, (45312, 14)
heart.txt, 1.250, 2, (270, 13)
ionosphere.txt, 1.786, 2, (351, 34)
iris.txt, 1.000, 3, (150, 4)
letter.txt, 1.108, 26, (20000, 16)
libras.txt, 1.000, 15, (360, 90)
liver.txt, 1.379, 2, (345, 6)
madelon.txt, 1.000, 2, (2600, 500)
mammographic.txt, 1.060, 2, (830, 5)
mfeat.txt, 1.000, 10, (2000, 240)
musk.txt, 1.319, 2, (473, 165)
optdigits.txt, 1.032, 10, (5620, 64)
pendigits.txt, 1.084, 10, (10992, 16)
pima.txt, 1.866, 2, (768, 8)
ring.txt, 1.020, 2, (7400, 20)
segment.txt, 1.000, 7, (2310, 18)
seismic.txt, 1.000, 3, (210, 7)
semeion.txt, 1.045, 10, (1593, 256)
smartphone.txt, 1.000, 6, (180, 66)
sonar.txt, 1.144, 2, (208, 60)
spambase.txt, 1.538, 2, (4601, 57)
twonorm.txt, 1.002, 2, (7400, 20)
vowel.txt, 1.000, 11, (990, 13)
waveform.txt, 1.030, 3, (5000, 21)
wifi.txt, 1.000, 4,

  # This is added back by InteractiveShellApp.init_path()
