In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
def make_data(dist, n_vars=None):
    """Return pd.DataFrame of training data from dist.

    Reads ``../data/stats/{dist}/{dist}_degree.csv`` (keeping only its
    ``PolynomialAdditions`` column) and ``../data/stats/{dist}/{dist}.npy``,
    then derives one row of summary features per sample in the ``.npy`` file.

    Parameters
    ----------
    dist : str
        Name of the distribution directory under ``../data/stats``. Names
        starting with ``'toric'`` are read as a sequence of 2-D arrays, one
        per sample, indexed ``[generator, column]`` (the number of generators
        may differ between samples). Any other name is read as a single 3-D
        array indexed ``[sample, generator, column]``.
    n_vars : int, optional
        Number of leading columns of each generator row that are summed to
        obtain the generator's degree. Defaults to 8 for toric distributions
        and 3 otherwise, which are the values this function used to hard-code.
        NOTE(review): presumably the number of ring variables -- confirm.

    Returns
    -------
    pd.DataFrame
        Columns, in order: ``PolynomialAdditions``, ``Generators`` (generator
        count per sample), ``MaxGeneratorDegree``, ``MinGeneratorDegree``,
        ``StdGeneratorDegree``, ``MeanGeneratorDegree`` (statistics of the
        per-generator degrees) and ``PurePowers`` (fraction of generators with
        exactly one positive entry among the leading ``n_vars`` columns).
    """
    df = pd.read_csv(f'../data/stats/{dist}/{dist}_degree.csv')[['PolynomialAdditions']]
    # SECURITY: allow_pickle=True unpickles objects stored in the file, which can
    # execute arbitrary code. Only load .npy files from a trusted source.
    X = np.load(f'../data/stats/{dist}/{dist}.npy', allow_pickle=True)

    if dist.startswith('toric'):
        if n_vars is None:
            n_vars = 8
        # Slice and sum once per sample instead of once per statistic.
        leading = [x[:, :n_vars] for x in X]
        degrees = [np.sum(lead, axis=-1) for lead in leading]
        df['Generators'] = [x.shape[0] for x in X]
        df['MaxGeneratorDegree'] = [np.max(d) for d in degrees]
        df['MinGeneratorDegree'] = [np.min(d) for d in degrees]
        df['StdGeneratorDegree'] = [np.std(d) for d in degrees]
        df['MeanGeneratorDegree'] = [np.mean(d) for d in degrees]
        # A generator counts as a pure power when exactly one leading entry is positive.
        df['PurePowers'] = [np.sum(np.sum(lead > 0, axis=-1) == 1) for lead in leading]
    else:
        if n_vars is None:
            n_vars = 3
        leading = X[:, :, :n_vars]
        degrees = np.sum(leading, axis=-1)  # indexed [sample, generator]
        df['Generators'] = X.shape[1]
        df['MaxGeneratorDegree'] = np.max(degrees, axis=1)
        df['MinGeneratorDegree'] = np.min(degrees, axis=1)
        df['StdGeneratorDegree'] = np.std(degrees, axis=1)
        df['MeanGeneratorDegree'] = np.mean(degrees, axis=1)
        df['PurePowers'] = np.sum(np.sum(leading > 0, axis=2) == 1, axis=1)

    # Turn the pure-power count into a fraction of the sample's generators.
    df['PurePowers'] = df['PurePowers'] / df['Generators']
    return df

In [3]:
# Distributions to analyse. Each name is passed to make_data, which uses it as a
# directory (and file stem) under ../data/stats; names starting with 'toric' take
# the toric code paths in the cells below.
DISTS = [
    '3-20-10-weighted',
    '3-20-10-uniform',
    '3-20-4-weighted',
    '3-20-4-uniform',
    'toric-2-0-5-8',
    'toric-4-0-5-8',
    'toric-6-0-5-8',
    'toric-6-0-10-8',
]

In [4]:
# Build the feature frame of every distribution once, keyed by distribution name,
# so the fitting and scoring cells can share them.
dfs = dict(zip(DISTS, map(make_data, DISTS)))

In [5]:
# Fit one linear regression per distribution on the leading 90% of its rows;
# the remaining rows are left untouched here so they can be used for scoring.
models = {}
for dist in DISTS:
    train_df = dfs[dist]
    target = ['PolynomialAdditions']
    features = [
        'MaxGeneratorDegree',
        'MinGeneratorDegree',
        'StdGeneratorDegree',
        'MeanGeneratorDegree',
        'PurePowers',
    ]
    # make_data fills 'Generators' with a single value for non-toric
    # distributions, so it is only used as a feature for toric ones.
    if dist.startswith('toric'):
        features.append('Generators')
    train_size = int(0.9 * len(train_df))
    model = LinearRegression()
    model.fit(train_df[features].iloc[:train_size],
              train_df[target].iloc[:train_size])
    models[dist] = model

In [6]:
# Cross-distribution evaluation: score every fitted model on the held-out tail
# (last 10% of rows) of every distribution. Rows of `scores` are the training
# distribution, columns the test distribution.
# The frame is created as float so the results are numeric rather than object.
scores = pd.DataFrame(index=pd.Series(DISTS, name="Train"),
                      columns=pd.Series(DISTS, name="Test"),
                      dtype=float)
target = ['PolynomialAdditions']
for train_dist in DISTS:
    # The feature list follows the *training* distribution: a model has to be
    # scored on the same columns it was fitted with, whatever the test set is.
    features = ['MaxGeneratorDegree', 'MinGeneratorDegree', 'StdGeneratorDegree',
                'MeanGeneratorDegree', 'PurePowers']
    if train_dist.startswith('toric'):
        features += ['Generators']
    for test_dist in DISTS:
        test_df = dfs[test_dist]
        # Same 90/10 boundary as the fitting cell; only the tail is scored.
        train_size = int(len(test_df) * 0.9)
        score = models[train_dist].score(test_df[features].iloc[train_size:],
                                         test_df[target].iloc[train_size:])
        # Single .loc assignment instead of chained scores[col][row] = ...,
        # which writes to a temporary under pandas Copy-on-Write.
        scores.loc[train_dist, test_dist] = score

In [7]:
# Display the score matrix: rows are the distribution the model was trained on,
# columns the distribution whose last 10% of rows it was scored on. Values are
# whatever LinearRegression.score returned (R^2 per the scikit-learn docs).
scores

Test,3-20-10-weighted,3-20-10-uniform,3-20-4-weighted,3-20-4-uniform,toric-2-0-5-8,toric-4-0-5-8,toric-6-0-5-8,toric-6-0-10-8
Train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3-20-10-weighted,0.238371,-0.0225923,0.000544522,-0.0235199,-4.54105,-0.123778,-23.178,-193.906
3-20-10-uniform,-0.257226,0.0196979,0.13234,0.0293485,-3.77205,-0.105041,-35.0522,-243.982
3-20-4-weighted,-0.0928812,0.00283698,0.174738,0.0176394,-4.63687,-0.0937603,-47.0565,-423.876
3-20-4-uniform,-0.265502,0.0167369,0.13969,0.0308524,-3.89124,-0.0915673,-42.8715,-361.332
toric-2-0-5-8,-1482.65,-1848.3,-801.407,-1172.01,0.252774,-0.337579,-9414.59,-76130.1
toric-4-0-5-8,-62.7345,-31.1368,-8.04663,-4.3236,-4.78176,0.812433,-889.815,-8275.07
toric-6-0-5-8,-1.99723,-5.83366,-5.4472,-7.72365,-3.77649,0.57533,0.637119,-0.608998
toric-6-0-10-8,-3.5427,-7.63144,-4.99514,-6.8465,-4.48255,0.220194,0.516061,0.882581
