In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io

from src.utility import *

In [2]:
# Does a few generic processing steps before saving data+metadata dfs in correct format
# data_df must have a "task" column and metadata_df a "_task" column which are used to determine task matches
# Filters out rows with erroneous (NaN) data and/or metadata, and constant (e.g. all-zero) columns
# Removes tasks with too few instances (fewer than min_instances_per_task)
# Re-computes "task", "task_count" variables
# Saves dataframes
def save_dfs(data_df, metadata_df, fn_root, min_instances_per_task=5):
    """Clean a data/metadata pair and write it to four CSV files.

    The two frames are inner-joined on data_df["task"] == metadata_df["_task"],
    rows containing NaN are removed, tasks with fewer than
    ``min_instances_per_task`` remaining rows are removed, and the surviving
    tasks are renumbered 0..k-1 in order of decreasing instance count.
    Columns whose values are all equal are dropped, except "task" and any
    column whose name starts with "_".

    Files written (``fn_root`` is used as a plain string prefix):
        fn_root + "_data.csv"         -- data columns plus "task"
        fn_root + "_metadata.csv"     -- one row per task: metadata columns,
                                         "_task", "_task_count", "_task_name"
        fn_root + "_STL_metadata.csv" -- joined frame without "_"-prefixed columns
        fn_root + "_STL_onehot.csv"   -- data frame plus one-hot "t_<task>" columns

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a "task" column.
    metadata_df : pandas.DataFrame
        Must contain a "_task" column.
    fn_root : str
        Path prefix of the output files; its directory is created if missing.
    min_instances_per_task : int, default 5
        Minimum number of (NaN-free) rows a task needs to be kept.

    Raises
    ------
    ValueError
        If no rows are left after filtering.
    """
    # Merge dataframes
    join_df = pd.merge(data_df, metadata_df, left_on="task", right_on="_task")
    join_df = join_df.drop(columns=["task"])

    # Filter instances for problems
    nan_rows = join_df.isna().any(axis=1)
    if nan_rows.any():
        print("WARNING: Removing NaN rows of data and/or metadata.")
    # .copy() so the column assignments below act on an independent frame
    join_df = join_df[~nan_rows].copy()

    # Fix task-related meta-variables
    join_df["_task_count"] = join_df.groupby("_task")["_task"].transform("count")
    join_df = join_df[join_df["_task_count"] >= min_instances_per_task]
    if join_df.empty:
        raise ValueError(
            "No rows left after NaN filtering and min_instances_per_task={}".format(
                min_instances_per_task))

    # Stable sorts keep tie order (and therefore the new task ids and the
    # row order inside each task) deterministic.
    join_df = join_df.sort_values(by="_task_count", ascending=False, kind="stable")
    join_df["_task"] = join_df.groupby("_task", sort=False).ngroup()
    join_df["task"] = join_df["_task"]
    join_df = join_df.sort_values(by="_task", kind="stable")

    # Filter features where all vals are equal. "task" is always kept: with a
    # single remaining task it is constant, but it is required below.
    kept_cols = [c for c in join_df.columns
                 if c == "task" or c.startswith("_") or join_df[c].nunique() > 1]
    join_df = join_df[kept_cols]

    # Data df
    data_cols = [c for c in data_df.columns if c in join_df.columns]
    if "task" not in data_cols:
        data_cols += ["task"]
    data_df = join_df[data_cols]

    # Metadata df (one row per task)
    metadata_cols = [c for c in metadata_df.columns if c in join_df.columns]
    for required in ("_task", "_task_count"):
        if required not in metadata_cols:
            metadata_cols += [required]
    metadata_df = join_df[metadata_cols].drop_duplicates()
    if "_task_name" not in metadata_df.columns:
        metadata_df = metadata_df.assign(_task_name=metadata_df["_task"])

    # Joined (STL) df
    join_cols = [c for c in join_df.columns if not c.startswith("_")]
    join_df = join_df[join_cols]

    # 1hot encoded STL df. Dummies are built from the task column directly so
    # that an existing feature named "t" is not overwritten.
    onehot = pd.get_dummies(data_df["task"].astype(object), prefix="t")
    data_df_1H = pd.concat([data_df, onehot], axis=1)

    # Save dataframes
    out_dir = os.path.dirname(fn_root)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    data_df.to_csv(fn_root+"_data.csv", index=False)
    metadata_df.to_csv(fn_root+"_metadata.csv", index=False)
    join_df.to_csv(fn_root+"_STL_metadata.csv", index=False)
    data_df_1H.to_csv(fn_root+"_STL_onehot.csv", index=False)

### Cubic

In [3]:
# Synthetic polynomial regression dataset: every task has its own coefficient
# vector (stored as metadata); labels are the polynomial evaluated at x plus
# Gaussian noise. Output is written through save_dfs.
out_filepath = os.path.join("data", "cubic", "cubic")   # portable form of data\cubic\cubic

np.random.seed(0)

n_tasks = 100
n_data_per_task = 500
domain = [-1, 1]          # [min, max] of the sampled x values
coef_range = [-1, 1]      # [min, max] of the sampled polynomial coefficients
degree = 3
sigma = 0.1               # scale of the Gaussian noise added to the labels
sigma_metadata = 0        # scale of the noise added to the stored coefficients

# Create metadata table: one row per task, one column per coefficient
# ("a" multiplies x**degree, the last letter is the constant term).
coefs = [chr(i) for i in range(97, 98+degree)]
metadata = np.random.rand(n_tasks, degree+1)
metadata = metadata * (coef_range[1] - coef_range[0]) + coef_range[0]
metadata_df = pd.DataFrame(metadata, columns=coefs)
metadata_df["_task"] = metadata_df.index

# Human-readable task name, e.g. "0.1x3 + -0.43x2 + 0.21x + 0.09"
name_terms = []
for i, coef_name in enumerate(coefs):
    power = degree - i
    term = metadata_df[coef_name].round(2).astype(str)
    if power >= 1:
        term = term + "x"
    if power >= 2:
        term = term + str(power)
    name_terms.append(term)
task_name = name_terms[0]
for term in name_terms[1:]:
    task_name = task_name + " + " + term
metadata_df["_task_name"] = task_name

# Create data table
n_rows = n_tasks * n_data_per_task
x = np.random.rand(n_rows)
x = x * (domain[1] - domain[0]) + domain[0]
task = np.repeat(np.arange(n_tasks), n_data_per_task)

# Vectorized label computation (replaces a per-row iterrows loop). The noise
# is drawn first and the terms are accumulated from the highest power down,
# matching the order used by the per-row version.
row_coefs = metadata_df[coefs].to_numpy()[task]        # (n_rows, degree+1), noise-free coefficients
y = np.random.normal(size=n_rows) * sigma
for k in range(degree + 1):
    y += row_coefs[:, k] * x ** (degree - k)

feature_df = pd.DataFrame({"x": x, "task": task, "label": y})

# Add noise to metadata (after the labels were generated from the clean values).
# NOTE(review): this previously used np.random.rand, i.e. strictly positive
# uniform noise; switched to zero-mean Gaussian to match the label noise.
# With sigma_metadata = 0 the result is unchanged.
metadata_df[coefs] += np.random.normal(size=(n_tasks, degree+1)) * sigma_metadata

save_dfs(feature_df, metadata_df, out_filepath)

### Robot Arm

In [4]:
# Synthetic planar robot-arm dataset: every task is an arm with its own link
# lengths (stored as metadata). Features are the end-point coordinates (x, y);
# labels are the joint angles that produced them.
save = True
plot = False

num_tasks = 100
num_instances_per_task = 500
dof = 3                        # Degrees of freedom; number of joints in arm
join_length = [0, 1]           # [min, max] of the sampled link lengths
# Sampling interval [low, high) of each joint angle, one entry per joint
angle_ranges = [(-np.pi, 0), (-np.pi, 0), (-np.pi/2, np.pi/2)]
assert len(angle_ranges) == dof, "angle_ranges needs one (low, high) pair per joint"

np.random.seed(0)

output_filename_root = os.path.join("data", "robot_arm", "robot_arm")   # portable form of data\robot_arm\robot_arm

# Metadata: one row per task holding the link lengths l1..l<dof>
metadata_colnames = ["l{}".format(i) for i in range(1,dof+1)]
lengths = np.random.uniform(low=join_length[0], high=join_length[1], size=(num_tasks, dof))
metadata_df = pd.DataFrame(lengths, columns=metadata_colnames)
metadata_df["_task"] = np.arange(num_tasks)
# The original statement here was syntactically incomplete; the task id is
# used as the task name.
metadata_df["_task_name"] = metadata_df["_task"].astype(str)

# Joint angles: one column per joint, drawn joint by joint
shp = (num_tasks*num_instances_per_task,)
angles = np.stack([np.random.uniform(low=low, high=high, size=shp)
                   for low, high in angle_ranges], axis=1)

# Forward kinematics: link i points in the direction of the sum of the first
# i+1 joint angles; the end point is the sum of all link vectors.
cumulative_angles = np.cumsum(angles, axis=1)
x = np.zeros(shp)
y = np.zeros(shp)
for i in range(dof):
    link_length = np.repeat(lengths[:,i], num_instances_per_task)
    x += link_length * np.cos(cumulative_angles[:,i])
    y += link_length * np.sin(cumulative_angles[:,i])

feature_vals = np.concatenate([x.reshape(-1,1), y.reshape(-1,1), angles], axis=1)
feature_colnames = ["x", "y"] + ["label{}".format(i) for i in range(1,dof+1)]
feature_df = pd.DataFrame(feature_vals, columns=feature_colnames)
feature_df["task"] = np.repeat(np.arange(num_tasks), num_instances_per_task)


if plot:
    # One figure per task showing the reachable end points (first 10 tasks)
    for i in range(min(10, num_tasks)):
        rows = slice(i*num_instances_per_task, (i+1)*num_instances_per_task)
        plt.scatter(x[rows], y[rows])
        plt.show()

if save:
    save_dfs(feature_df, metadata_df, output_filename_root)