### Step 1: Load the constants from constants.py
Import dependencies as well.

In [9]:
from constants import entity_unit_map
import numpy as np
import pandas as pd
import os

### Step 2: Define a function that maps each entity_name to its allowed units.
This lookup is used in the next step to attach an appropriate unit to each predicted value.

In [10]:
def get_unit(entity_name):
    """Look up the units registered for ``entity_name``.

    Falls back to an empty set when the name has no entry in
    ``entity_unit_map``.
    """
    fallback = set()
    return entity_unit_map.get(entity_name, fallback)

### Step 3: Format Predictions with Units
Assuming that you have a dictionary or list of predictions where each prediction is associated with an entity name, format these predictions with the appropriate units.

In [11]:
def apply_units(entity_name, value):
    """Format ``value`` followed by a unit allowed for ``entity_name``.

    Args:
        entity_name: Key looked up through ``get_unit``.
        value: Predicted value; rendered with default f-string formatting.

    Returns:
        ``"<value> <unit>"`` when at least one unit is defined for the
        entity, otherwise just ``"<value>"``.
    """
    units = get_unit(entity_name)
    if not units:
        # No units are defined for this entity: emit the bare value.
        return f"{value}"

    # Iteration order of a set of strings can change between interpreter
    # runs (string hashing is randomized), so ``next(iter(units))`` may pick
    # a different unit each time. Choose the alphabetically first unit so the
    # generated output is reproducible.
    unit = min(units, key=str)
    return f"{value} {unit}"


### Step 4: Generate Predictions with Units
Generating predictions and using the apply_units function to format them.

In [13]:
DATASET_FOLDER = '../dataset/'

# Read the test split; only its entity_name column is used in this cell.
test_data = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
entity_names = test_data['entity_name'].values

# Placeholder scores: one random number per row.
# Replace with real model output (a list or array of numeric values).
predictions = np.random.rand(len(entity_names))

# Walk entity names and scores in lockstep, formatting each pair via apply_units.
formatted_predictions = list(map(apply_units, entity_names, predictions))

# Assemble the two output columns and write them to disk without the index.
output_df = pd.DataFrame({
    'entity_name': entity_names,
    'prediction': formatted_predictions,
})
output_df.to_csv('formatted_predictions.csv', index=False)

### Step 5: Load the CSV file, encode the unit labels, and save the fitted label encoder to a PKL file.

In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

# Read the predictions back and cast the 'prediction' column to str, so the
# string handling below also works if pandas parsed the column as numeric.
labels_df = pd.read_csv('formatted_predictions.csv').astype({'prediction': str})

# Split the 'prediction' column into 'confidence' and 'unit'
# We will manually handle cases where the split operation might fail
def split_prediction(prediction):
    """Split ``prediction`` at its first space into a two-item list.

    Returns ``[before, after]``; when the string contains no space the
    second item is the empty string.
    """
    head, _sep, tail = prediction.partition(' ')
    return [head, tail]

# Split every prediction once and build both columns from plain lists.
# Wrapping each row in its own pd.Series inside .apply() is very slow on
# large frames; two list-backed column assignments produce the same columns.
split_parts = [split_prediction(text) for text in labels_df['prediction']]
labels_df['confidence'] = [value for value, _ in split_parts]
labels_df['unit'] = [unit for _, unit in split_parts]

# The unit strings (possibly '') are the labels to encode.
labels = labels_df['unit'].values

# Map each distinct unit string to an integer class id.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Save the encoded labels.
np.save('labels.npy', encoded_labels)

# Persist the fitted encoder so the integer ids can be mapped back to units.
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

### Create labels.npy as well.

In [36]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Re-read the formatted predictions file.
labels_df = pd.read_csv('formatted_predictions.csv')  # Replace with your labels file path

# The labels are taken from the 'prediction' column (the whole formatted string).
labels = labels_df['prediction'].values

# Map each distinct label to an integer class id.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# NOTE(review): this writes to the same 'labels.npy' path as the earlier cell
# that encoded the 'unit' column, so that file is overwritten and no longer
# corresponds to the encoder saved in 'label_encoder.pkl' — confirm intent.
np.save('labels.npy', encoded_labels)
