# LEGO ETL - Fact

## Initial imports

In [1]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from chromadb.db.base import UniqueConstraintError

## Work on LEGO staging dataset

### Load from file

In [None]:
# Load the staged LEGO dataset (path is relative to the notebook's working directory).
df_lego = pd.read_csv('../data/stg/lego/lego.csv')

### Check some LEGO info

In [None]:
# Print a summary of the staged DataFrame (columns, dtypes, non-null counts) for a quick sanity check.
df_lego.info()

### Group AgeMin into the same age brackets used for the anime data

In [8]:
def define_minimun_age(row: pd.Series) -> int:
    """Map a LEGO set's ``AgeMin`` onto a coarse minimum-age bracket.
    Args:
        row (pd.Series): DataFrame row (or any mapping) exposing ``AgeMin``
    Returns:
        int: 18 when AgeMin >= 17, 13 when AgeMin >= 13, 10 when AgeMin >= 10,
            otherwise 0. A missing (NaN) AgeMin fails every comparison and
            therefore also yields 0.
    """
    age = row['AgeMin']
    # Thresholds are checked from highest to lowest; the first match wins.
    for threshold, bracket in ((17, 18), (13, 13), (10, 10)):
        if age >= threshold:
            return bracket
    return 0

# Compute the age bracket row by row; the function is passed directly, no lambda wrapper needed.
df_lego['min_age'] = df_lego.apply(define_minimun_age, axis=1)

### Manage missing values
Impute missing values in `Availability`, `PackagingType`, `ThemeGroup` and `Subtheme` with `uncategorized`

In [15]:
# Placeholder strings that stand for "no value" are normalised to 'uncategorized'.
df_lego['Availability'] = df_lego['Availability'].replace(
    ['{Not specified}', 'Unknown'], 'uncategorized'
)
df_lego['PackagingType'] = df_lego['PackagingType'].replace(
    '{Not specified}', 'uncategorized'
)
# Genuinely missing (NaN) values get the same label.
df_lego['Subtheme'] = df_lego['Subtheme'].fillna('uncategorized')
df_lego['ThemeGroup'] = df_lego['ThemeGroup'].fillna('uncategorized')

## Work on Genres
This section categorises LEGO sets with the genres from the anime dataset, using ChromaDB similarity search.

### Read genres file

In [2]:
# Genre reference table shared with the anime dataset.
anime_genres = '../data/prod/genres.csv'
# genre_id is cast to str at load time; it is passed as the ChromaDB document id further down.
df_genres = pd.read_csv(anime_genres).astype({'genre_id': str})

### Setup ChromaDB
Define the ChromaDB client and the embedding function, then feed the collection with the genre names.

In [None]:
# ChromaDB client with default settings, plus the sentence-transformer model used to embed both
# the genre names and the LEGO query texts.
chroma_client = chromadb.Client()
em = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="thenlper/gte-large")

# get_or_create_collection keeps this cell safe to re-run: it returns the existing collection
# instead of raising, without depending on a version-specific exception class.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=em)

# upsert (rather than add) so a re-run overwrites the genre documents instead of trying to
# insert ids that are already present in the collection.
collection.upsert(
    documents=df_genres['genre_name'].tolist(),
    ids=df_genres['genre_id'].tolist(),
)

### Create a new column with the text to embed for LEGO genres

In [11]:
# Concatenate the descriptive columns into one space-separated string per LEGO set;
# this is the text that gets embedded and matched against the genres.
df_lego['text_for_embedding'] = [
    f"{theme} {theme_group} {subtheme} {set_name}"
    for theme, theme_group, subtheme, set_name in zip(
        df_lego['Theme'], df_lego['ThemeGroup'], df_lego['Subtheme'], df_lego['SetName']
    )
]

### Create a function to extract the 2 most similar genres for each LEGO set

In [12]:
def retrive_genres(row: pd.Series, n_results: int = 2) -> list:
    """Retrieve the genres most similar to a LEGO set's embedding text.
    Args:
        row (pd.Series): DataFrame row (or any mapping) exposing ``text_for_embedding``
        n_results (int): how many nearest genres to return; defaults to 2
    Returns:
        list: ids of the nearest documents returned by the ChromaDB ``collection``
    """
    results = collection.query(
        query_texts=[row['text_for_embedding']],
        n_results=n_results,
    )
    # A single query text was sent, so the ids of interest are in the first (only) result list.
    return results['ids'][0]
# Query ChromaDB once per LEGO set and keep the list of matching genre ids on the row.
df_lego['similar_genres'] = df_lego.apply(retrive_genres, axis=1)

### Extract the genres list, create a dictionary with the mapping genre_id-lego_ids and save them into a csv file

In [16]:
# One bucket per genre. The bucket key must be the genre_id, because that is the id stored in
# ChromaDB and therefore what `similar_genres` contains. Keying on the DataFrame's positional
# index only works when genre_id happens to equal the row number.
lego_genres_list = [
    {'_id': str(genre_id), 'lego_ids': []}
    for genre_id in df_genres['genre_id']
]

# Direct id -> bucket lookup, replacing a scan over every genre for every match.
genres_by_id = {gen['_id']: gen for gen in lego_genres_list}

for set_id, similar_genres in zip(df_lego['SetID'], df_lego['similar_genres']):
    for genre_id in similar_genres:
        bucket = genres_by_id.get(genre_id)
        # Ids with no matching genre row are skipped.
        if bucket is not None:
            bucket['lego_ids'].append(set_id)

In [23]:
# Flatten the genre buckets into one (genre_id, lego set id) record per pair.
# The output column keeps its original name 'lego_ids' although each row holds a single set id.
genre_for_df = [
    {'genre_id': genre['_id'], 'lego_ids': lego_id}
    for genre in lego_genres_list
    for lego_id in genre['lego_ids']
]

# Columns are pinned explicitly so the CSV still gets its header row when no pair was found.
df_lego_genres = pd.DataFrame(genre_for_df, columns=['genre_id', 'lego_ids'])
df_lego_genres.to_csv('../data/prod/lego_genres.csv', index=False)

### Drop useless columns

In [24]:
# Remove the intermediate genre-matching columns and the raw fields that are not kept in the
# final table. `columns=` already names the axis, so no separate axis argument is passed.
df_lego.drop(
    columns=[
        'similar_genres',
        'text_for_embedding',
        'AgeMax',
        'AgeMin',
        'PricePerPiece',
        'StockMarketPerPiece',
    ],
    inplace=True,
)

## Save LEGO data into a csv file

In [25]:
# Write the final LEGO table to the prod area; index=False leaves the DataFrame index out of the file.
df_lego.to_csv('../data/prod/lego.csv', index=False)