## Transformaciones de datos categóricos

In [1]:
import pandas as pd
# Read adidas.csv (path relative to the working directory) into a DataFrame;
# the later cells all work from this frame.
adidas = pd.read_csv(filepath_or_buffer='adidas.csv')

In [2]:
# Show the column labels of the loaded frame.
adidas.columns

Index(['url', 'name', 'sku', 'selling_price', 'original_price', 'currency',
       'availability', 'color', 'category', 'source', 'source_website',
       'breadcrumbs', 'description', 'brand', 'images', 'country', 'language',
       'average_rating', 'reviews_count', 'crawled_at'],
      dtype='object')

In [3]:
# Columns handled as categorical in the cells below. Compared with the full
# column listing, the price, rating, review-count and crawl-timestamp columns
# are left out.
categorias = [
    'url',
    'name',
    'sku',
    'currency',
    'availability',
    'color',
    'category',
    'source',
    'source_website',
    'breadcrumbs',
    'description',
    'brand',
    'images',
    'country',
    'language',
]

### Eliminamos las columnas categóricas con un solo nivel o cuyo número de niveles distintos es mayor o igual al 98% de las filas

In [4]:
# Ratio of distinct values to rows at or above which a column is treated as
# having too many levels.
target = .98

columnas_validas = []
columnas_muchos_niveles = []
columnas_un_nivel = []

total_filas = adidas.shape[0]

for c_ in categorias:
    # Number of distinct values in this column.
    x = adidas[[c_]].drop_duplicates()
    valores_unicos = len(x)

    # Pick the bucket first, then append once.
    if valores_unicos == 1:
        destino = columnas_un_nivel
    elif valores_unicos / total_filas >= target:
        destino = columnas_muchos_niveles
    else:
        destino = columnas_validas
    destino.append(c_)

In [5]:
# Columns with more than one level whose distinct-value ratio is below target.
columnas_validas

['name', 'availability', 'color', 'category', 'breadcrumbs', 'description']

### Encontramos las columnas que tienen entre 3 y 6 niveles

In [6]:
columnas_validas_2 = []
columnas_descartadas = []

for c_ in columnas_validas:
    x = adidas[[c_]].drop_duplicates()
    valores_unicos = len(x)

    # Keep only columns with 3 to 6 distinct values (both bounds inclusive);
    # everything else goes to the discarded list.
    en_rango = 3 <= valores_unicos <= 6
    (columnas_validas_2 if en_rango else columnas_descartadas).append(c_)

In [7]:
# Columns that passed the 3-to-6-levels filter.
columnas_validas_2

['category']

### Tratamiento de la columna "category"

In [8]:
# Count the sku entries in each category. The dict-of-lists aggregation spec
# produces two-level column labels, which a later cell reduces to one level.
grupos_categoria = adidas.groupby(by=["category"], as_index=False)
nivel_frecuencia = grupos_categoria.agg({"sku": ["count"]})

In [11]:
# Keep only the top level of the column labels ('category', 'sku').
# get_level_values(0) is used rather than droplevel(1) so that re-running this
# cell on already-flat columns is a no-op instead of an error.
nivel_frecuencia.columns = nivel_frecuencia.columns.get_level_values(0)

In [13]:
# Per-category count divided by the total number of rows in adidas
# (a fraction between 0 and 1, not a 0-100 percentage).
nivel_frecuencia['Porcentaje'] = nivel_frecuencia['sku'].div(len(adidas))

In [15]:
# Order the levels from most to least frequent.
nivel_frecuencia = nivel_frecuencia.sort_values(by='Porcentaje', ascending=False)

In [16]:
# Inspect the frequency table: one row per category with its count and share.
nivel_frecuencia

Unnamed: 0,category,sku,Porcentaje
2,Shoes,426,0.504142
1,Clothing,337,0.398817
0,Accessories,82,0.097041


In [24]:
# Collapse rare levels: a category whose share is below 0.2 is relabelled
# 'others'; every other category keeps its own name.
# Series.mask applies the rule to the whole column at once instead of calling
# a Python lambda on each row.
nivel_frecuencia['category_c'] = nivel_frecuencia['category'].mask(
    nivel_frecuencia['Porcentaje'] < 0.2, 'others'
)

In [25]:
# Check the new category_c column next to the original category values.
nivel_frecuencia

Unnamed: 0,category,sku,Porcentaje,category_c
2,Shoes,426,0.504142,Shoes
1,Clothing,337,0.398817,Clothing
0,Accessories,82,0.097041,others


In [26]:
# Reduce the table to the mapping itself: original level -> collapsed level.
nivel_frecuencia = nivel_frecuencia[['category', 'category_c']]

In [28]:
# Save the category -> category_c mapping to category_rules.csv.
# index=False keeps the row index out of the file. index_label=False would
# only suppress the index header while still writing the index values, which
# leaves every data row with one more field than the header row.
nivel_frecuencia.to_csv('category_rules.csv', index=False)

In [29]:
# Left-join the mapping onto the product rows so each one gains category_c.
# The result is only displayed here, not stored.
pd.merge(adidas, nivel_frecuencia, how='left', on=['category'])

Unnamed: 0,url,name,sku,selling_price,original_price,currency,availability,color,category,source,...,breadcrumbs,description,brand,images,country,language,average_rating,reviews_count,crawled_at,category_c
0,https://www.adidas.com/us/beach-shorts/FJ5089....,Beach Shorts,FJ5089,40,,USD,InStock,Black,Clothing,adidas United States,...,Women/Clothing,Splashing in the surf. Making memories with yo...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.5,35,2021-10-23 17:50:17.331255,Clothing
1,https://www.adidas.com/us/five-ten-kestrel-lac...,Five Ten Kestrel Lace Mountain Bike Shoes,BC0770,150,,USD,InStock,Grey,Shoes,adidas United States,...,Women/Shoes,Lace up and get after it. The Five Ten Kestrel...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.8,4,2021-10-23 17:50:17.423830,Shoes
2,https://www.adidas.com/us/mexico-away-jersey/G...,Mexico Away Jersey,GC7946,70,,USD,InStock,White,Clothing,adidas United States,...,Kids/Clothing,"Clean and crisp, this adidas Mexico Away Jerse...",adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.9,42,2021-10-23 17:50:17.530834,Clothing
3,https://www.adidas.com/us/five-ten-hiangle-pro...,Five Ten Hiangle Pro Competition Climbing Shoes,FV4744,160,,USD,InStock,Black,Shoes,adidas United States,...,Five Ten/Shoes,The Hiangle Pro takes on the classic shape of ...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,3.7,7,2021-10-23 17:50:17.615054,Shoes
4,https://www.adidas.com/us/mesh-broken-stripe-p...,Mesh Broken-Stripe Polo Shirt,GM0239,65,,USD,InStock,Blue,Clothing,adidas United States,...,Men/Clothing,Step up to the tee relaxed. This adidas golf p...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.7,11,2021-10-23 17:50:17.702680,Clothing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
840,https://www.adidas.com/us/supernova-plus-shoes...,Supernova+ Shoes,FX2858,72,$120,USD,InStock,White,Shoes,adidas United States,...,Women/Shoes,Take off. Touch down. Repeat. These adidas run...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.3,151,2021-10-23 17:52:31.937924,Shoes
841,https://www.adidas.com/us/choigo-shoes/H00667....,Choigo Shoes,H00667,70,$100,USD,InStock,White,Shoes,adidas United States,...,Women/Shoes,"If you want drama, the bold female track and f...",adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.7,135,2021-10-23 17:52:32.014973,Shoes
842,https://www.adidas.com/us/daily-3.0-shoes/GZ77...,Daily 3.0 Shoes,GZ7705,35,$50,USD,InStock,Black,Shoes,adidas United States,...,Kids/Shoes,The style is in the details of the Daily 3.0 S...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.7,190,2021-10-23 17:52:32.208426,Shoes
843,https://www.adidas.com/us/daily-3.0-shoes/GZ77...,Daily 3.0 Shoes,GZ7706,40,$50,USD,InStock,Pink,Shoes,adidas United States,...,Kids/Shoes,The style is in the details of the Daily 3.0 S...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.7,190,2021-10-23 17:52:32.293622,Shoes
