# Encoding

In [18]:
import numpy as np
import pandas as pd
import random 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.multivariate.manova import MANOVA
from sklearn.preprocessing import StandardScaler
import datetime 
import warnings
# Silence all warnings for the rest of the session and set the default figure size.
warnings.filterwarnings(action='ignore')
plt.rcParams.update({"figure.figsize": (10, 8)})

In [19]:
# Load the dataset used in this encoding step and preview its first two rows.
sales = pd.read_csv(filepath_or_buffer='ficheros/sales4.csv')
sales.head(n=2)

Unnamed: 0,state,market,market_size,profit,margin,sales,cogs,total_expenses,marketing,inventory,budget_profit,budget_margin,budget_sales,product_type,product,año
0,Connecticut,East,Small Market,107.0,176.0,292.0,116.0,69.0,38.0,962.0,110.0,160.0,270.0,Coffee,Columbian,2010
1,Connecticut,East,Small Market,75.0,135.0,225.0,90.0,60.0,29.0,1148.0,90.0,130.0,210.0,Coffee,Columbian,2010


In [20]:
# Keep only the object-dtype (categorical) columns and preview them.
sales_cat = sales.select_dtypes(include=['object'])
sales_cat.head(n=2)

Unnamed: 0,state,market,market_size,product_type,product
0,Connecticut,East,Small Market,Coffee,Columbian
1,Connecticut,East,Small Market,Coffee,Columbian


In [21]:
# For each categorical column, print its name and its distinct values,
# each followed by a dashed rule.
separador = '--------------------------------'
for columna in sales_cat.columns:
    valores = sales_cat[columna].unique()
    print(columna, separador, valores, separador, sep='\n')

state
--------------------------------
['Connecticut' 'Washington' 'California' 'Texas' 'New York' 'Ohio'
 'Illinois' 'Louisiana' 'Florida' 'Wisconsin' 'Colorado' 'Missouri' 'Iowa'
 'Massachusetts' 'Oklahoma' 'Utah' 'Oregon' 'New Mexico' 'New Hampshire'
 'Nevada']
--------------------------------
market
--------------------------------
['East' 'West' 'South' 'Central']
--------------------------------
market_size
--------------------------------
['Small Market' 'Major Market']
--------------------------------
product_type
--------------------------------
['Coffee' 'Tea' 'Espresso' 'Herbal Tea']
--------------------------------
product
--------------------------------
['Columbian' 'Green Tea' 'Caffe Mocha' 'Decaf Espresso' 'Lemon' 'Mint'
 'Darjeeling' 'Decaf Irish Cream' 'Chamomile' 'Earl Grey' 'Caffe Latte'
 'Amaretto' 'Regular Espresso']
--------------------------------


Columnas en las que vamos a realizar el encoding según todo lo analizado en el EDA:

- *market* : tiene orden. Usaremos el **método map**

In [22]:
# Integer code assigned to each market region (the notebook treats market as ordered).
mapa_market = dict(Central=1, West=2, East=3, South=4)

In [23]:
# Encode each row's market region with the integer code from mapa_market.
sales['market_encoding'] = sales['market'].map(mapa_market)
# Series.map yields NaN for any label missing from the dict; fail loudly here
# instead of silently writing missing codes into the exported dataset.
assert sales['market_encoding'].notna().all(), "unmapped value in 'market'"

- *market_size*: tiene orden. Usaremos el **método map**

In [24]:
# Binary code for market size: major markets -> 1, small markets -> 0.
mapa_market_size = {
    'Major Market': 1,
    'Small Market': 0,
}

In [25]:
# Encode each row's market size with the binary code from mapa_market_size.
sales['market_size_encoding'] = sales['market_size'].map(mapa_market_size)
# Series.map yields NaN for any label missing from the dict; fail loudly here
# instead of silently writing missing codes into the exported dataset.
assert sales['market_size_encoding'].notna().all(), "unmapped value in 'market_size'"

- *año*: tiene orden. Usaremos el **método map**

In [26]:
# Consecutive integer code per year, starting at 0 for 2010.
mapa_año = {anio: codigo for codigo, anio in enumerate((2010, 2011))}

In [27]:
# Encode each row's year with the integer code from mapa_año.
sales['año_encoding'] = sales['año'].map(mapa_año)
# Series.map yields NaN for any year missing from the dict; fail loudly here
# instead of silently writing missing codes into the exported dataset.
assert sales['año_encoding'].notna().all(), "unmapped value in 'año'"

In [28]:
# Preview the frame with the three new *_encoding columns appended.
sales.head(n=2)

Unnamed: 0,state,market,market_size,profit,margin,sales,cogs,total_expenses,marketing,inventory,budget_profit,budget_margin,budget_sales,product_type,product,año,market_encoding,market_size_encoding,año_encoding
0,Connecticut,East,Small Market,107.0,176.0,292.0,116.0,69.0,38.0,962.0,110.0,160.0,270.0,Coffee,Columbian,2010,3,0,0
1,Connecticut,East,Small Market,75.0,135.0,225.0,90.0,60.0,29.0,1148.0,90.0,130.0,210.0,Coffee,Columbian,2010,3,0,0


- *product_type*: no tiene orden. Usaremos el **método get dummies**

In [29]:
# One-hot encode product_type: one integer 0/1 column per distinct category.
product_type_dummies = pd.get_dummies(data=sales['product_type'], dtype=int)

In [30]:
# Append the one-hot columns to the right of the sales frame.
df_dummies = pd.concat(objs=[sales, product_type_dummies], axis='columns')

In [31]:
# Preview the combined frame, including the new dummy columns.
df_dummies.head(n=2)

Unnamed: 0,state,market,market_size,profit,margin,sales,cogs,total_expenses,marketing,inventory,...,product_type,product,año,market_encoding,market_size_encoding,año_encoding,Coffee,Espresso,Herbal Tea,Tea
0,Connecticut,East,Small Market,107.0,176.0,292.0,116.0,69.0,38.0,962.0,...,Coffee,Columbian,2010,3,0,0,1,0,0,0
1,Connecticut,East,Small Market,75.0,135.0,225.0,90.0,60.0,29.0,1148.0,...,Coffee,Columbian,2010,3,0,0,1,0,0,0


Desechamos las columnas que ya están codificadas y las que ya no nos aportan información: *product*, porque vamos a utilizar *product_type*, que la engloba, y *state*, porque ya está clasificada en *market*.

In [32]:
# Drop the source columns that now have encoded counterparts (market, market_size,
# product_type, año) together with product and state.
# `columns=` already selects the axis, so the redundant `axis=1` is omitted, and the
# result is rebound to the same name instead of mutating the frame in place.
df_dummies = df_dummies.drop(
    columns=['product', 'state', 'market', 'market_size', 'product_type', 'año']
)

In [33]:
# Preview the frame that will be exported.
df_dummies.head(n=2)

Unnamed: 0,profit,margin,sales,cogs,total_expenses,marketing,inventory,budget_profit,budget_margin,budget_sales,market_encoding,market_size_encoding,año_encoding,Coffee,Espresso,Herbal Tea,Tea
0,107.0,176.0,292.0,116.0,69.0,38.0,962.0,110.0,160.0,270.0,3,0,0,1,0,0,0
1,75.0,135.0,225.0,90.0,60.0,29.0,1148.0,90.0,130.0,210.0,3,0,0,1,0,0,0


In [34]:
# Write the encoded dataset to disk without the index column.
df_dummies.to_csv(path_or_buf='ficheros/sales5.csv', index=False)