# In this notebook we will test the TensorFlow library to analyze the different representations that we can apply using the keras API.

In [70]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', None)
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [71]:
train = pd.read_csv("/Users/sebascordova/Desktop/Code proyects/Kaggle - House Prices/data/Original/train.csv")

# Hashed Feature

In [73]:
def hashed_column(df, columna, num_bins, output_mode):
    """
        --- Hashed feature adresses three possible problems associated with categorical features:
                - Incomplete vocabulary
                - Model size due to cardinality
                - Cold start

    This function converts categorical column inputs to hashed output. 
    Its arguments are the original df, number of hash bins and the output_mode (int or one_hot). 
    The function returns the original df with the transformed columns (different output depending on the selected output_mode) and the original column deleted.
    """

    column_to = list(columna.unique())
    layer = tf.keras.layers.Hashing(num_bins=num_bins, output_mode=output_mode)
    feature = list(layer(columna.unique()).numpy())

    if output_mode == "one_hot":
        df_feature = pd.DataFrame(feature)
        df_feature = df_feature.add_prefix(columna.name + "_hashed_" + output_mode)
        df_final = df.join(df_feature)
        del(df_final[columna.name])

    else:
        df_feature = pd.DataFrame(list(zip(column_to, feature)),
                columns =[columna.name, f'{columna.name}hashed_' + output_mode])

        df_final = pd.merge(df,df_feature, on = columna.name, how = "left")
        del(df_final[columna.name])

    return df_final

In [74]:
train_prueba = train[['Id', 'MSSubClass', 'MSZoning', 'LotFrontage','Neighborhood']]
train_prueba.head(10)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,Neighborhood
0,1,60,RL,65.0,CollgCr
1,2,20,RL,80.0,Veenker
2,3,60,RL,68.0,CollgCr
3,4,70,RL,60.0,Crawfor
4,5,60,RL,84.0,NoRidge
5,6,50,RL,85.0,Mitchel
6,7,20,RL,75.0,Somerst
7,8,60,RL,,NWAmes
8,9,50,RM,51.0,OldTown
9,10,190,RL,50.0,BrkSide


In [75]:
train_prueba.Neighborhood.unique()

array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
       'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
       'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
       'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
       'Blueste'], dtype=object)

### In this case, we want to transform the column Neighborhood, let's see what our output would look like depending on the hashed mode we choose

In [76]:
train_prueba_feature = hashed_column(train_prueba, train_prueba.Neighborhood, 3, "int")
train_prueba_feature.head(15)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,Neighborhoodhashed_int
0,1,60,RL,65.0,2
1,2,20,RL,80.0,2
2,3,60,RL,68.0,2
3,4,70,RL,60.0,1
4,5,60,RL,84.0,0
5,6,50,RL,85.0,1
6,7,20,RL,75.0,2
7,8,60,RL,,0
8,9,50,RM,51.0,1
9,10,190,RL,50.0,0


In [77]:
# We test the one_hot mode to preserve similarity between items
train_prueba_feature = hashed_column(train_prueba, train_prueba.Neighborhood, 3, "one_hot")
train_prueba_feature.head(15)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,Neighborhood_hashed_one_hot0,Neighborhood_hashed_one_hot1,Neighborhood_hashed_one_hot2
0,1,60,RL,65.0,0.0,0.0,1.0
1,2,20,RL,80.0,0.0,0.0,1.0
2,3,60,RL,68.0,0.0,1.0,0.0
3,4,70,RL,60.0,1.0,0.0,0.0
4,5,60,RL,84.0,0.0,1.0,0.0
5,6,50,RL,85.0,0.0,0.0,1.0
6,7,20,RL,75.0,1.0,0.0,0.0
7,8,60,RL,,0.0,1.0,0.0
8,9,50,RM,51.0,1.0,0.0,0.0
9,10,190,RL,50.0,1.0,0.0,0.0


## We don´t need to store the vocabulary because the transformation code is independent of the actual data value, the core of the model only deals with number of bins inputs, not the full vocabulary. The key trade-off here is that we lose model accuracy. When the alternative is to discard the variable because it is too wide, a lossy encoding is an acceptable compromise.