# Categorical Variables

### Three approaches
1. Drop categorical variables
2. Ordinal encoding
3. One-hot encoding

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test sets, using the 'Id' column as the row index.
X = pd.read_csv('./train.csv', index_col='Id')
X_test = pd.read_csv('./test.csv', index_col='Id')

# Rows without a target value cannot be used for supervised training.
X = X.dropna(axis=0, subset=['SalePrice'])

# Separate the target from the predictors: pop() removes the column from X
# and returns it in a single step.
y = X.pop('SalePrice')

# Discard every predictor column that has a missing value in the training
# data, and drop the same columns from the test data.
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Hold out 20% of the rows for validation; the seed is fixed so the split is
# the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Display the first rows of the training predictors.
X_train.head()

### score_dataset function: returns MAE (Mean Absolute Error) for different approaches

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# compare different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training data and score it on validation data.

    Args:
        X_train: Predictors used to fit the model.
        X_valid: Predictors used to produce validation predictions.
        y_train: Target values matching ``X_train``.
        y_valid: Target values matching ``X_valid``.

    Returns:
        The mean absolute error between ``y_valid`` and the model's
        predictions for ``X_valid``.
    """
    # Fixed n_estimators and random_state keep scores comparable between the
    # different preprocessing approaches.
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

### 1. Drop Categorical Variable

In [None]:
# Approach 1: keep only the columns whose dtype is not 'object'.
drop_X_train = X_train.select_dtypes(exclude='object')
drop_X_valid = X_valid.select_dtypes(exclude='object')

print(
    "MAE from approach 1: "
    f"{score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)}"
)

### 2. Ordinal Encoding

#### Drop problematic columns (categories that appear in the validation data but not in the training data)

In [None]:
# Categorical columns are the ones stored with the 'object' dtype.
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# A column can be ordinally encoded only if every value that occurs in the
# validation data also occurs in the training data; otherwise the encoder,
# which is fitted on the training data, would meet a category it never saw.
good_categorical_cols = [
    col for col in categorical_cols
    if set(X_valid[col]).issubset(set(X_train[col]))
]

# Problematic columns are the remaining categorical columns. Filtering the
# original list (instead of taking a set difference) keeps them in column
# order, so the printed result is identical on every run.
bad_categorical_cols = [
    col for col in categorical_cols if col not in good_categorical_cols
]
print(f"Categorical columns:\n{categorical_cols}")
print(f"\nGood categorical columns:\n{good_categorical_cols}")
print(f"\nBad categorical columns: {bad_categorical_cols}")

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Approach 2: remove the categorical columns that cannot be encoded safely.
# drop() returns new frames, so X_train and X_valid are left untouched.
drop_bad_cat_X_train = X_train.drop(columns=bad_categorical_cols)
drop_bad_cat_X_valid = X_valid.drop(columns=bad_categorical_cols)

# Fit the encoder on the training data only, then reuse the fitted mapping
# for the validation data so both share the same category-to-number codes.
ordinal_encoder = OrdinalEncoder()
drop_bad_cat_X_train[good_categorical_cols] = ordinal_encoder.fit_transform(
    X_train[good_categorical_cols]
)
drop_bad_cat_X_valid[good_categorical_cols] = ordinal_encoder.transform(
    X_valid[good_categorical_cols]
)

mae_ordinal_encoding = score_dataset(
    drop_bad_cat_X_train, drop_bad_cat_X_valid, y_train, y_valid
)
print(f"MAE from approach 2: {mae_ordinal_encoding}")


### 3. One-hot Encoding

#### number of unique entries in each categorical column

In [None]:
# Count the distinct values in each categorical column of the training data.
categorical_nunique = [X_train[col].nunique() for col in categorical_cols]
dict_cat_nunique = dict(zip(categorical_cols, categorical_nunique))

# Show (column, count) pairs ordered from fewest to most distinct values.
sorted(dict_cat_nunique.items(), key=lambda pair: pair[1])


In [None]:
# Low-cardinality columns have fewer than 10 distinct values in the training
# data; only these are one-hot encoded, since each distinct value becomes its
# own column.
low_cardinality_cols = [col for col in categorical_cols if X_train[col].nunique() < 10]

# High-cardinality columns are the remaining categorical columns. Filtering
# the original list (instead of taking a set difference) keeps them in column
# order, so the printed result is identical on every run.
high_cardinality_cols = [
    col for col in categorical_cols if col not in low_cardinality_cols
]

print(f"Columns that are one-hot encoded:\n {low_cardinality_cols}")
print(f"\nColumns dropped due to high cardinality:\n {high_cardinality_cols}")

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# NOTE(review): this imputer is created but never applied in this cell.
simple_imputer = SimpleImputer()

# Approach 3: one-hot encode the low-cardinality columns. The encoder is
# fitted on the training data only; handle_unknown='ignore' stops it from
# raising on validation categories it did not see during fit, and
# sparse_output=False returns a dense array.
# The encoder output carries no row labels, so the original index is passed
# to the DataFrame constructor to keep rows aligned for the concat below.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
oh_cols_train = pd.DataFrame(
    one_hot_encoder.fit_transform(X_train[low_cardinality_cols]),
    index=X_train.index,
)
oh_cols_valid = pd.DataFrame(
    one_hot_encoder.transform(X_valid[low_cardinality_cols]),
    index=X_valid.index,
)

# Remove every categorical column (this also discards the high-cardinality
# ones, which are not encoded at all).
num_X_train = X_train.drop(columns=categorical_cols)
num_X_valid = X_valid.drop(columns=categorical_cols)

# Place the one-hot columns next to the remaining numerical columns.
oh_X_train = pd.concat([num_X_train, oh_cols_train], axis='columns')
oh_X_valid = pd.concat([num_X_valid, oh_cols_valid], axis='columns')

# The one-hot columns are labelled with integers; cast every column label to
# str so the frame does not mix string and integer column names.
oh_X_train.columns = oh_X_train.columns.astype(str)
oh_X_valid.columns = oh_X_valid.columns.astype(str)

mae_oh_encoding = score_dataset(oh_X_train, oh_X_valid, y_train, y_valid)
print(f"MAE from approach 3 (One-hot encoding): {mae_oh_encoding}")

### One-hot encoding for test data

In [None]:

# Apply the already-fitted encoder to the test data (transform only, no
# refit) and restore the test index on the encoded columns.
oh_cols_test = pd.DataFrame(
    one_hot_encoder.transform(X_test[low_cardinality_cols]),
    index=X_test.index,
)

# Replace the categorical columns with their one-hot encoded counterparts.
num_X_test = X_test.drop(columns=categorical_cols)
oh_X_test = pd.concat([num_X_test, oh_cols_test], axis='columns')

# Cast every column label to str, matching the training features.
oh_X_test.columns = oh_X_test.columns.astype(str)


In [None]:
# Display the first rows of the one-hot encoded test features.
oh_X_test.head()

In [None]:
# Number of missing values in each column of the encoded test data; only the
# columns that actually contain missing values are printed.
missing_val_count_by_column = oh_X_test.isnull().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

In [None]:
# Replace the remaining missing values in the test features with 0 so the
# model can produce a prediction for every row.
oh_X_test = oh_X_test.fillna(0)

# Fit a random forest on the one-hot encoded training data and predict the
# target for the test data.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(oh_X_train, y_train)
preds_test = model.predict(oh_X_test)

# Write the predictions to a CSV file with one row per test 'Id'.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('prediction_categorical_variable.csv', index=False)