<a href="https://colab.research.google.com/github/ShuHwaiTeoh/ML_practice_project/blob/master/SKLearn/categorical_value.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Location of the CSV file on the mounted Google Drive
file_path = '/content/drive/My Drive/melb_data.csv'

# Load the CSV (decoded as Windows-1252) into a DataFrame and preview it
data = pd.read_csv(filepath_or_buffer=file_path, encoding='Windows-1252')
data.head()

In [0]:
# Prediction target: the Price column
y = data['Price']
y.head()

In [0]:
# Feature matrix: every column except the target
X = data.drop(columns=['Price'])
X.head()

In [0]:
from sklearn.model_selection import train_test_split

# Split features and target into training and validation subsets.
# No test_size is passed, so the library default split ratio applies;
# random_state=0 fixes the shuffle so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

In [0]:
# Preview the object-dtype (categorical) columns of the training split
ob = X_train.select_dtypes(include='object')
print(ob.head())

# Collect the names of the categorical columns, in their original order
c = (X_train.dtypes == 'object')
object_cols = [name for name, is_object in c.items() if is_object]

In [0]:
# Approach 1: discard every object-dtype column from both splits
drop_X_train, drop_X_valid = (
    split.select_dtypes(exclude='object') for split in (X_train, X_valid)
)

In [0]:
# Columns that can be safely label encoded: every category present in the
# validation split must also occur in the training split, because an encoder
# fitted on the training data cannot transform a label it has never seen.
# Categories that occur only in the training split are harmless, so a subset
# check (rather than strict set equality) is sufficient.
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.
# A comprehension keeps the original column order, which a set difference
# would not guarantee.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

# Drop categorical columns that will not be encoded
good_X_train = X_train.drop(bad_label_cols, axis=1)
good_X_valid = X_valid.drop(bad_label_cols, axis=1)

In [0]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the filtered frames are left untouched
label_X_train, label_X_valid = good_X_train.copy(), good_X_valid.copy()

# Replace each categorical column with integer codes.
# The encoder is re-fitted on the training values of every column, and the
# mapping learned there is applied to both the training and validation data.
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_encoder.fit(good_X_train[col])
    label_X_train[col] = label_encoder.transform(good_X_train[col])
    label_X_valid[col] = label_encoder.transform(good_X_valid[col])

In [0]:
from sklearn.preprocessing import OrdinalEncoder

# Columns to ordinal-encode: the categorical columns kept in good_X_train,
# i.e. those whose validation categories all occur in the training split.
ordinal_cols = list(good_label_cols)

# Start from fresh copies so the filtered frames are left untouched
label_X_train = good_X_train.copy()
label_X_valid = good_X_valid.copy()

# Ordinal encoding
# OrdinalEncoder works on several columns at once and encodes each column's
# categories as values between 0 and n_categories-1, so one fit on the
# training split replaces a per-column loop.
ordinal_encoder = OrdinalEncoder()
if ordinal_cols:  # the encoder cannot be fitted on zero columns
    label_X_train[ordinal_cols] = ordinal_encoder.fit_transform(good_X_train[ordinal_cols])
    label_X_valid[ordinal_cols] = ordinal_encoder.transform(good_X_valid[ordinal_cols])

In [0]:
# Get number of unique entries in each column with categorical data
object_nunique = [X_train[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order.
# An expression in the middle of a cell is not displayed, so print explicitly.
print(sorted(d.items(), key=lambda x: x[1]))

# Columns that will be one-hot encoded: fewer than 10 distinct values, so the
# encoding adds only a handful of new columns per feature
low_cardinality_cols = [col for col in object_cols if d[col] < 10]

# Columns that will be dropped from the dataset (original column order kept)
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

In [0]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
# handle_unknown='ignore': avoid errors when the validation data contains classes that aren't represented in the training data
# Dense output: ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).
# The keyword for dense output was renamed from `sparse` to `sparse_output` in
# newer scikit-learn releases, so try the new name first and fall back to the
# old one on versions that do not know it.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only, then reuse the fitted encoder for validation.
# The encoder returns a plain array, so re-attach the original row index to
# keep the rows aligned with the numerical features in the concat below.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]),
                             index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]),
                             index=X_valid.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled with integers while the numerical columns
# have string names; make every label a string so estimators that require
# uniformly typed feature names accept the frames.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)