# CPSC529: 04_DataPreparation_2_Encoding_Imputation

In [36]:
import pandas as pd
import scipy
import numpy

# to display nice model diagram
from sklearn import set_config
set_config(display='diagram')

In [37]:
import warnings
warnings.filterwarnings("ignore")

In [38]:
import pandas as pd
import numpy as np

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

## 1. Encoding of categorical variables

- https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline.html
- https://pbpython.com/categorical-encoding.html

In [39]:
#importing the necassary libraries
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

#reading the dataset
df=pd.read_csv(os.path.join('data',"50_Startups.csv"))
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


## 1.1 Why Encoding?

In [55]:
# 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split

# get the X (feature matrix) and y (target)
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,y)

# will generate error if data is fed into linear regression model
# linear_model = LinearRegression(X_train, y_train)
# linear_model.fit()

# will generate error if data is fed into tree regression model
# tree_model = DecisionTreeRegressor()
# tree_model.fit(X_train, y_train)

## 1.2 OrinalEncoder

In [42]:
# Ordinal_encoder object
ordinal_encoder =OrdinalEncoder()

# Encode labels in column. 
df['State_Label'] = ordinal_encoder.fit_transform(df[['State']])
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_Label
0,165349.2,136897.8,471784.1,New York,192261.83,2.0
1,162597.7,151377.59,443898.53,California,191792.06,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,1.0
3,144372.41,118671.85,383199.62,New York,182901.99,2.0
4,142107.34,91391.77,366168.42,Florida,166187.94,1.0


## 1.3 OnHotEncoder

### 1.3.2 OneHotEncoder without Spare matrix

In [43]:
# creating instance of one-hot-encoder ()
# The output will be a Numpy array if specifying sparse=False. 
encoder = OneHotEncoder(sparse=False)
state_encoded = encoder.fit_transform(df[['State']])
print (encoder.__dict__, "\n")
print (encoder.categories_,"\n")
print (state_encoded[:5])

{'categories': 'auto', 'sparse': False, 'dtype': <class 'numpy.float64'>, 'handle_unknown': 'error', 'drop': None, 'n_features_in_': 1, 'feature_names_in_': array(['State'], dtype=object), 'categories_': [array(['California', 'Florida', 'New York'], dtype=object)], 'drop_idx_': None} 

[array(['California', 'Florida', 'New York'], dtype=object)] 

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [44]:
# automatically generate feature names for each column (State_category1, State_category2....)
feature_names = encoder.get_feature_names_out(input_features=["State"])
print(feature_names)

# create a dataframe given data and the feature names
state_encoded = pd.DataFrame(state_encoded, columns=feature_names)

# # merge with main df bridge_df on key values
df.join(state_encoded).head()

['State_California' 'State_Florida' 'State_New York']


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_Label,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,New York,192261.83,2.0,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,California,191792.06,0.0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,1.0,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,New York,182901.99,2.0,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,Florida,166187.94,1.0,0.0,1.0,0.0


### 1.3.2 OneHotEncoder with Spare matrix

In [45]:
# creating instance of one-hot-encoder
# The output is SciPy sparse matrix, instead of a Numpy array (faster). 
# The sparse matrix can be converted into Numpy array using toarray() method 
encoder2 = OneHotEncoder()
state_encoded2 = encoder2.fit_transform(df[['State']])
print("Sparse matrix representation: ")
print(state_encoded2[:5])

state_encoded2 = encoder2.fit_transform(df[['State']]).toarray()
print("\nSparse matrix => numpy array representation: ")
print(state_encoded2[:5])

Sparse matrix representation: 
  (0, 2)	1.0
  (1, 0)	1.0
  (2, 1)	1.0
  (3, 2)	1.0
  (4, 1)	1.0

Sparse matrix => numpy array representation: 
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [46]:
# create a dataframe with data and feature names
feature_names2 = encoder2.get_feature_names_out(input_features=["State"])
state_encoded2 = pd.DataFrame(state_encoded2, columns=feature_names2)

# # merge with main df bridge_df on key values
df.join(state_encoded2).head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_Label,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,New York,192261.83,2.0,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,California,191792.06,0.0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,1.0,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,New York,182901.99,2.0,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,Florida,166187.94,1.0,0.0,1.0,0.0


## 1.4 Encoding Properties

In [47]:
# list of useful meta information
print(encoder2, "\n")
print(encoder2.__dict__, "\n")     # hyperparameters and attributes
print(encoder2.get_params(), "\n") # hyperparameters
print([attr for attr in encoder2.__dict__ if attr.endswith("_")], "\n") # attributes
print(dir(encoder2), "\n")         # methods and attributes

OneHotEncoder() 

{'categories': 'auto', 'sparse': True, 'dtype': <class 'numpy.float64'>, 'handle_unknown': 'error', 'drop': None, 'n_features_in_': 1, 'feature_names_in_': array(['State'], dtype=object), 'categories_': [array(['California', 'Florida', 'New York'], dtype=object)], 'drop_idx_': None} 

{'categories': 'auto', 'drop': None, 'dtype': <class 'numpy.float64'>, 'handle_unknown': 'error', 'sparse': True} 

['n_features_in_', 'feature_names_in_', 'categories_', 'drop_idx_'] 

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_X', '_check_feature_names', '_check_n_features', '_compute_drop_idx', '_fit', '_get_feature', '_get_param_names', '_get_tags'

## 1.5 A little more on understores

**Side note: leading and trailing underscores in sklearn**

- *the trailing underscore* (e.g., self.gamma_) in class attributes is a scikit-learn convention to denote "estimated" or "fitted" attributes. 
- *The leading underscores* are (e.g., _sigmoid(self, z)) denote private methods that the user should not bother with

Read more:https://github.com/rasbt/python-machine-learning-book/blob/master/faq/underscore-convention.md

**Meaning of Underscores in Python**

- *Single Leading Underscore(_var)*: Naming convention indicating a name is meant for internal use. Generally not enforced by the Python interpreter (except in wildcard imports) and meant as a hint to the programmer only.
- *Single Trailing Underscore(var_)*: Used by convention to avoid naming conflicts with Python keywords.
- *Double Leading Underscore(__var)*: Triggers name mangling when used in a class context. Enforced by the Python interpreter.
- *Double Leading and Trailing Underscore(__var__)*: Indicates special methods defined by the Python language. Avoid this naming scheme for your own attributes.
- *Single Underscore(_)*: Sometimes used as a name for temporary or insignificant variables (“don’t care”). Also: The result of the last expression in a Python REPL.

Read more: https://stackoverflow.com/questions/1301346/what-is-the-meaning-of-single-and-double-underscore-before-an-object-name

## 2. Imputing Missing Data 

https://dzone.com/articles/imputing-missing-data-using-sklearn-simpleimputer

In [48]:
import pandas as pd
import numpy as np

students = [[85, 'M', 'verygood'],
           [95, 'F', 'excellent'],
           [75, None,'good'],
           [np.NaN, 'M', 'average'],
           [70, 'M', 'good'],
           [np.NaN, None, 'verygood']]

df1 = pd.DataFrame(students)
df1.columns = ['marks', 'gender', 'result']
df1

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood


### 2.2 Imputing on multiple columns

In [49]:
from sklearn.impute import SimpleImputer
df2=df1.copy()

# do imputation for the column of gender
imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
df2.gender = imputer.fit_transform(df2['gender'].values.reshape(-1,1))

# do imputation for the column of marks
imputer2 = SimpleImputer(missing_values=np.NaN, strategy='mean')
df2.marks = imputer2.fit_transform(df2['marks'].values.reshape(-1,1))[:,0]

display('df1', 'df2')


Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,81.25,M,average
4,70.0,M,good
5,81.25,M,verygood


In [50]:
df3=df1.copy()

# do imputation for the column of gender
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(df3[['gender', 'marks']])

array([['M', 85.0],
       ['F', 95.0],
       [None, 75.0],
       ['M', 70.0],
       ['M', 70.0],
       [None, 70.0]], dtype=object)

### 2.2 Imputation strategies

In [51]:

# Imputing with mean value
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')
df2['marks_mean'] = imputer.fit_transform(df2['marks'].values.reshape(-1,1))[:,0]

# Imputing with median value
imputer = SimpleImputer(missing_values=np.NaN, strategy='median')
df2['marks_median'] = imputer.fit_transform(df2['marks'].values.reshape(-1,1))[:,0]

# Imputing with most frequent / mode value
imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
df2['marks_most_frequent'] = imputer.fit_transform(df2['marks'].values.reshape(-1,1))[:,0]

# Imputing with constant value; The command below replaces the missing
#value with constant value such as 80
imputer = SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value=80)
df2['marks_constant'] = imputer.fit_transform(df2['marks'].values.reshape(-1,1))[:,0]

df2.drop(columns=['marks'])

Unnamed: 0,gender,result,marks_mean,marks_median,marks_most_frequent,marks_constant
0,M,verygood,85.0,85.0,85.0,85.0
1,F,excellent,95.0,95.0,95.0,95.0
2,M,good,75.0,75.0,75.0,75.0
3,M,average,81.25,81.25,81.25,81.25
4,M,good,70.0,70.0,70.0,70.0
5,M,verygood,81.25,81.25,81.25,81.25


## 6. Polynomial Features

In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

df_data = pd.DataFrame({
    'x': np.random.randint(low=1, high=10, size=5),
    'y': np.random.randint(low=-1, high=1, size=5)})

PolyFeats = PolynomialFeatures(degree=2,include_bias=False)
poly_data = PolyFeats.fit_transform(df_data)
poly_names= PolyFeats.get_feature_names_out(df_data.columns)
df_poly_data = pd.DataFrame(poly_data, columns=poly_names)

In [53]:
display('df_data', 'df_poly_data')

Unnamed: 0,x,y
0,5,0
1,8,0
2,9,0
3,6,-1
4,7,0

Unnamed: 0,x,y,x^2,x y,y^2
0,5.0,0.0,25.0,0.0,0.0
1,8.0,0.0,64.0,0.0,0.0
2,9.0,0.0,81.0,0.0,0.0
3,6.0,-1.0,36.0,-6.0,1.0
4,7.0,0.0,49.0,0.0,0.0
