In [72]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from pandas.api.types import is_numeric_dtype

Test Data
==

In [11]:
data = pd.read_csv("Run1/test.csv")
#test_data.describe()

Simple Version
==

In [23]:
def to_numeric(data):
    
    if not isinstance(data, pd.core.frame.DataFrame):
        raise TypeError("Expected data to be a pandas data frame!")
    
    result = data.copy()
    result = result.fillna("NA")
    
    for field in result:
        
        column = result[field]
        
        if is_numeric_dtype(column.dtype):
            continue
        
        column.apply(str)
        column = column.astype('string')
    
        encoder = LabelEncoder()
        column = encoder.fit_transform(column.values)
        
        result[field] = column
        
    return result

Tests
--

Simple Test:

In [24]:
to_numeric(data).head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,3,95,11622,1,1,3,3,0,...,120,0,2,2,1,0,6,2010,9,4
1,1462,20,4,96,14267,1,1,0,3,0,...,0,0,2,4,0,12500,6,2010,9,4
2,1463,60,4,89,13830,1,1,0,3,0,...,0,0,2,2,1,0,3,2010,9,4
3,1464,60,4,93,9978,1,1,0,3,0,...,0,0,2,4,1,0,6,2010,9,4
4,1465,120,4,58,5005,1,1,0,1,0,...,144,0,2,4,1,0,1,2010,9,4


A relevant error should be raised if the input is an unexpected type

In [25]:
to_numeric(data.Id)

TypeError: Expected data to be a pandas data frame!

Customization
==

In [86]:
def to_numeric(data, encoder, *args, **kwargs):
    
    if not isinstance(data, pd.core.frame.DataFrame):
        raise TypeError("Expected data to be a pandas data frame!")
    
    result = data.copy()
    result = result.fillna("NA")
    
    for field in result:
        
        column = result[field]
        
        if is_numeric_dtype(column.dtype):
            continue
        
        column = column.apply(str)
        column = column.astype('string')
    
        try:
            column = encoder(column, *args, **kwargs)
            result[field] = column
        
        except:
            raise ValueError("Error while executing encoder")
        
    return result

Tests
--

Same ol' encoder

In [69]:
def scikit_encoder(column):
    encoder = LabelEncoder()
    return encoder.fit_transform(column.values)

to_numeric(data, scikit_encoder).head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,3,95,11622,1,1,3,3,0,...,120,0,2,2,1,0,6,2010,9,4
1,1462,20,4,96,14267,1,1,0,3,0,...,0,0,2,4,0,12500,6,2010,9,4
2,1463,60,4,89,13830,1,1,0,3,0,...,0,0,2,2,1,0,3,2010,9,4


Extra arguments are passed to the encoder

In [70]:
def constant_encoder(column, constant):
    return column.apply(lambda x: constant)

to_numeric(data, constant_encoder, 0).head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,0,0,11622,0,0,0,0,0,...,120,0,0,0,0,0,6,2010,0,0
1,1462,20,0,0,14267,0,0,0,0,0,...,0,0,0,0,0,12500,6,2010,0,0
2,1463,60,0,0,13830,0,0,0,0,0,...,0,0,0,0,0,0,3,2010,0,0


Broken encoder

In [None]:
def broken_encoder(column):
    raise ValueError("This is broken")

to_numeric(data, broken_encoder)

Manual Encoder
==

In [105]:
def manual(column, override):
    
    if not isinstance(override, dict):
        raise TypeError("Expected a dictionary")
        
    if not column.name in override:
        return None
        
    lookup = override[column.name]
    
    if not isinstance(lookup, dict):
        raise TypeError(f"Expected entry for {column.name} to be a dictionary")
    
    new_values = np.zeros(len(column), dtype=str)
    missing_values = set()
    
    for i, value in enumerate(column):
        
        if not value in lookup:
            missing_values.add(value)
            continue
        
        new_values[i] = lookup[value]
    
    if len(missing_values) > 0:
        msg = f"Missing entries for column {column.name}: "
        msg += ", ".join(missing_values)
        raise ValueError(msg)
        
    return new_values

Tests
--

Simple test

In [109]:
override = {
    "MSZoning": {
        "RH": 0,
        "RL": 1,
        "RM": 2,
        "NA": 3,
        "C (all)": 4,
        "FV": 5
    }
}

to_numeric(data, manual, override).head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,0,,11622,,,,,,...,120,0,,,,0,6,2010,,
1,1462,20,1,,14267,,,,,,...,0,0,,,,12500,6,2010,,
2,1463,60,1,,13830,,,,,,...,0,0,,,,0,3,2010,,


Missing entries for a column raise an error.
Missing columns are ignored.

In [None]:
override = {
    "MSZoning": {}
}

to_numeric(data, manual, override).head(3)

Final Version - Combined
==

In [136]:
def scikit_encoder(column):
    encoder = LabelEncoder()
    return encoder.fit_transform(column.values)

def manual_encoder(column, lookup):
    
    if not isinstance(lookup, dict):
        raise TypeError(f"Expected entry for {column.name} to be a dictionary")
    
    new_values = np.zeros(len(column))
    missing_values = set()
    
    is_integral = True
    
    for i, value in enumerate(column):
        
        if not value in lookup:
            missing_values.add(value)
            continue
            
        new_value = lookup[value]
        
        if not is_numeric_dtype(type(new_value)):
            msg = f"'{new_value}' given for '{value}'"
            msg += f" in column {column.name} is not numeric!"
            raise ValueError(msg)
        
        new_values[i] = lookup[value]
        
        if not np.issubdtype(type(new_value), np.integer):
            is_integral = False
    
    if len(missing_values) > 0:
        msg = f"Missing entries for column {column.name}: "
        msg += ", ".join(missing_values)
        raise ValueError(msg)
        
    if is_integral:
        new_values = new_values.astype(int)
        
    return new_values

def to_numeric(data, override = None):
    
    if not isinstance(data, pd.core.frame.DataFrame):
        raise TypeError("Expected data to be a pandas data frame!")
    
    result = data.copy()
    result = result.fillna("NA")
    
    for field in result:
        
        column = result[field]
        
        if is_numeric_dtype(column.dtype):
            continue
        
        column.apply(str)
        column = column.astype('string')
    
        if column.name in override:
            lookup = override[column.name]
            print(column.name)
            column = manual_encoder(column, lookup)
        else:
            column = scikit_encoder(column)
        
        result[field] = column
    
    return result

In [141]:
override = {
    "MSZoning": {
        "RH": 40,
        "RL": 41,
        "RM": 42,
        "NA": 40,
        "C (all)": 44,
        "FV": 45
    }
}

to_numeric(data, override).head(3)

MSZoning


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,40,95,11622,1,1,3,3,0,...,120,0,2,2,1,0,6,2010,9,4
1,1462,20,41,96,14267,1,1,0,3,0,...,0,0,2,4,0,12500,6,2010,9,4
2,1463,60,41,89,13830,1,1,0,3,0,...,0,0,2,2,1,0,3,2010,9,4


In [135]:
np.zeros(3).dtype

dtype('float64')