In [13]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from pandas.api.types import is_numeric_dtype

def scikit_encoder(column):
    encoder = LabelEncoder()
    return encoder.fit_transform(column.values)

def manual_encoder(column, lookup):
    
    if not isinstance(lookup, dict):
        raise TypeError(f"Expected entry for {column.name} to be a dictionary")
    
    new_values = np.zeros(len(column))
    missing_values = set()
    
    is_integral = True
    
    for i, value in enumerate(column):
        
        if not value in lookup:
            missing_values.add(value)
            continue
            
        new_value = lookup[value]
        
        if not is_numeric_dtype(type(new_value)):
            msg = f"'{new_value}' given for '{value}'"
            msg += f" in column {column.name} is not numeric!"
            raise ValueError(msg)
        
        new_values[i] = lookup[value]
        
        if not np.issubdtype(type(new_value), np.integer):
            is_integral = False
    
    if len(missing_values) > 0:
        msg = f"Missing entries for column {column.name}: "
        msg += ", ".join(missing_values)
        raise ValueError(msg)
        
    if is_integral:
        new_values = new_values.astype(int)
        
    return new_values

def encode(data, override = None):
    
    if not isinstance(data, pd.core.frame.DataFrame):
        raise TypeError("Expected data to be a pandas data frame!")
    
    result = data.copy()
    result = result.fillna("NA")
    
    for field in result:
        
        column = result[field]
        
        if is_numeric_dtype(column.dtype):
            continue
        
        column.apply(str)
        column = column.astype('string')
    
        if column.name in override:
            lookup = override[column.name]
            column = manual_encoder(column, lookup)
        else:
            column = scikit_encoder(column)
        
        result[field] = column
    
    return result

Example
==

In [9]:
data = pd.read_csv("Run1/test.csv")
data.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal


Below, overrides are given for the "MSZoning" and "LandContour" features. The rest are automatically converted with no order guaranteed.

In [10]:
override = {
    "MSZoning": {
        "RH": 40,
        "RL": 41,
        "RM": 42,
        "NA": 40,
        "C (all)": 44,
        "FV": 45
    },
    "LandContour": {
        "Bnk": 0,
        "Low": 1,
        "HLS": 2,
        "Lvl": 3
    }
}

result = encode(data, override).head(3)
result

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,40,95,11622,1,1,3,3,0,...,120,0,2,2,1,0,6,2010,9,4
1,1462,20,41,96,14267,1,1,0,3,0,...,0,0,2,4,0,12500,6,2010,9,4
2,1463,60,41,89,13830,1,1,0,3,0,...,0,0,2,2,1,0,3,2010,9,4


In [11]:
result.dtypes.apply(str).values

array(['int64', 'int64', 'int32', 'int32', 'int64', 'int32', 'int32',
       'int32', 'int32', 'int32', 'int32', 'int32', 'int32', 'int32',
       'int32', 'int32', 'int32', 'int64', 'int64', 'int64', 'int64',
       'int32', 'int32', 'int32', 'int32', 'int32', 'int32', 'int32',
       'int32', 'int32', 'int32', 'int32', 'int32', 'int32', 'int32',
       'int32', 'int32', 'int32', 'int32', 'int32', 'int32', 'int32',
       'int32', 'int64', 'int64', 'int64', 'int64', 'int32', 'int32',
       'int64', 'int64', 'int64', 'int64', 'int32', 'int64', 'int32',
       'int64', 'int32', 'int32', 'int32', 'int32', 'int32', 'int32',
       'int32', 'int32', 'int32', 'int64', 'int64', 'int64', 'int64',
       'int64', 'int64', 'int32', 'int32', 'int32', 'int64', 'int64',
       'int64', 'int32', 'int32'], dtype=object)

To discover what values need to be converted for a given feature, give an empty lookup, and the error message will contain the list.

In [12]:
override = {
    "MSZoning": {}
}

encode(data, override).head(3)

ValueError: Missing entries for column MSZoning: RH, NA, RM, C (all), RL, FV