In [4]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd; import numpy as np
from seedtools.seed_file import load_seed
from seedtools import configure_data_path,DATA_PATH

In [5]:
data = load_seed("adult.csv",quiet=True).data
data =  data.drop(columns=["Age","fnlgwt","Education Num","Capital Loss","Hours/Week","Capital Gain"])

In [6]:
data.head()

Unnamed: 0,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Country,Above/Below 50k
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [7]:
data["Race"].unique()

array([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
       ' Other'], dtype=object)

In [8]:
x_train,x_test,y_train,y_test = train_test_split(data.drop(columns=["Above/Below 50k"],axis=1),data["Above/Below 50k"],test_size=0.2)

#### LABEL ENCODER :  use only for Target columns not features 

In [9]:
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test =  le.transform(y_test)

In [10]:
le.classes_

array([' <=50K', ' >50K'], dtype=object)

In [11]:
y_train[:20]

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1])

#### ORDINAL ENCODING 
##### It gives number based on value/importance like poor-> 0 good->1 great-> 2

In [12]:
df = data[["Race","Workclass","Above/Below 50k"]]
df.Race.replace({" Asian-Pac-Islander":"Other"," Amer-Indian-Eskimo":"Other"," Other":"Other"},inplace=True)
df.Workclass.replace({" State-gov":"gov"," Local-gov":"gov"," Federal-gov":"gov",
                     " Never-worked":"unemployed"," Without-pay":"unemployed",
                     " Private":"other"," Self-emp-inc":"other"," Self-emp-not-inc":"other",' ?':"other"},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Race.replace({" Asian-Pac-Islander":"Other"," Amer-Indian-Eskimo":"Other"," Other":"Other"},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Race.replace({" Asian-Pac-Islander":"Other"," Amer-Indian-Eskimo":"Other"," Other":"Other"},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 

In [13]:
x_train,x_test,y_train,y_test =  train_test_split(df.drop(columns=["Above/Below 50k"],axis=1),df["Above/Below 50k"])

In [14]:
oe = OrdinalEncoder(categories=[["Other"," Black"," White"],["unemployed","gov","other"]])

In [15]:
x_train_trans  = oe.fit_transform(x_train)


In [16]:
pd.DataFrame(x_train_trans,columns=["Race","Employed"])

Unnamed: 0,Race,Employed
0,2.0,2.0
1,2.0,2.0
2,2.0,1.0
3,2.0,2.0
4,2.0,2.0
...,...,...
24415,2.0,2.0
24416,2.0,2.0
24417,2.0,2.0
24418,2.0,2.0


In [17]:
pd.DataFrame(x_train)

Unnamed: 0,Race,Workclass
12003,White,other
844,White,other
19336,White,gov
17368,White,other
26079,White,other
...,...,...
1734,White,other
24299,White,other
29689,White,other
16750,White,other


#### One Hot Encoding and pandas dummies 
##### The main differnce b/w these two is that dummies does not remember  the encoding thus cannot be applied to test class whereas one hot encoding remembers it and same encoding can be applied to test class and other categories get ignored 

In [18]:
df = data[["Race","Workclass","Sex"]]
df.Race.replace({" Asian-Pac-Islander":"Other"," Amer-Indian-Eskimo":"Other"," Other":"Other"},inplace=True)
df.Workclass.replace({" State-gov":"gov"," Local-gov":"gov"," Federal-gov":"gov",
                     " Never-worked":"unemployed"," Without-pay":"unemployed",
                     " Private":"other"," Self-emp-inc":"other"," Self-emp-not-inc":"other",' ?':"other"},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Race.replace({" Asian-Pac-Islander":"Other"," Amer-Indian-Eskimo":"Other"," Other":"Other"},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Race.replace({" Asian-Pac-Islander":"Other"," Amer-Indian-Eskimo":"Other"," Other":"Other"},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 

In [19]:
x_train,x_test,y_train,y_test =  train_test_split(df.drop(columns=["Sex"],axis=1),df["Sex"])

In [20]:
pd.get_dummies(df,columns=["Race","Workclass"])

Unnamed: 0,Sex,Race_ Black,Race_ White,Race_Other,Workclass_gov,Workclass_other,Workclass_unemployed
0,Male,False,True,False,True,False,False
1,Male,False,True,False,False,True,False
2,Male,False,True,False,False,True,False
3,Male,True,False,False,False,True,False
4,Female,True,False,False,False,True,False
...,...,...,...,...,...,...,...
32556,Female,False,True,False,False,True,False
32557,Male,False,True,False,False,True,False
32558,Female,False,True,False,False,True,False
32559,Male,False,True,False,False,True,False


In [21]:
ohe = OneHotEncoder(handle_unknown="ignore")

In [22]:
x_train_new = ohe.fit_transform(x_train).toarray()
data_hash = data[["Race","Workclass"]]

In [23]:
data_hash.head()

Unnamed: 0,Race,Workclass
0,White,State-gov
1,White,Self-emp-not-inc
2,White,Private
3,Black,Private
4,Black,Private


##### SEE belowe table as in workclass nothing matches so it is given zero but in race something matches

In [24]:
ohe.transform(data_hash).toarray()[10:30]

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])