# Data Preprocessing
This example use the data income.csv

In [101]:
import pandas as pd

data = pd.read_csv('../data/income.csv')

In [102]:
data.head()

Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
0,45,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,28,United-States,"less than or equal to 50,000"
1,24,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
2,44,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
3,27,Private,9th,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,"less than or equal to 50,000"
4,20,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,0,0,35,United-States,"less than or equal to 50,000"


In [104]:
data['SalStat'].values

array([' less than or equal to 50,000', ' less than or equal to 50,000',
       ' greater than 50,000', ..., ' less than or equal to 50,000',
       ' less than or equal to 50,000', ' less than or equal to 50,000'],
      shape=(31978,), dtype=object)

## Features in Machine Learning

In [105]:
data.columns

Index(['age', 'JobType', 'EdType', 'maritalstatus', 'occupation',
       'relationship', 'race', 'gender', 'capitalgain', 'capitalloss',
       'hoursperweek', 'nativecountry', 'SalStat'],
      dtype='object')

### Data Type

In [106]:
data.dtypes

age               int64
JobType          object
EdType           object
maritalstatus    object
occupation       object
relationship     object
race             object
gender           object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
nativecountry    object
SalStat          object
dtype: object

In [107]:
data['age'].dtype

dtype('int64')

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31978 entries, 0 to 31977
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            31978 non-null  int64 
 1   JobType        31978 non-null  object
 2   EdType         31978 non-null  object
 3   maritalstatus  31978 non-null  object
 4   occupation     31978 non-null  object
 5   relationship   31978 non-null  object
 6   race           31978 non-null  object
 7   gender         31978 non-null  object
 8   capitalgain    31978 non-null  int64 
 9   capitalloss    31978 non-null  int64 
 10  hoursperweek   31978 non-null  int64 
 11  nativecountry  31978 non-null  object
 12  SalStat        31978 non-null  object
dtypes: int64(4), object(9)
memory usage: 3.2+ MB


In [109]:
import numpy as np

print(np.unique(data['JobType']))
print(np.unique(data['occupation']))
print(data['JobType'].value_counts())
print(data['occupation'].value_counts())

[' ?' ' Federal-gov' ' Local-gov' ' Never-worked' ' Private'
 ' Self-emp-inc' ' Self-emp-not-inc' ' State-gov' ' Without-pay']
[' ?' ' Adm-clerical' ' Armed-Forces' ' Craft-repair' ' Exec-managerial'
 ' Farming-fishing' ' Handlers-cleaners' ' Machine-op-inspct'
 ' Other-service' ' Priv-house-serv' ' Prof-specialty' ' Protective-serv'
 ' Sales' ' Tech-support' ' Transport-moving']
JobType
Private             22286
Self-emp-not-inc     2499
Local-gov            2067
?                    1809
State-gov            1279
Self-emp-inc         1074
Federal-gov           943
Without-pay            14
Never-worked            7
Name: count, dtype: int64
occupation
Prof-specialty       4038
Craft-repair         4030
Exec-managerial      3992
Adm-clerical         3721
Sales                3584
Other-service        3212
Machine-op-inspct    1966
?                    1816
Transport-moving     1572
Handlers-cleaners    1350
Farming-fishing       989
Tech-support          912
Protective-serv       644


## Missing Data

In [110]:
data.isnull().sum()

age              0
JobType          0
EdType           0
maritalstatus    0
occupation       0
relationship     0
race             0
gender           0
capitalgain      0
capitalloss      0
hoursperweek     0
nativecountry    0
SalStat          0
dtype: int64

### Exploring Data

In [111]:
data.describe()

Unnamed: 0,age,capitalgain,capitalloss,hoursperweek
count,31978.0,31978.0,31978.0,31978.0
mean,38.579023,1064.360623,86.739352,40.41785
std,13.662085,7298.596271,401.594301,12.345285
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [112]:
data.describe(include= 'O')

Unnamed: 0,JobType,EdType,maritalstatus,occupation,relationship,race,gender,nativecountry,SalStat
count,31978,31978,31978,31978,31978,31978,31978,31978,31978
unique,9,16,7,15,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,"less than or equal to 50,000"
freq,22286,10368,14692,4038,12947,27430,21370,29170,24283


'O' means Object.
And we see there are imputities in the dataset. Now we convert "?" into NaN.

In [113]:
data = pd.read_csv('../data/income.csv', na_values=["?"], skipinitialspace=True)
data.head()

Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
0,45,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,28,United-States,"less than or equal to 50,000"
1,24,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
2,44,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
3,27,Private,9th,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,"less than or equal to 50,000"
4,20,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,0,0,35,United-States,"less than or equal to 50,000"


In [114]:
print(data['JobType'].value_counts())

JobType
Private             22286
Self-emp-not-inc     2499
Local-gov            2067
State-gov            1279
Self-emp-inc         1074
Federal-gov           943
Without-pay            14
Never-worked            7
Name: count, dtype: int64


In [115]:
data.isnull().sum()

age                 0
JobType          1809
EdType              0
maritalstatus       0
occupation       1816
relationship        0
race                0
gender              0
capitalgain         0
capitalloss         0
hoursperweek        0
nativecountry       0
SalStat             0
dtype: int64

In [116]:
data.describe()

Unnamed: 0,age,capitalgain,capitalloss,hoursperweek
count,31978.0,31978.0,31978.0,31978.0
mean,38.579023,1064.360623,86.739352,40.41785
std,13.662085,7298.596271,401.594301,12.345285
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [117]:
data.describe(include= 'O')

Unnamed: 0,JobType,EdType,maritalstatus,occupation,relationship,race,gender,nativecountry,SalStat
count,30169,31978,31978,30162,31978,31978,31978,31978,31978
unique,8,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,"less than or equal to 50,000"
freq,22286,10368,14692,4038,12947,27430,21370,29170,24283


In [118]:
data['JobType']=data['JobType'].fillna(data['JobType'].describe().top)
data['occupation']=data['occupation'].fillna(data['occupation'].describe().top)
print(data.isnull().sum())

age              0
JobType          0
EdType           0
maritalstatus    0
occupation       0
relationship     0
race             0
gender           0
capitalgain      0
capitalloss      0
hoursperweek     0
nativecountry    0
SalStat          0
dtype: int64


In [119]:
data.describe(include= 'O')

Unnamed: 0,JobType,EdType,maritalstatus,occupation,relationship,race,gender,nativecountry,SalStat
count,31978,31978,31978,31978,31978,31978,31978,31978,31978
unique,8,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,"less than or equal to 50,000"
freq,24095,10368,14692,5854,12947,27430,21370,29170,24283


## Feature Encoding
Now we set the values in the 'SalStat' as array Y. The rest features in the dataset will act as array X.

In [120]:
data['SalStat'] = data['SalStat'].map({" less than or equal to 50,000":0, " greater than 50,000":1})
data['SalStat'].values

array([0, 0, 1, ..., 0, 0, 0], shape=(31978,))

We generate a new dataset.

In [121]:
new_data = pd.get_dummies(data, drop_first=True)
new_data.head()

Unnamed: 0,age,capitalgain,capitalloss,hoursperweek,SalStat,JobType_Local-gov,JobType_Never-worked,JobType_Private,JobType_Self-emp-inc,JobType_Self-emp-not-inc,...,nativecountry_Portugal,nativecountry_Puerto-Rico,nativecountry_Scotland,nativecountry_South,nativecountry_Taiwan,nativecountry_Thailand,nativecountry_Trinadad&Tobago,nativecountry_United-States,nativecountry_Vietnam,nativecountry_Yugoslavia
0,45,0,0,28,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
1,24,0,0,40,0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,44,0,0,40,1,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,27,0,0,40,0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,20,0,0,35,0,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False


In [122]:
columns_list = list(new_data.columns)
columns_list

['age',
 'capitalgain',
 'capitalloss',
 'hoursperweek',
 'SalStat',
 'JobType_Local-gov',
 'JobType_Never-worked',
 'JobType_Private',
 'JobType_Self-emp-inc',
 'JobType_Self-emp-not-inc',
 'JobType_State-gov',
 'JobType_Without-pay',
 'EdType_11th',
 'EdType_12th',
 'EdType_1st-4th',
 'EdType_5th-6th',
 'EdType_7th-8th',
 'EdType_9th',
 'EdType_Assoc-acdm',
 'EdType_Assoc-voc',
 'EdType_Bachelors',
 'EdType_Doctorate',
 'EdType_HS-grad',
 'EdType_Masters',
 'EdType_Preschool',
 'EdType_Prof-school',
 'EdType_Some-college',
 'maritalstatus_Married-AF-spouse',
 'maritalstatus_Married-civ-spouse',
 'maritalstatus_Married-spouse-absent',
 'maritalstatus_Never-married',
 'maritalstatus_Separated',
 'maritalstatus_Widowed',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Handlers-cleaners',
 'occupation_Machine-op-inspct',
 'occupation_Other-service',
 'occupation_Priv-house-serv',
 'occupation_Prof-specialt

In [123]:
features = list(set(columns_list) - set(["SalStat"]))
features

['EdType_5th-6th',
 'nativecountry_Guatemala',
 'JobType_Local-gov',
 'EdType_Prof-school',
 'nativecountry_Puerto-Rico',
 'EdType_11th',
 'occupation_Other-service',
 'nativecountry_Jamaica',
 'EdType_12th',
 'EdType_Assoc-voc',
 'JobType_Self-emp-inc',
 'capitalgain',
 'maritalstatus_Married-civ-spouse',
 'nativecountry_Germany',
 'occupation_Armed-Forces',
 'maritalstatus_Separated',
 'maritalstatus_Widowed',
 'EdType_Masters',
 'nativecountry_Iran',
 'EdType_9th',
 'nativecountry_El-Salvador',
 'race_Black',
 'nativecountry_Dominican-Republic',
 'nativecountry_Peru',
 'race_Asian-Pac-Islander',
 'EdType_Bachelors',
 'nativecountry_Cuba',
 'nativecountry_Ireland',
 'nativecountry_Scotland',
 'nativecountry_Portugal',
 'relationship_Not-in-family',
 'nativecountry_Taiwan',
 'relationship_Other-relative',
 'JobType_Without-pay',
 'nativecountry_Italy',
 'race_White',
 'nativecountry_Mexico',
 'nativecountry_Columbia',
 'nativecountry_Laos',
 'nativecountry_Haiti',
 'occupation_Prof-sp

In [124]:
x = new_data[features].values
x

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, True, ..., False, False, False],
       [False, False, False, ..., False, False, False]],
      shape=(31978, 95), dtype=object)

In [125]:
y = new_data["SalStat"].values
y

array([0, 0, 1, ..., 0, 0, 0], shape=(31978,))

In [126]:
from sklearn.model_selection import train_test_split

In [127]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)