In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import joblib


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv', skipinitialspace=True)
x_cols = [c for c in df.columns if c!='income']
X = df[x_cols]
y = df['income']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [35]:
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [3]:
y = LabelEncoder().fit_transform(y)

array([0, 0, 0, ..., 0, 0, 1])

In [4]:
cols_with_null = ['workclass','occupation','native-country']
preprocessor = ColumnTransformer(
    transformers=[
        (
            'imputer', 
            SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
            cols_with_null),
    ])

preprocessor.fit(X)
X_new = preprocessor.transform(X)

for icol, col in enumerate(cols_with_null):
    X.loc[:, col] = X_new[:, icol]

# confirm no null values in these columns:
X.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [6]:
X_test.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [8]:
preprocessor = ColumnTransformer(
transformers=[
    ('onehot', OneHotEncoder(), ['workclass', 'education', 'marital-status',
                'occupation', 'relationship', 'race', 'sex','native-country'])])

clf1 = Pipeline([('preprocessor', preprocessor),
                ('classifier', RandomForestClassifier())])



In [9]:
clf1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['workclass', 'education',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country'])])),
                ('classifier', RandomForestClassifier())])

In [10]:
clf1.score(X_test, y_test)

0.8230115672023749

In [14]:
preprocessor = ColumnTransformer(
transformers=[
    ('onehot', OneHotEncoder(), ['workclass', 'education', 'marital-status',
                'occupation', 'relationship', 'race', 'sex','native-country'])])

clf2 = Pipeline([('preprocessor', preprocessor),
                ('classifier', ExtraTreesClassifier(n_estimators=100))])



In [16]:
clf2.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['workclass', 'education',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country'])])),
                ('classifier', ExtraTreesClassifier())])

In [17]:
clf2.score(X_train, y_train)

0.8659178659178659

In [20]:
joblib.dump(clf1, "./random_forest.joblib", compress=True)
joblib.dump(clf2, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']