In [30]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

### Data treatment

In [3]:
base = pd.read_csv('../Classificação/data/census.csv')

In [4]:
columns = base.columns[:-1]
columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [5]:
X = base.iloc[:, 0:14].values
y = base.iloc[:, 14].values

In [7]:
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

X[:, 1] = label_encoder_workclass.fit_transform(X[:, 1])
X[:, 3] = label_encoder_education.fit_transform(X[:, 3])
X[:, 5] = label_encoder_marital.fit_transform(X[:, 5])
X[:, 6] = label_encoder_occupation.fit_transform(X[:, 6])
X[:, 7] = label_encoder_relationship.fit_transform(X[:, 7])
X[:, 8] = label_encoder_race.fit_transform(X[:, 8])
X[:, 9] = label_encoder_sex.fit_transform(X[:, 9])
X[:, 13] = label_encoder_country.fit_transform(X[:, 13])

In [9]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

### Low variance

In [11]:
for i in range(X.shape[1]):
    print(X[:, i].var())

0.034913808595952486
0.03312115190663569
0.005138537590667898
0.06657103564450892
0.029416385024073417
0.06301761677301636
0.09123816653931152
0.10326534394406342
0.04502805169292987
0.22136950173699113
0.00545419549240862
0.008557270623428908
0.015874043397822807
0.03641266114220053


In [13]:
selection = VarianceThreshold(threshold=0.05)
X_variance = selection.fit_transform(X)
X_variance.shape

(32561, 5)

In [14]:
selection.variances_

array([0.03491381, 0.03312115, 0.00513854, 0.06657104, 0.02941639,
       0.06301762, 0.09123817, 0.10326534, 0.04502805, 0.2213695 ,
       0.0054542 , 0.00855727, 0.01587404, 0.03641266])

In [15]:
index = np.where(selection.variances_ > 0.05)
index

(array([3, 5, 6, 7, 9], dtype=int64),)

In [17]:
columns[index]

Index(['education', 'marital-status', 'occupation', 'relationship', 'sex'], dtype='object')

In [20]:
base_variance = base.drop(columns=['age', 'workclass', 
                                   'final-weight', 'education-num', 
                                   'race', 'capital-gain', 'capital-loos', 
                                   'hour-per-week', 'native-country'], axis=1)
base_variance

Unnamed: 0,education,marital-status,occupation,relationship,sex,income
0,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,<=50K
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,<=50K
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,<=50K
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,<=50K
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,<=50K
...,...,...,...,...,...,...
32556,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,Female,<=50K
32557,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,Male,>50K
32558,HS-grad,Widowed,Adm-clerical,Unmarried,Female,<=50K
32559,HS-grad,Never-married,Adm-clerical,Own-child,Male,<=50K


In [21]:
X_variance = base_variance.iloc[:, 0:5].values
y_variance = base_variance.iloc[:, 5].values

In [22]:
X_variance[:,0] = label_encoder_education.fit_transform(X_variance[:,0])
X_variance[:,1] = label_encoder_marital.fit_transform(X_variance[:,1])
X_variance[:,2] = label_encoder_occupation.fit_transform(X_variance[:,2])
X_variance[:,3] = label_encoder_relationship.fit_transform(X_variance[:,3])
X_variance[:,4] = label_encoder_sex.fit_transform(X_variance[:,4])

In [23]:
X_variance

array([[9, 4, 1, 1, 1],
       [9, 2, 4, 0, 1],
       [11, 0, 6, 1, 1],
       ...,
       [11, 6, 1, 4, 0],
       [11, 4, 1, 3, 1],
       [11, 2, 4, 5, 0]], dtype=object)

In [25]:
onehotencoder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [0, 1, 2, 3, 4])], remainder='passthrough')
X_variance = onehotencoder.fit_transform(X_variance).toarray()
X_variance

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [26]:
scaler = MinMaxScaler()
X_variance = scaler.fit_transform(X_variance)
X_variance

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [27]:
X_train_variance, X_test_variance, y_train_variance, y_test_variance = train_test_split(X_variance, y_variance, test_size=0.3)
X_train_variance.shape, X_test_variance.shape

((22792, 46), (9769, 46))

In [28]:
random_forest_variance = RandomForestClassifier(n_estimators=40, criterion='entropy', min_samples_split=5, min_samples_leaf=1, random_state=0)
random_forest_variance.fit(X_train_variance, y_train_variance)

In [29]:
predicts = random_forest_variance.predict(X_test_variance)
accuracy_score(y_test_variance, predicts)

0.8284368922100522

### Extra tree

In [32]:
selection = ExtraTreesClassifier()
selection.fit(X, y)

In [33]:
importances = selection.feature_importances_
importances

array([0.15340737, 0.04443164, 0.16467193, 0.03815927, 0.08608757,
       0.07468104, 0.076271  , 0.09137286, 0.01515347, 0.0273316 ,
       0.08644356, 0.02849798, 0.0959103 , 0.01758041])

In [34]:
index = []
for i in range(len(importances)):
    if importances[i] >= 0.029:
        index.append(i)

In [35]:
index

[0, 1, 2, 3, 4, 5, 6, 7, 10, 12]

In [36]:
X_extra = X[:, index]
X_extra

array([[0.30136986, 0.875     , 0.0443019 , ..., 0.2       , 0.02174022,
        0.39795918],
       [0.45205479, 0.75      , 0.0482376 , ..., 0.        , 0.        ,
        0.12244898],
       [0.28767123, 0.5       , 0.13811345, ..., 0.2       , 0.        ,
        0.39795918],
       ...,
       [0.56164384, 0.5       , 0.09482688, ..., 0.8       , 0.        ,
        0.39795918],
       [0.06849315, 0.5       , 0.12849934, ..., 0.6       , 0.        ,
        0.19387755],
       [0.47945205, 0.625     , 0.18720338, ..., 1.        , 0.1502415 ,
        0.39795918]])

In [37]:
onehotencoder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [1, 3, 5, 6, 7])], remainder='passthrough')
X_extra = onehotencoder.fit_transform(X_extra).toarray()
X_extra

array([[0.        , 0.        , 0.        , ..., 0.8       , 0.02174022,
        0.39795918],
       [0.        , 0.        , 0.        , ..., 0.8       , 0.        ,
        0.12244898],
       [0.        , 0.        , 0.        , ..., 0.53333333, 0.        ,
        0.39795918],
       ...,
       [0.        , 0.        , 0.        , ..., 0.53333333, 0.        ,
        0.39795918],
       [0.        , 0.        , 0.        , ..., 0.53333333, 0.        ,
        0.19387755],
       [0.        , 0.        , 0.        , ..., 0.53333333, 0.1502415 ,
        0.39795918]])

In [38]:
X_extra.shape

(32561, 58)

In [39]:
X_train_extra, X_test_extra, y_train_extra, y_test_extra = train_test_split(X_extra, y, test_size=0.3)
X_train_extra.shape, X_test_extra.shape

((22792, 58), (9769, 58))

In [40]:
random_forest_extra = RandomForestClassifier(n_estimators=40, criterion='entropy', min_samples_split=5, min_samples_leaf=1, random_state=0)
random_forest_extra.fit(X_train_extra, y_train_extra)

In [41]:
predicts = random_forest_extra.predict(X_test_extra)
accuracy_score(y_test_extra, predicts)

0.8503429214863343