## Income classification 
### The prediction task is to determine whether a person makes over 50K a year.

https://www.kaggle.com/lodetomasi1995/income-classification

Listing of attributes:
- age: continuous.
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, 
    State-gov, Without-pay, Never-worked.
- fnlwgt: continuous.
- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, 
    Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- education-num: continuous.
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, 
    Widowed, Married-spouse-absent, Married-AF-spouse.
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, 
    Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, 
    Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- sex: Female, Male.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, 
    Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, 
    Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, 
    Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, 
    Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

## Preprocessing data

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw census data and clean it in a single method chain:
#   1. treat the ' ?' placeholder as missing and drop every row that has one,
#   2. encode the income labels as integers (' >50K' -> 1, ' <=50K' -> 0);
#      note that replace() is applied to the whole frame, not just 'income',
#   3. drop the 'fnlwgt' column so it is not used as a feature.
dataset = (
    pd.read_csv('income_evaluation.csv')
    .replace(' ?', np.nan)
    .dropna()
    .replace(' >50K', 1)
    .replace(' <=50K', 0)
    .drop('fnlwgt', axis=1)
)

In [2]:
# Show the cleaned frame (pandas abbreviates long frames to their first and last rows)
dataset

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [3]:
# Separate the target column from the predictor columns
target = dataset['income']
feature = dataset.drop(columns='income')

# One-hot encode the categorical columns; numeric columns pass through
# unchanged.  The indicator dtype is pinned to uint8 because the pandas
# default changed to bool in pandas 2.0, and a frame that mixes int64 and bool
# columns converts to an *object* array instead of a numeric one.
feature_dummies = pd.get_dummies(feature, dtype=np.uint8)

# Plain NumPy arrays for scikit-learn
X = feature_dummies.to_numpy()
y = target.to_numpy()

In [4]:
# Hold out 15% of the rows as a test set, then min-max scale the features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

# The per-feature ranges are learned from the training rows only and are then
# applied unchanged to the test rows
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Inspect the scaled training features
X_train_scaled

array([[0.2739726 , 0.6       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.28767123, 0.53333333, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.2739726 , 0.6       , 0.05178052, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.45205479, 0.8       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.43835616, 0.8       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.23287671, 0.53333333, 0.03325033, ..., 1.        , 0.        ,
        0.        ]])

In [6]:
# Inspect the test features, transformed with the scaler fitted on the training split
X_test_scaled

array([[0.36986301, 0.53333333, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.05479452, 0.6       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.26027397, 0.6       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.76712329, 0.53333333, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.46575342, 0.53333333, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31506849, 0.53333333, 0.        , ..., 1.        , 0.        ,
        0.        ]])

## Classification by Logistic Regression

In [7]:
# Tune the regularisation parameter C of a logistic-regression classifier
# with a 5-fold cross-validated grid search over the scaled training data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression(random_state=0)
param_grid_lr = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

# fit() returns the fitted search object, so construction and fitting chain
grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    cv=5,
).fit(X_train_scaled, y_train)

# Parameter combination with the best mean cross-validation score
best_params_lr = grid_search_lr.best_params_



In [8]:
# Best hyper-parameters found by the logistic-regression grid search
best_params_lr

{'C': 100}

In [9]:
# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so reuse that fitted model instead of duplicating its
# hyper-parameters and training an identical LogisticRegression a second time.
lr = grid_search_lr.best_estimator_

# Mean accuracy on each split.  NOTE: despite its 'test_' prefix,
# test_score_train_lr holds the *training* accuracy; the name is kept because
# a later cell displays it.
test_score_train_lr = lr.score(X_train_scaled, y_train)
test_score_lr = lr.score(X_test_scaled, y_test)



In [11]:
# Training-set accuracy of the tuned logistic regression
# (the name starts with 'test_' but it is computed on X_train_scaled / y_train)
test_score_train_lr

0.8489292818972579

In [12]:
# Test-set accuracy of the tuned logistic regression
test_score_lr

0.84353591160221

In [13]:
# Pull the fitted coefficients and the intercept out of the logistic-regression model
coeficients_lr, intercept_lr = lr.coef_, lr.intercept_

In [14]:
# Display the fitted logistic-regression coefficients (lr.coef_)
coeficients_lr

array([[ 1.88553221e+00,  3.37732530e+00,  3.14699067e+01,
         2.54855731e+00,  2.85987438e+00,  6.61519895e-01,
        -1.39687231e-01,  1.24981000e-01,  2.14351309e-01,
        -4.00359969e-01, -2.32841276e-01, -4.08651010e+00,
         2.14340268e-01,  9.09333526e-02,  7.82427540e-02,
         6.07474699e-01,  3.60927058e-01,  2.05069412e-01,
         8.42571050e-02, -2.12540732e-02,  1.98702337e-01,
         4.79547873e-01,  7.95470186e-01,  2.42770850e-01,
         5.97395495e-01, -9.13616874e+00,  9.94288740e-01,
         3.49456311e-01, -1.21179002e+00,  1.62368381e+00,
         8.46603917e-01, -1.19635192e+00, -1.72875120e+00,
        -1.18375394e+00, -1.00818702e+00,  2.36197580e-01,
        -2.55785056e+00,  2.69261012e-01,  1.02684930e+00,
        -8.14199363e-01, -5.05101119e-01, -6.77858770e-02,
        -6.56458826e-01, -3.77648199e+00,  7.15568901e-01,
         7.90364564e-01,  4.89471520e-01,  8.83820223e-01,
         1.07798271e-01, -7.56357294e-01, -3.42315394e-0

In [15]:
# Display the fitted logistic-regression intercept (lr.intercept_)
intercept_lr

array([-3.85854637])

In [16]:
# Predict the income label (0 or 1) for every row of the scaled test set
# with the logistic-regression model
y_pred_lr = lr.predict(X_test_scaled)

In [17]:
# Display the logistic-regression predictions for the test set
y_pred_lr

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
# Confusion matrix of the logistic-regression predictions on the test set
# (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix

cm_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

In [19]:
# Display the logistic-regression confusion matrix (rows: true class, columns: predicted class)
cm_lr

array([[3141,  240],
       [ 468,  676]])

## Classification by Random Forest

In [21]:
# Tune the number of trees and the number of features considered per split
# of a random forest with a 5-fold cross-validated grid search
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=0)
param_grid_rf = dict(
    n_estimators=[3, 10, 30, 50],
    max_features=[2, 20, 40],
)

# fit() returns the fitted search object, so construction and fitting chain
grid_search_rf = GridSearchCV(
    estimator=forest,
    param_grid=param_grid_rf,
    cv=5,
).fit(X_train_scaled, y_train)

# Parameter combination with the best mean cross-validation score
best_params_rf = grid_search_rf.best_params_

In [22]:
# Best hyper-parameters found by the random-forest grid search
best_params_rf

{'max_features': 20, 'n_estimators': 30}

In [23]:
# GridSearchCV (refit=True by default) has already retrained the winning
# configuration on the full training set, so reuse that fitted forest rather
# than repeating its hyper-parameters and training it again.
forest = grid_search_rf.best_estimator_

# Mean accuracy on each split.  NOTE: despite its 'test_' prefix,
# test_score_train_rf holds the *training* accuracy; the name is kept because
# a later cell displays it.
test_score_train_rf = forest.score(X_train_scaled, y_train)
test_score_rf = forest.score(X_test_scaled, y_test)

In [24]:
# Training-set accuracy of the tuned random forest
# (the name starts with 'test_' but it is computed on X_train_scaled / y_train)
test_score_train_rf

0.9758162031438936

In [25]:
# Test-set accuracy of the tuned random forest
test_score_rf

0.8353591160220994

In [26]:
# Feature importances of the fitted random forest
# (a forest exposes per-feature importances, not coefficients)
feature_importances_rf = forest.feature_importances_

In [27]:
# Display the random-forest feature importances
feature_importances_rf

array([2.11270484e-01, 8.54609606e-02, 1.15527881e-01, 3.42164839e-02,
       1.05768646e-01, 6.43935312e-03, 7.35788380e-03, 1.30680709e-02,
       6.20646310e-03, 1.02157537e-02, 5.69376104e-03, 9.28828376e-05,
       1.11054553e-03, 1.17246516e-03, 8.05617671e-04, 7.25303644e-05,
       4.07119299e-04, 1.20860321e-03, 8.46382607e-04, 2.74835186e-03,
       2.84537957e-03, 1.13789803e-02, 1.92305038e-03, 7.25451173e-03,
       5.41286720e-03, 4.24676129e-05, 3.69196615e-03, 5.52637424e-03,
       6.17456568e-03, 4.37621709e-04, 6.47218405e-02, 1.06807861e-03,
       2.39058022e-02, 1.86962621e-03, 1.31684602e-03, 6.84844923e-03,
       5.02581854e-06, 7.95514369e-03, 2.08225699e-02, 5.16852190e-03,
       3.49413146e-03, 4.44570214e-03, 6.34875387e-03, 1.23968709e-04,
       1.59610341e-02, 3.60796354e-03, 8.73381180e-03, 5.86081305e-03,
       5.85402981e-03, 7.05283656e-02, 8.95841280e-03, 1.11875600e-03,
       4.30301090e-03, 4.54615946e-03, 1.12228014e-02, 1.93867065e-03,
      

In [28]:
# Predict the income label (0 or 1) for every row of the scaled test set
# with the random forest
y_pred_rf = forest.predict(X_test_scaled)

In [29]:
# Display the random-forest predictions for the test set
y_pred_rf

array([0, 0, 0, ..., 0, 1, 0])

In [30]:
# Confusion matrix of the random-forest predictions on the test set
# (rows: true class, columns: predicted class)
cm_rf = confusion_matrix(y_test, y_pred_rf)

In [31]:
# Display the random-forest confusion matrix (rows: true class, columns: predicted class)
cm_rf

array([[3102,  279],
       [ 466,  678]])

## Classification by Gradient Boosting Classifier

In [32]:
# Gradient-boosted trees; only random_state is set (no grid search is run for
# this model), which makes the fit repeatable
from sklearn.ensemble import GradientBoostingClassifier

# fit() returns the fitted estimator, so construction and training chain
gbc = GradientBoostingClassifier(random_state=0).fit(X_train_scaled, y_train)

# Mean accuracy on the training split and on the held-out test split
test_score_train_gbc, test_score_gbc = (
    gbc.score(X_train_scaled, y_train),
    gbc.score(X_test_scaled, y_test),
)

In [33]:
# Training-set accuracy of the gradient-boosting model
# (the name starts with 'test_' but it is computed on X_train_scaled / y_train)
test_score_train_gbc

0.8656239029527636

In [34]:
# Test-set accuracy of the gradient-boosting model
test_score_gbc

0.8556906077348067

In [35]:
# Feature importances of the fitted gradient-boosting model
# (the model exposes per-feature importances, not coefficients)
feature_importances_gbc = gbc.feature_importances_

In [36]:
# Display the gradient-boosting feature importances
feature_importances_gbc

array([5.97596322e-02, 2.00751155e-01, 2.03492126e-01, 5.53088505e-02,
       3.51435813e-02, 2.23300329e-03, 1.62960311e-03, 2.58262457e-04,
       5.97208140e-04, 4.38836152e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 9.06134664e-06, 0.00000000e+00, 1.13883040e-04,
       1.04805218e-04, 1.34156983e-04, 0.00000000e+00, 1.75458707e-04,
       0.00000000e+00, 1.59167354e-04, 4.02504181e-04, 0.00000000e+00,
       0.00000000e+00, 8.15337011e-04, 3.80470400e-01, 0.00000000e+00,
       1.01738805e-03, 2.31281218e-04, 2.07803507e-05, 4.92942533e-04,
       0.00000000e+00, 3.76900047e-05, 1.77000003e-02, 6.84927405e-03,
       6.63580204e-04, 5.29103572e-04, 5.81586319e-03, 0.00000000e+00,
       5.59374209e-03, 4.60718234e-04, 8.24451612e-04, 2.29926136e-03,
       2.31638841e-04, 8.18417825e-04, 7.25943707e-04, 5.77113060e-05,
       3.29311918e-04, 1.97689060e-04, 4.69406921e-03, 4.64292654e-05,
      

In [37]:
# Predict the income label (0 or 1) for every row of the scaled test set
# with the gradient-boosting model
y_pred_gbc = gbc.predict(X_test_scaled)

In [38]:
# Display the gradient-boosting predictions for the test set
y_pred_gbc

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
# Confusion matrix of the gradient-boosting predictions on the test set
# (rows: true class, columns: predicted class)
cm_gbc = confusion_matrix(y_test, y_pred_gbc)

In [40]:
# Display the gradient-boosting confusion matrix (rows: true class, columns: predicted class)
cm_gbc

array([[3206,  175],
       [ 478,  666]])