In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
Collecting scikit-learn>=1.0.1
  Downloading scikit_learn-1.0.2-cp39-cp39-win_amd64.whl (7.2 MB)
Installing collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed imbalanced-learn-0.9.0 scikit-learn-1.0.2


In [3]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [4]:
income = pd.read_csv('Data/income.csv')
print(income)

       age         workclass   education  education-num      marital-status  \
0       39         State-gov   Bachelors             13       Never-married   
1       50  Self-emp-not-inc   Bachelors             13  Married-civ-spouse   
2       38           Private     HS-grad              9            Divorced   
3       53           Private        11th              7  Married-civ-spouse   
4       28           Private   Bachelors             13  Married-civ-spouse   
...    ...               ...         ...            ...                 ...   
32556   27           Private  Assoc-acdm             12  Married-civ-spouse   
32557   40           Private     HS-grad              9  Married-civ-spouse   
32558   58           Private     HS-grad              9             Widowed   
32559   22           Private     HS-grad              9       Never-married   
32560   52      Self-emp-inc     HS-grad              9  Married-civ-spouse   

              occupation   relationship   race     

In [6]:
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  31978 non-null  object
 13  income >50K     32561 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


In [7]:
income.columns[income.isnull().any()]

Index(['workclass', 'occupation', 'native-country'], dtype='object')

In [8]:
print(income['workclass'].value_counts(dropna= False))
print(income['occupation'].value_counts(dropna= False))
print(income['native-country'].value_counts(dropna= False))

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
NaN                  1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
NaN                  1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64
United-States                 29170
Mexico                          643
NaN                             583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                      

In [10]:
#looking at the missing amounts is low compared to the mode, using mode will work. Could use a different method for workclass but this should work for now.
#Saving the columns to keep a record
cols = list(income.columns)
print(cols)

['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income >50K']


In [11]:
mincome = income.fillna(income.agg(lambda x: pd.Series.mode(x)[0], axis=0))

In [12]:
# check again for missing values
mincome.columns[mincome.isnull().any()]

Index([], dtype='object')

In [13]:
mincome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      32561 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  32561 non-null  object
 13  income >50K     32561 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


In [21]:
for c in cols:
    if mincome[c].dtype == object:
        mincome[c] = pd.Series(pd.factorize( list(mincome[c]) ) [0])
print(mincome)

       age  workclass  education  education-num  marital-status  occupation  \
0       39          0          0             13               0           0   
1       50          1          0             13               1           1   
2       38          2          1              9               2           2   
3       53          2          2              7               1           2   
4       28          2          0             13               1           3   
...    ...        ...        ...            ...             ...         ...   
32556   27          2          6             12               1          10   
32557   40          2          1              9               1           9   
32558   58          2          1              9               6           0   
32559   22          2          1              9               0           0   
32560   52          5          1              9               1           1   

       relationship  race  sex  capital-gain  capit

In [22]:
print(mincome['income >50K'].value_counts())

0    24720
1     7841
Name: income >50K, dtype: int64


In [24]:
#Decision Tree

In [27]:
#Split Train Test Eval
X, y = mincome.iloc[:,:-1], mincome['income >50K']
X.shape
print(X)
#X are the features besides income, which is our label

       age  workclass  education  education-num  marital-status  occupation  \
0       39          0          0             13               0           0   
1       50          1          0             13               1           1   
2       38          2          1              9               2           2   
3       53          2          2              7               1           2   
4       28          2          0             13               1           3   
...    ...        ...        ...            ...             ...         ...   
32556   27          2          6             12               1          10   
32557   40          2          1              9               1           9   
32558   58          2          1              9               6           0   
32559   22          2          1              9               0           0   
32560   52          5          1              9               1           1   

       relationship  race  sex  capital-gain  capit

In [29]:
#define feature selection
feat = SelectKBest(score_func = f_classif, k=4)
# Act on feature selection
X_selected = pd.DataFrame(feat.fit_transform(X, y))
X_selected.columns = X.columns[feat.get_support(indices=True)]
print(X_selected)
print(X.columns[feat.get_support(indices=True)])

       age  education-num  capital-gain  hours-per-week
0       39             13          2174              40
1       50             13             0              13
2       38              9             0              40
3       53              7             0              40
4       28             13             0              40
...    ...            ...           ...             ...
32556   27             12             0              38
32557   40              9             0              40
32558   58              9             0              40
32559   22              9             0              20
32560   52              9         15024              40

[32561 rows x 4 columns]
Index(['age', 'education-num', 'capital-gain', 'hours-per-week'], dtype='object')


In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3)

In [32]:
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(np.count_nonzero(y_train == 0))
print(np.count_nonzero(y_train == 1))

17360
17360


In [35]:
#decision tree
clf = DecisionTreeClassifier(random_state=0, max_leaf_nodes=20, min_samples_leaf=10, )
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [34]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

[[5593 1767]
 [ 630 1779]]
              precision    recall  f1-score   support

           0       0.90      0.76      0.82      7360
           1       0.50      0.74      0.60      2409

    accuracy                           0.75      9769
   macro avg       0.70      0.75      0.71      9769
weighted avg       0.80      0.75      0.77      9769

0.7491995878228381


In [None]:
# Notes - Accuracy is 75%, not great. I would focus on precision instead or recall. Would be more interested in how well the model was classifying
# those who are actually in 50k + that the model predicts. Precision is high for No but weak for yes. Recall ok for both.

In [36]:
#cross validation
crosspred = cross_val_predict(clf, X, y, cv=20)
print(confusion_matrix(y, crosspred))
print(classification_report(y, crosspred))
print(roc_auc_score(y, crosspred))

[[23390  1330]
 [ 3504  4337]]
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     24720
           1       0.77      0.55      0.64      7841

    accuracy                           0.85     32561
   macro avg       0.82      0.75      0.77     32561
weighted avg       0.84      0.85      0.84     32561

0.7496578178597357


In [None]:
# Much better numbers here. 85% accuracy, very good. 

In [59]:
#split train test eval
X, y = mincome.iloc[:,:-1], mincome['income >50K']
print(X, y)

       age  workclass  education  education-num  marital-status  occupation  \
0       39          0          0             13               0           0   
1       50          1          0             13               1           1   
2       38          2          1              9               2           2   
3       53          2          2              7               1           2   
4       28          2          0             13               1           3   
...    ...        ...        ...            ...             ...         ...   
32556   27          2          6             12               1          10   
32557   40          2          1              9               1           9   
32558   58          2          1              9               6           0   
32559   22          2          1              9               0           0   
32560   52          5          1              9               1           1   

       relationship  race  sex  capital-gain  capit

In [60]:
# define feature selection
feat = SelectKBest(score_func=f_classif, k=4)
# apply feature selection
X_selected = pd.DataFrame(feat.fit_transform(X, y))
X_selected.columns = X.columns[feat.get_support(indices=True)]
print(X_selected)
print(X.columns[feat.get_support(indices=True)])
#not sure but it is only returning one column. Age but nothing else.

       age  education-num  capital-gain  hours-per-week
0       39             13          2174              40
1       50             13             0              13
2       38              9             0              40
3       53              7             0              40
4       28             13             0              40
...    ...            ...           ...             ...
32556   27             12             0              38
32557   40              9             0              40
32558   58              9             0              40
32559   22              9             0              20
32560   52              9         15024              40

[32561 rows x 4 columns]
Index(['age', 'education-num', 'capital-gain', 'hours-per-week'], dtype='object')


In [61]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3)

In [45]:
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(np.count_nonzero(y_train == 0))
print(np.count_nonzero(y_train == 1))

17328
17328


In [46]:
#Random Forest Classifier

In [62]:
classifierRF = RandomForestClassifier(n_estimators=1000,
                                      verbose=0,
                                      n_jobs=-1)     
classifierRF.fit(X_train, y_train)
y_pred = classifierRF.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(roc_auc_score(y, crosspred))

[[6894  594]
 [ 874 1407]]
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      7488
           1       0.70      0.62      0.66      2281

    accuracy                           0.85      9769
   macro avg       0.80      0.77      0.78      9769
weighted avg       0.84      0.85      0.85      9769

0.4999190938511327


In [None]:
# Precision and recall for 0 are good, ok numbers for 1. f1-score is good for 0 not great for 1

In [63]:
crosspred = cross_val_predict(classifierRF, X, y, cv=10)
print(confusion_matrix(y, crosspred))
print(classification_report(y, crosspred))
print(roc_auc_score(y, crosspred))

[[22810  1910]
 [ 2930  4911]]
              precision    recall  f1-score   support

           0       0.89      0.92      0.90     24720
           1       0.72      0.63      0.67      7841

    accuracy                           0.85     32561
   macro avg       0.80      0.77      0.79     32561
weighted avg       0.85      0.85      0.85     32561

0.7745289004481877


In [None]:
#better AUC score. other numbers about the same, minor increases.