# Import Libraries

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

## Investigate The Data

In [2]:
income_data = pd.read_csv('income.csv',header = 0,delimiter = ', ')
income_data.sex = income_data.sex.apply(lambda x: 0 if x == 'Male' else 1)
print(income_data)

       age         workclass  fnlwgt   education  education-num  \
0       39         State-gov   77516   Bachelors             13   
1       50  Self-emp-not-inc   83311   Bachelors             13   
2       38           Private  215646     HS-grad              9   
3       53           Private  234721        11th              7   
4       28           Private  338409   Bachelors             13   
...    ...               ...     ...         ...            ...   
32556   27           Private  257302  Assoc-acdm             12   
32557   40           Private  154374     HS-grad              9   
32558   58           Private  151910     HS-grad              9   
32559   22           Private  201490     HS-grad              9   
32560   52      Self-emp-inc  287927     HS-grad              9   

           marital-status         occupation   relationship   race  sex  \
0           Never-married       Adm-clerical  Not-in-family  White    0   
1      Married-civ-spouse    Exec-managerial 

In [3]:
# take a look at row with index 0
print(income_data.iloc[0])

age                          39
workclass             State-gov
fnlwgt                    77516
education             Bachelors
education-num                13
marital-status    Never-married
occupation         Adm-clerical
relationship      Not-in-family
race                      White
sex                           0
capital-gain               2174
capital-loss                  0
hours-per-week               40
native-country    United-States
income                    <=50K
Name: 0, dtype: object


In [4]:
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,0,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,1,0,0,40,Cuba,<=50K


## Separating labels from data

In [5]:
# Select income column
labels = income_data[['income']]
#labels.head()
labels = labels.income.apply(lambda x: 0 if x == '<=50K' else 1)
print(labels.head())

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64


In [6]:
# features
data = income_data[['age','capital-gain','capital-loss','hours-per-week','sex']]
#print(data)

#### Train test and split data

In [7]:
train_data,test_data,train_labels,test_labels = train_test_split(data,labels,random_state= 1)

## Creating The Random Forest

In [8]:
forest = RandomForestClassifier(random_state = 1)

In [9]:
# fit the model
forest.fit(train_data,train_labels)

RandomForestClassifier(random_state=1)

## Test Accuracy 

In [10]:
print(forest.score(test_data,test_labels))

0.8272939442328953


### Consider other features

In [11]:
print(income_data['native-country'].value_counts())

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                      

In [12]:
# Since the majority of the data comes from "United-States", it might make sense to make a column where every row that contains "United-States" becomes a 0 and any other country becomes a 1.
income_data['native-country'] = income_data['native-country'].apply(lambda x: 0 if x == 'United-States' else 1)

In [13]:
# fit the model
forest.fit(train_data,train_labels)

RandomForestClassifier(random_state=1)

In [14]:
# adding native country feature did not affect accuracy in any way
print(forest.score(test_data,test_labels))

0.8272939442328953
