In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
train_data = pd.read_csv("Census_income_train.csv")
train_data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
len(train_data)

32560

In [4]:
train_data.isna().sum()

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Income            0
dtype: int64

### Removing rows with "?"

In [5]:
train_data["Workclass"].str.contains("\?")

0        False
1        False
2        False
3        False
4        False
         ...  
32555    False
32556    False
32557    False
32558    False
32559    False
Name: Workclass, Length: 32560, dtype: bool

In [6]:
train_data["Occupation"].str.contains("\?")

0        False
1        False
2        False
3        False
4        False
         ...  
32555    False
32556    False
32557    False
32558    False
32559    False
Name: Occupation, Length: 32560, dtype: bool

In [7]:
train_data["Occupation"].str.contains("\?")==False

0        True
1        True
2        True
3        True
4        True
         ... 
32555    True
32556    True
32557    True
32558    True
32559    True
Name: Occupation, Length: 32560, dtype: bool

In [8]:
train_data["Native-country"].str.contains("\?")
train_data["Native-country"].str.contains("\?")==False


0        True
1        True
2        True
3        True
4        True
         ... 
32555    True
32556    True
32557    True
32558    True
32559    True
Name: Native-country, Length: 32560, dtype: bool

In [9]:
# Lets reverse all the boolean values
train_data["Workclass"].str.contains("\?")==False

0        True
1        True
2        True
3        True
4        True
         ... 
32555    True
32556    True
32557    True
32558    True
32559    True
Name: Workclass, Length: 32560, dtype: bool

In [10]:
clean_train_data = train_data[train_data['Workclass'].str.contains("\?")==False]

In [11]:
len(clean_train_data)

30724

In [12]:
clean_train_data = train_data[train_data['Occupation'].str.contains("\?")==False]

In [13]:
len(clean_train_data)

30717

In [70]:
clean_train_data = train_data[train_data['Native-country'].str.contains("\?")==False]

In [71]:
len(clean_train_data)

31977

In [14]:
clean_train_data = clean_train_data.reset_index(drop=True)

In [15]:
clean_train_data.shape

(30717, 15)

#### Creating dummy variables and separable inputs and targets

In [16]:
train_dummies = pd.get_dummies(clean_train_data, drop_first= False)

In [17]:
train_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ <=50K,Income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
train_dummies = train_dummies.drop(['Income_ <=50K'], axis=1)

In [19]:
train_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#3 splitting the train dataset 

In [21]:
train_input = train_dummies.iloc[:,:-1]
train_target = train_dummies.iloc[:,-1]

In [22]:
train_input.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Portugal,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
train_target.head()

0    0
1    0
2    0
3    0
4    0
Name: Income_ >50K, dtype: uint8

### Test dataset

In [24]:
test_data = pd.read_csv("Census_income_test.csv")

In [25]:
test_data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [26]:
test_data.shape

(16281, 15)

In [27]:
test_data.isnull().sum()

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Income            0
dtype: int64

#### Cleaning unknown("?")values

In [28]:
clean_test_data = test_data[test_data['Workclass'].str.contains("\?")==False]

In [29]:
len(clean_test_data)

15318

In [30]:
clean_test_data = test_data[test_data['Occupation'].str.contains("\?")==False]

In [31]:
len(clean_test_data)

15315

In [90]:
clean_test_data = test_data[test_data['Native-country'].str.contains("\?")==False]

In [91]:
len(clean_test_data)

16007

In [32]:
clean_test_data = clean_test_data.reset_index(drop=True)

In [33]:
clean_test_data.shape

(15315, 15)

### Creating dummy variables and separating inputs and targets

In [34]:
test_dummies = pd.get_dummies(clean_test_data, drop_first=True)

In [35]:
test_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,...,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ >50K.
0,25,226802,7,0,0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,38,89814,9,0,0,50,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,28,336951,12,0,0,40,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,44,160323,10,7688,0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
4,34,198693,6,0,0,30,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [36]:
test_input = test_dummies.iloc[:,:-1]
test_target = test_dummies.iloc[:,-1]

In [37]:
test_input.shape

(15315, 96)

In [38]:
test_target.shape

(15315,)

### Creating RandomForest model



In [39]:
clf = RandomForestClassifier(random_state= 365)

In [None]:
clf.fit(train_input,train_target)

### Testing the model

In [101]:
test_pred = clf.predict(test_input)

Feature names seen at fit time, yet now missing:
- Education_ 10th
- Marital status_ Divorced
- Native-country_ Cambodia
- Occupation_ ?
- Race_ Amer-Indian-Eskimo
- ...



ValueError: X has 98 features, but RandomForestClassifier is expecting 106 features as input.