In [33]:
# STAGE - LOADING
# Import everything that's needed
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [34]:
# STAGE - LOADING
# Download Adult Data Set and display its first 5 rows
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
col_names = ["Age", "Workclass", "Fnlwgt", "Education",
             "EducationNum", "MaritalStatus","Occupation",
             "Relationship", "Race", "Sex", "CapitalGain", 
             "CapitalLoss","HoursPerWeek", "NativeCountry", "Salary"]
data = pd.read_csv(url, names=col_names)
data.head()


Unnamed: 0,Age,Workclass,Fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [35]:
# There are missing values in Occupation column, represented by ' ?'
# the same goes for Workclass and NativeCountry cols.
data["Occupation"].value_counts() 


 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: Occupation, dtype: int64

In [36]:
# STAGE - DATA PREPARATION
data[data == ' ?'] = np.nan
data.isnull().sum()

Age                 0
Workclass        1836
Fnlwgt              0
Education           0
EducationNum        0
MaritalStatus       0
Occupation       1843
Relationship        0
Race                0
Sex                 0
CapitalGain         0
CapitalLoss         0
HoursPerWeek        0
NativeCountry     583
Salary              0
dtype: int64

In [37]:
# STAGE - DATA PREPARATION
# Replace mising values with the most frequent value called the mode.
for col in ['Workclass', 'Occupation', 'NativeCountry']:
    data[col].fillna(data[col].mode()[0], inplace=True)

In [38]:
# STAGE - DATA PREPARATION
# We don't need Education column as we have EducationNum
del data['Education'] 
del data['Fnlwgt']
data.head()

Unnamed: 0,Age,Workclass,EducationNum,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Salary
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [39]:
# STAGE - DATA PREPARATION
# Our independent feature "Salary" has over-representation of people earining less than 50k
# That will be balanced later on
data['Salary'].value_counts() 

 <=50K    24720
 >50K      7841
Name: Salary, dtype: int64

In [40]:
# STAGE - DATA PREPARATION
# Convert non-numeric values to numerical values
columns_to_convert = ['Workclass','MaritalStatus','Occupation','Relationship','Race','Sex','NativeCountry','Salary']
for column in columns_to_convert:
  data[column] = pd.factorize(data[column])[0]

data.head()

Unnamed: 0,Age,Workclass,EducationNum,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Salary
0,39,0,13,0,0,0,0,0,2174,0,40,0,0
1,50,1,13,1,1,1,0,0,0,0,13,0,0
2,38,2,9,2,2,0,0,0,0,0,40,0,0
3,53,2,7,1,2,1,1,0,0,0,40,0,0
4,28,2,13,1,3,2,1,1,0,0,40,1,0


In [41]:
# STAGE - DATA PREPARATION
# Remove labels from the training set 
income_predictors = data.drop('Salary', axis = 1) # X
income_labels = data['Salary'].copy() # Y

ros = RandomOverSampler(random_state = 42)
ros.fit(income_predictors, income_labels)
income_predictors_resampled, income_labels_resampled = ros.fit_resample(income_predictors, income_labels)
# Split the dataset into train and testing (Testing will use 20% of dataset, training will use 80%)
income_predictors_train, income_predictors_test, income_labels_train, income_labels_test = train_test_split(income_predictors_resampled, income_labels_resampled, test_size = 0.2, random_state = 42)


In [42]:
# STAGE - DATA MODELING
random_forest = RandomForestClassifier(random_state = 42)
random_forest.fit(income_predictors_train, income_labels_train)
income_labels_pred_random_forest = random_forest.predict(income_predictors_test)



In [43]:
# STAGE - Evaluation
untuned_score = accuracy_score(income_labels_test, income_labels_pred_random_forest)
print("Untuned accuracy", untuned_score)

Untuned accuracy 0.9141383495145631


In [None]:
# Train and fine-tune a RandomForestClassifier (this will take many minutes)
params = {'max_depth': [40, 70],'n_estimators': [600, 1000]}
grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=1, cv=5)
grid_search_cv.fit(income_predictors_train, income_labels_train)

In [None]:
# Show the best estimator
fine_tuned_random_forest = grid_search_cv.best_estimator_
print(fine_tuned_random_forest)

In [None]:
# STAGE - DATA MODELING
fine_tuned_random_forest = RandomForestClassifier(max_depth=70, n_estimators=600, random_state=42)
fine_tuned_random_forest.fit(income_predictors_train, income_labels_train)
income_labels_pred_random_forest = fine_tuned_random_forest.predict(income_predictors_test)

In [None]:
# STAGE - EVALUATION
tuned_score = accuracy_score(income_labels_test, income_labels_pred_random_forest) 

print("Untuned accuracy", untuned_score)
print("Tuned accuracy", tuned_score)
print("Tuned score is better than untuned by: ", tuned_score - untuned_score)