In [72]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [73]:
# Load the Census dataset
data = pd.read_csv("census.csv")

# Success - Display the first record
display(data.head(n=1))

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K


In [None]:
#Total number of records
n_records = data.shape[0]

# Number of records where individual's income is more than $50,000
n_greater_50k = data.income[data.income == '>50K'].count()

# Number of records where individual's income is at most $50,000
n_at_most_50k = data.income[data.income == '<=50K'].count()

# Percentage of individuals whose income is more than $50,000
greater_percent = (n_greater_50k * 100.0) / n_records 

# Print the results
print "Total number of records: {}".format(n_records)
print "Individuals making more than $50,000: {}".format(n_greater_50k)
print "Individuals making at most $50,000: {}".format(n_at_most_50k)
print "Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent)

In [70]:
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

data = pd.read_csv("census.csv")

data.head()

In [45]:
from sklearn.cross_validation import train_test_split

labels = data['income'].map({'<=50K': 0, '>50K': 1})
features = pd.get_dummies(data.drop('income', axis = 1))

display(features.head(), labels.head())

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,13.0,2174.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13.0,0.0,0.0,13.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,38,9.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,7.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64

In [46]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=42)

In [79]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import r2_score, fbeta_score
from scipy.stats import randint as sp_randint

def fit_model(X, y):
    """ 
    Performs grid search and returns best classifier
    """
    
    # Cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn versiin 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 42)

    classifier = RandomForestClassifier(n_estimators=20)
    param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
    scoring_fnc = make_scorer(fbeta_score)

    #grid = GridSearchCV(estimator=classifier, param_grid=param_dist, scoring=scoring_fnc, cv=cv_sets)
    random_search = RandomizedSearchCV(classifier, param_distributions=param_dist, n_iter=20)
    random_search.fit(X, y)
    
    #print pd.DataFrame(grid.cv_results_)
    #display(pd.DataFrame(grid.grid_scores_))

    # Return the optimal model after fitting the data
    return random_search.best_estimator_

In [81]:
reg = fit_model(X_train, y_train)

In [82]:
r2_score(reg.predict(X_test), y_test)

0.14289331455597065

In [69]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(max_iter=10000)

clf.fit(X_train, y_train)
r2_score(clf.predict(X_train), y_train)


-0.01715074013454565

In [78]:
from sklearn.metrics import fbeta_score
                                                            
fbeta_score(clf.predict(X_train), y_train, beta=1)

0.64769190871369287