## Import Data and Libraries

In [8]:
# Import data and take a look
import numpy as np
import pandas as pd

# Load the raw data; the literal string '#NAME?' is parsed as missing (NaN)
# in addition to pandas' default NA markers.
df = pd.read_csv('adult.csv', na_values=['#NAME?'])
# Preview the first rows
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
# Take a look at the outcome variable: 'income'
print(df['income'].value_counts())

<=50K    3779
>50K     1221
Name: income, dtype: int64


In [10]:
# Assign outcome as 0 if income <=50K and as 1 if income >50K.
# NOTE: this overwrites df['income'] in place; running the cell a second time on the
# already-encoded column would turn every label into 1, so reload df before re-running.
df['income'] = [0 if x == '<=50K' else 1 for x in df['income']]

# Assign X as a DataFrame of features and y as a Series of the outcome variable.
# `axis` is passed by keyword: the positional form df.drop('income', 1) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = df.drop('income', axis=1)
y = df.income

In [11]:
X.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [12]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64

In [13]:
y.value_counts()

0    3779
1    1221
Name: income, dtype: int64

### Basic data cleaning

In [14]:
# Education is a categorical feature:
print(X['education'].head(5))

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: object


In [16]:
X['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', '?',
       'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
      dtype=object)

In [17]:
# One-hot encode the column with get_dummies in pandas (one indicator column per category)
# Another option: OneHotEncoder in scikit-learn
print(pd.get_dummies(X['education']).head(5))

   10th  11th  12th  1st-4th  5th-6th  7th-8th  9th  ?  Assoc-acdm  Assoc-voc  \
0     0     0     0        0        0        0    0  0           0          0   
1     0     0     0        0        0        0    0  0           0          0   
2     0     0     0        0        0        0    0  0           0          0   
3     0     1     0        0        0        0    0  0           0          0   
4     0     0     0        0        0        0    0  0           0          0   

   Bachelors  Doctorate  HS-grad  Masters  Preschool  Prof-school  \
0          1          0        0        0          0            0   
1          1          0        0        0          0            0   
2          0          0        1        0          0            0   
3          0          0        0        0          0            0   
4          1          0        0        0          0            0   

   Some-college  
0             0  
1             0  
2             0  
3             0  
4       

In [18]:
# Decide which categorical variables you want to use in model:
# report how many distinct values each object-typed column holds.
for col_name in X.columns:
    if X[col_name].dtypes != 'object':
        # not an object-typed column: nothing to report
        continue
    unique_cat = len(X[col_name].unique())
    print(
        "Feature '{col_name}' has {unique_cat} unique categories".format(
            col_name=col_name, unique_cat=unique_cat
        )
    )


Feature 'workclass' has 8 unique categories
Feature 'education' has 17 unique categories
Feature 'marital_status' has 7 unique categories
Feature 'occupation' has 15 unique categories
Feature 'relationship' has 6 unique categories
Feature 'race' has 6 unique categories
Feature 'sex' has 3 unique categories
Feature 'native_country' has 40 unique categories


In [19]:
# Although 'native_country' has a lot of unique categories, most of them have only a few observations.
# Show the 10 most frequent values.
print(X['native_country'].value_counts().sort_values(ascending=False).head(10))

United-States    4465
Mexico            104
?                  97
Canada             28
Philippines        22
Germany            22
England            16
El-Salvador        16
Puerto-Rico        16
China              15
Name: native_country, dtype: int64


In [21]:
# In this case, bucket low-frequency categories as "Other".
# The kept label is exactly 'United-States' (no stray whitespace): this keeps the
# category name clean for the dummy column derived from it, and makes the cell safe
# to re-run (a label that differs from the matched value would be re-bucketed as
# 'Other' on the second run).
X['native_country'] = ['United-States' if x == 'United-States' else 'Other' for x in X['native_country']]

print(X['native_country'].value_counts().sort_values(ascending=False))

United-States     4465
Other              535
Name: native_country, dtype: int64


In [22]:
# Create a list of features to dummy
todummy_list = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [23]:
# Function to dummy all the categorical variables used for modeling
def dummy_df(df, todummy_list):
    """Replace each listed categorical column with its one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    todummy_list : list of str
        Names of the columns to encode, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the indicator columns,
        named '<column>_<category>'. Because dummy_na=False, a missing value
        gets no indicator of its own and is encoded as all zeros.
    """
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # axis must be a keyword: positional df.drop(x, 1) raises TypeError in pandas >= 2.0
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [25]:
X = dummy_df(X, todummy_list)
X.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_Other,native_country_United-States
0,39.0,77516.0,13.0,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,50.0,83311.0,13.0,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,38.0,215646.0,9.0,0,0,40,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
3,53.0,234721.0,7.0,0,0,40,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
4,28.0,338409.0,13.0,0,0,40,0,0,0,1,...,1,0,0,1,0,0,1,0,1,0


### Handling missing data

In [26]:
# How much of your data is missing?
X.isnull().sum().sort_values(ascending=False).head()

fnlwgt                 107
education_num           57
age                     48
education_Doctorate      0
education_7th-8th        0
dtype: int64

In [27]:
# Impute missing values with the column median using SimpleImputer in sklearn.impute.
# (sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# SimpleImputer always works column-wise, so the old `axis=0` argument is gone.)
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(X)
# transform() returns a bare array: restore the column names and keep the original
# row index so X stays aligned with y.
X = pd.DataFrame(data=imp.transform(X), columns=X.columns, index=X.index)



In [28]:
# Now check again to see if you still have missing data
X.isnull().sum().sort_values(ascending=False).head()

native_country_United-States     0
education_Bachelors              0
education_5th-6th                0
education_7th-8th                0
education_9th                    0
dtype: int64

### More Data Exploration

#### Outlier detection - Tukey IQR

In [29]:
def find_outliers_tukey(x, k=1.5):
    """Flag outliers in a pandas Series with Tukey's IQR fences.

    A value is an outlier when it lies below q1 - k*IQR or above q3 + k*IQR,
    where q1/q3 are the 25th/75th percentiles and IQR = q3 - q1.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to inspect. If x contains NaN, np.percentile returns NaN,
        every comparison is False and nothing is flagged; impute or drop first.
    k : float, default 1.5
        Fence multiplier (1.5 is Tukey's conventional choice).

    Returns
    -------
    (list, list)
        Index labels of the outliers and the corresponding values, in the
        order they appear in x.
    """
    q1 = np.percentile(x, 25)
    q3 = np.percentile(x, 75)
    iqr = q3 - q1
    floor = q1 - k * iqr
    ceiling = q3 + k * iqr
    # Select with one boolean mask instead of looking the labels up again:
    # a label-based lookup would also return non-outlier rows that happen to
    # share an index label with an outlier.
    is_outlier = (x < floor) | (x > ceiling)
    outlier_indices = list(x.index[is_outlier])
    outlier_values = list(x[is_outlier])

    return outlier_indices, outlier_values

In [30]:
tukey_indices, tukey_values = find_outliers_tukey(X['age'])
print(np.sort(tukey_values))

[76. 76. 76. 76. 76. 76. 76. 76. 76. 77. 77. 77. 77. 77. 78. 78. 79. 79.
 79. 80. 80. 80. 81. 81. 81. 81. 82. 88. 90. 90. 90. 90. 90. 90. 90.]


#### Outlier detection - Kernel Density Estimation

In [31]:
from sklearn.preprocessing import scale
from statsmodels.nonparametric.kde import KDEUnivariate

def find_outliers_kde(x):
    """Flag low-density points of x using a univariate kernel density estimate.

    The values are cast to float and passed through sklearn's `scale`, a
    KDEUnivariate is fitted to them (Scott bandwidth, FFT), and the density is
    evaluated at every point. Points whose density is below 0.05 are
    reported, lowest density first.

    Returns a pair (outlier_ind, outlier_value): positions into x (array
    positions from argsort, not index labels) and the values of x at those
    positions.
    """
    standardized = scale([float(value) for value in x])

    estimator = KDEUnivariate(standardized)
    estimator.fit(bw="scott", fft=True)
    density = np.asarray(estimator.evaluate(standardized))

    # how many points fall under the density threshold
    n_low = int(np.sum(density < 0.05))
    # positions ordered from least to most dense; keep the n_low least dense
    by_density = density.argsort()
    outlier_ind = by_density[:n_low]
    outlier_value = np.asarray(x)[outlier_ind]

    return outlier_ind, outlier_value

In [32]:
kde_indices, kde_values = find_outliers_kde(X['age'])
print(np.sort(kde_values))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


[68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68. 68.
 69. 69. 69. 69. 69. 69. 69. 69. 69. 69. 69. 69. 70. 70. 70. 70. 70. 70.
 70. 70. 70. 70. 71. 71. 71. 71. 71. 71. 71. 71. 71. 71. 72. 72. 72. 72.
 72. 72. 72. 72. 72. 72. 72. 72. 73. 73. 73. 73. 73. 73. 73. 74. 74. 74.
 74. 74. 74. 74. 74. 74. 74. 75. 75. 75. 75. 75. 75. 75. 75. 75. 75. 75.
 75. 76. 76. 76. 76. 76. 76. 76. 76. 76. 77. 77. 77. 77. 77. 78. 78. 79.
 79. 79. 80. 80. 80. 81. 81. 81. 81. 82. 88. 90. 90. 90. 90. 90. 90. 90.]


### Distribution of Features

In [33]:
# Use pyplot in matplotlib to plot histograms
%matplotlib inline
import matplotlib.pyplot as plt

def plot_histogram(x):
    """Draw a histogram of x with pyplot and show it.

    x must expose a `.name` attribute, which is used in the plot title
    (presumably a pandas Series such as X['age'] -- confirm against callers).
    """
    plt.hist(x, color='gray', alpha=0.5)
    plt.title("Histogram of '{var_name}'".format(var_name=x.name))
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()