In [13]:
# Exploring a ML Data Set with Pandas
## taken from: https://www.hackerearth.com/practice/machine-learning/data-manipulation-visualisation-r-python/tutorial-data-manipulation-numpy-pandas-python/tutorial/

### The goal is to predict if the salary of a given person is less than or more than 50K.
### Data are taken from the UCI Machine Learning Repository.
### The dependent variable is 'target'.

import numpy as np
import pandas as pd


In [9]:
# load the data 

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# some good info on loading data from other sources (besides working directory):
# https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/

In [10]:
train.info() # 15 data columns and 32561 rows of data
             # 6 of the 15 columns have integer classes and the rest have object or character clases.  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [11]:
# Check test data; an alternative way of quickly checking rows and columns is:

print("The train data has", train.shape)
print("The test data has", test.shape)

# I'd also like to figure out how to split a data set !! (e.g., as with caret package in R)


The train data has (32561, 15)
The test data has (16281, 15)


In [12]:
# Take a peek at the data set

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# I want to know more about the response of interest, target:

train['target'].describe()

count      32561
unique         2
top        <=50K
freq       24720
Name: target, dtype: object

In [15]:
train.target.head()

# again this link was helpful: https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: target, dtype: object

In [16]:
train['target'].nunique() # number of unique entries 

2

In [17]:
# Now let's check missing values, if present, in the data.

nans = train.shape[0] - train.dropna().shape[0]
print("%d rows have missing values in the train data" %nans)

nand = test.shape[0] - test.dropna().shape[0]
print("%d rows have missing values in the test data" %nand)

2399 rows have missing values in the train data
1221 rows have missing values in the test data


In [18]:
# We should be more curious to know which columns have missing values

train.isnull().sum()

## here we see that only 3 columns, workclass, occupation, and native country, have missing values.

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

In [19]:
# Count the number of unqiue values from character variables:

cat = train.select_dtypes(include = ['O'])
cat.apply(pd.Series.nunique)



workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

In [30]:
# Check the modes (most often repeated value) of the variables that have na's.

train.mode() # whole data frame
train.loc[:,"education"].mode()
train.loc[:,"workclass"].mode()
train.loc[:,"occupation"].mode()
train.loc[:,"native.country"].mode()

# this link helped with this task: http://www.datasciencemadesimple.com/mode-function-python-pandas-dataframe-row-column-wise-mode/


0     United-States
dtype: object

In [31]:
# Since missing values are found in all 3 character variables, let's impute these missing values with their respective modes.

# Education
train.workclass.value_counts(sort = True)
train.workclass.fillna("Private", inplace = True)

# Occupation
train.occupation.value_counts(sort = True)
train.occupation.fillna('Prof-specialty', inplace = True)

# Native country
train['native.country'].value_counts(sort = True)
train['native.country'].fillna('United-States', inplace = True)

In [32]:
# Check again to see if there are any missing values.

train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

In [39]:
# Now check the target variable to investigate if the data are imbalanced or not.

# Check proportion of target variable.

train.target.value_counts()/train.shape[0]

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

In [40]:
# The chunk above tells you that 75% of the data set belongs to the <=50K class. Meaning, that even if we take a guess of target prediction as <=50K, we'll get 75% accuacy.
# Create a cross tabulation of the target variable with education. With this, we'll try to understand the influence of education on the target variable.

pd.crosstab(train.education, train.target, margins = True)/train.shape[0]

# From the output below we see that out of 75% of the people with <=50K salary, 27% are high school graduate (e.g., people with lower education are expected to earn less). 
# On the other hand, out of 25% of people with salaries >=50K, 6% are bachelors and 5% are high school grads. 

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,0.02675,0.001904,0.028654
11th,0.034243,0.001843,0.036086
12th,0.012285,0.001013,0.013298
1st-4th,0.004975,0.000184,0.00516
5th-6th,0.009736,0.000491,0.010227
7th-8th,0.018611,0.001228,0.01984
9th,0.014957,0.000829,0.015786
Assoc-acdm,0.024631,0.008139,0.032769
Assoc-voc,0.031357,0.011087,0.042443
Bachelors,0.09625,0.06821,0.164461


In [41]:
# Time to use the scikit learn library. Scikit learn accepts data in numeric format. We'll have to convert the charcter variable into numeric. We'll use the labelencoder function.
# In label encoding, each unique value of a variable gets assigned a number.
# E.g. a variable color that has 4 values, red, green ,blue and pink might be encoded as red = 2, green = 0, blue = 1, and pink = 3.

### I"M CONFUSED?! Won't changing these factors to numeric affect how they are interpreted (3 > 2)???

from sklearn import preprocessing 

In [42]:
for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[x].values))
        train[x] = lbl.transform(list(train[x].values))

In [43]:
# check the changes

train.head() # the variables have been converted to numeric, including the target variable.

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [44]:
train.target.value_counts() # these are now zeros or ones; 0 is under 50K.

0    24720
1     7841
Name: target, dtype: int64

In [45]:
# Building a random forest model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

y = train['target']
del train['target']

X = train
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 1, stratify = y)





In [47]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4


In [48]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
11662,47,0,329205,9,13,0,9,4,4,1,0,0,40,38
18348,34,3,169605,0,6,5,7,4,4,0,0,0,36,38
5962,38,6,107164,15,10,5,3,3,4,1,0,0,40,38
30669,41,4,177905,9,13,2,3,0,4,1,7688,0,70,38
6244,18,3,84253,1,7,4,7,3,4,0,0,0,24,38


In [49]:
X_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
11108,20,3,109813,1,7,4,12,2,4,1,0,0,40,38
10964,56,4,205601,10,16,2,9,0,4,1,99999,0,70,38
19773,49,5,79627,15,10,2,3,0,4,1,0,0,40,38
2431,61,5,244087,11,9,2,2,0,4,1,0,0,52,38
16412,42,3,172148,15,10,2,0,0,2,1,0,0,40,38


In [50]:
y_train.head()

11662    0
18348    0
5962     0
30669    1
6244     0
Name: target, dtype: int64

In [51]:
y_test.head()

11108    0
10964    1
19773    0
2431     1
16412    0
Name: target, dtype: int64

In [57]:
# Train the RF classifier

clf = RandomForestClassifier(n_estimators = 500, max_depth = 6)

clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                max_depth=6, max_features='auto', max_leaf_nodes=None,
                min_impurity_split=1e-07, min_samples_leaf=1,
                min_samples_split=2, min_weight_fraction_leaf=0.0,
                n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
                verbose=0, warm_start=False)

clf.predict(X_test)



array([0, 1, 0, ..., 0, 0, 0])

In [58]:
# Make a prediction and check the model's accuracy
prediction = clf.predict(X_test)
acc = accuracy_score(np.array(y_test), prediction)
print('The accuracy of Random Forest is {}'.format(acc))

The accuracy of Random Forest is 0.8526973078104207


In [None]:
# this tells us that our learning algorithm gave 85% accuracy. We can do many things to improve accuracy.
# In this tutorial, we divided the train data into two halves and made prediction on the test data. As your exercise, you should use this model and make prediction on the test data we loaded initially. You can perform same set of steps we did on the train data to complete this exercise.
# https://www.hackerearth.com/practice/machine-learning/data-manipulation-visualisation-r-python/tutorial-data-manipulation-numpy-pandas-python/tutorial/