In [1]:
# Import libraries 

import numpy as np # linear algebra
import pandas as pd # data processing, 

# Libraries for data visualization
import matplotlib.pyplot as pplt  
#import seaborn as sns
from pandas.plotting import scatter_matrix

# Import scikit_learn module for the algorithm/model: Linear Regression
from sklearn.linear_model import LogisticRegression
# Import scikit_learn module to split the dataset into train.test sub-datasets
from sklearn.model_selection import train_test_split 
# Import scikit_learn module for k-fold cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# import the metrics class
from sklearn import metrics
# import stats for accuracy 
#import statsmodels.api as sm

In [2]:
#load the dataset provided
adult_data = pd.read_csv('./data/adult.data')
# salary dataset info to find columns and count of the data
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   39              32560 non-null  int64 
 1    State-gov      32560 non-null  object
 2    77516          32560 non-null  int64 
 3    Bachelors      32560 non-null  object
 4    13             32560 non-null  int64 
 5    Never-married  32560 non-null  object
 6    Adm-clerical   32560 non-null  object
 7    Not-in-family  32560 non-null  object
 8    White          32560 non-null  object
 9    Male           32560 non-null  object
 10   2174           32560 non-null  int64 
 11   0              32560 non-null  int64 
 12   40             32560 non-null  int64 
 13   United-States  32560 non-null  object
 14   <=50K          32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [3]:
# Column names to be added
column_names = ['age',
                'workclass',
                'final weight',
                'education',
                'education num',
                'marital status',
                'occupation',
                'relationship',
                'race',
                'sex',
                'capital gain',
                'capital loss',
                'hours per week',
                'native country',
                'salary']
# Create DataFrame by assigning column names
df = pd.DataFrame(adult_data, columns = column_names)

# Add column names while reading a CSV file
df = pd.read_csv('./data/adult.data', names=column_names)

# Add column names to existing DataFrame
df.columns = column_names

# View result
df

Unnamed: 0,age,workclass,final weight,education,education num,marital status,occupation,relationship,race,sex,capital gain,capital loss,hours per week,native country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
#Finding the special characters in the data frame
df.isin([' ?']).sum(axis=0)

age                  0
workclass         1836
final weight         0
education            0
education num        0
marital status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital gain         0
capital loss         0
hours per week       0
native country     583
salary               0
dtype: int64

In [None]:
# We replace the special character to nan, and then we drop the columns
df['workclass'] = df['workclass'].replace(' ?',np.nan)
df['occupation'] = df['occupation'].replace(' ?',np.nan)
df['native country'] = df['native country'].replace(' ?',np.nan)
# Dropping the NaN rows now
df.dropna(how='any',inplace=True)

# We run a loop of value_counts of each column to find out unique values.
for c in df.columns:
    print ("---- %s ---" % c)
    print (df[c].value_counts())
    
#Result
df

In [None]:
#Dropping based on uniquness of data from the dataset
#We only keep the categorical data
df.drop(['age',
         'final weight',
         'education num',
         'hours per week',
         'capital gain',
         'capital loss',
         'native country'], axis=1, inplace=True)

# View result with only the categorical data
df

In [None]:
#Mapping the data into numerical data using map function
#We replace every ' <=50K' with value 0 and every ' >50K' with value 1
df['salary'] = df['salary'].map({' <=50K': 0, ' >50K': 1}).astype(int)

#We are going to convert the other categorical data
#The fonction value_counts() help us find out the unique value

#Sex
#We replace every ' Male' with value 0 and ' Female' with value 1
df['sex'] = df['sex'].map({' Male': 0, ' Female': 1}).astype(int)

#Race
#We replace every 'White' with value 0, 'Black' with value 1, 'Asian-Pac-Islander' with value 2,
#' Amer-Indian-Eskimo' with value 3 and ' Other' with value 4
df['race'] = df['race'].map({' White': 0,
                             ' Black': 1,
                            ' Asian-Pac-Islander': 2,
                            ' Amer-Indian-Eskimo': 3,
                            ' Other': 4}).astype(int)
#df["race"].value_counts()

#Relationship
#We replace " Husband" with value0, " Wife" with value 1, " Not-in-family" with value 2,
# " Own-child" with value 3, " Unmarried" with value 4 and " Other-relative" with value 5
df["relationship"] = df["relationship"].map({" Husband": 0,
                                             " Wife": 1,
                                             " Not-in-family": 2,
                                             " Own-child": 3,
                                             " Unmarried": 4,
                                             " Other-relative": 5}).astype(int)
#df["relationship"].value_counts()

#Occupation
#We replace " Prof-specialty" with value 0, " Craft-repair" with value 1,
# " Exec-managerial" with value 2, " Adm-clerical" with value 3," Sales" with value 4,
# " Machine-op-inspct" with value 5, " Transport-moving" with value 6, " Handlers-cleaners" with value 7,
# " Farming-fishing" with value 8, " Tech-support" with value 9, " Protective-serv" with value 10,
# " Priv-house-serv" with value 11, " Armed-Forces" with value 12 and " Other-service" with value 13
df["occupation"] = df["occupation"].map({" Prof-specialty": 0,
                                         " Craft-repair": 1,
                                         " Exec-managerial": 2,
                                         " Adm-clerical": 3,
                                         " Sales": 4,
                                         " Machine-op-inspct": 5,
                                         " Transport-moving": 6,
                                         " Handlers-cleaners": 7,
                                         " Farming-fishing": 8,
                                         " Tech-support": 9,
                                         " Protective-serv": 10,
                                         " Priv-house-serv": 11,
                                         " Armed-Forces": 12,
                                         " Other-service": 13}).astype(int)
#df["occupation"].value_counts()

#Marital status
#We replace " Married-civ-spouse" with value 0, " Never-married" with value 1, " Divorced" with value 2,
# " Separated" with value 3, " Widowed" with value 4, " Married-spouse-absent" with value 5
# and " Married-AF-spouse" with value 6
df["marital status"] = df["marital status"].map({" Married-civ-spouse": 0,
                                                 " Never-married": 1,
                                                 " Divorced": 2,
                                                 " Separated": 3,
                                                 " Widowed": 4,
                                                 " Married-spouse-absent": 5,
                                                 " Married-AF-spouse": 6}).astype(int)
#df["marital status"].value_counts()

#Education
#We replace ' Some-college' with value 0, ' Preschool' with value 1, ' 5th-6th' with value 2,
# ' HS-grad' with value 3, ' Masters' with value 4, ' 12th' with value 5, ' 7th-8th' with value 6,
# ' Prof-school' with value 7, ' 1st-4th' with value 8, ' Assoc-acdm' with value 9,
# ' Doctorate' with value 10, ' 11th' with value 11, ' Bachelors' with value 12, ' 10th' with value 13,
# ' Assoc-voc' with value 14 and ' 9th' with value 15
df['education'] = df['education'].map({' Some-college': 0,
                                       ' Preschool': 1,
                                       ' 5th-6th': 2,
                                       ' HS-grad': 3,
                                       ' Masters': 4,
                                       ' 12th': 5,
                                       ' 7th-8th': 6,
                                       ' Prof-school': 7,
                                       ' 1st-4th': 8,
                                       ' Assoc-acdm': 9,
                                       ' Doctorate': 10,
                                       ' 11th': 11,
                                       ' Bachelors': 12,
                                       ' 10th': 13,
                                       ' Assoc-voc': 14,
                                       ' 9th': 15}).astype(int)
#df["education"].value_counts()

#Workclass
#We replace ' Self-emp-inc' with value 0, ' State-gov' with value 1, ' Federal-gov' with value 2,
# ' Without-pay' with value 3, ' Local-gov' with value 4, ' Private' with value 5,
# and ' Self-emp-not-inc' with value 6
df['workclass'] = df['workclass'].map({' Self-emp-inc': 0,
                                       ' State-gov': 1,
                                       ' Federal-gov': 2,
                                       ' Without-pay': 3,
                                       ' Local-gov': 4,
                                       ' Private': 5,
                                       ' Self-emp-not-inc': 6}).astype(int)
#df["workclass"].value_counts()

#Result with modification
df