<b>Dataset location: </b>https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

The data is in the format value1, value2... <br />
The leading whitespace for each value needs to be removed

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('Predicting whether a person\'s income is greater than $50K') \
    .getOrCreate()

rawData = spark.read\
            .format('csv')\
            .option('header', 'false')\
            .option('ignoreLeadingWhiteSpace', 'true')\
            .load('../datasets/adult.csv')

#### Specify column headers for data set

In [50]:
rawData.show(2)

+---+----------------+-----+---------+---+------------------+---------------+-------------+-----+----+----+----+----+-------------+-----+
|_c0|             _c1|  _c2|      _c3|_c4|               _c5|            _c6|          _c7|  _c8| _c9|_c10|_c11|_c12|         _c13| _c14|
+---+----------------+-----+---------+---+------------------+---------------+-------------+-----+----+----+----+----+-------------+-----+
| 39|       State-gov|77516|Bachelors| 13|     Never-married|   Adm-clerical|Not-in-family|White|Male|2174|   0|  40|United-States|<=50K|
| 50|Self-emp-not-inc|83311|Bachelors| 13|Married-civ-spouse|Exec-managerial|      Husband|White|Male|   0|   0|  13|United-States|<=50K|
+---+----------------+-----+---------+---+------------------+---------------+-------------+-----+----+----+----+----+-------------+-----+
only showing top 2 rows



In [51]:
dataset = rawData.toDF('Age',
               'WorkClass',
               'FnlWgt',
               'Education',
               'EducationNum',
               'MaritalStatus',
               'Occupation',
               'Relationship',
               'Race',
               'Gender',
               'CapitalGain',
               'CapitalLoss',
               'HoursPerWeek',
               'NativeCountry',
               'Label'
                )

In [52]:
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Drop FnlWgt column which does not appear meaningful

In [53]:
dataset = dataset.drop('FnlWgt')

#### Examine the dataset
* The FnlWgt column has been dropped
* There are missing values in the data represented by '?' (e.g. line 32541 for column WorkClass)

In [54]:
#dataset.toPandas()

#### Count rows in dataset

In [55]:
dataset.count()

32561

#### Convert missing values to null
Missing values in this dataset are represented by ?

In [56]:
dataset = dataset.replace('?', None)

#### Drop all rows which contain even a single missing value
The value 'any' for parameter how specifies that even a single missing value in a row should result in it being dropped (as opposed to 'all' where all values need to be missing)

In [57]:
dataset = dataset.dropna(how='any')

#### Number of rows has reduced now

In [58]:
dataset.count()

30162

#### Confirm missing value rows are not there
Row 32541 for example

In [59]:
x = dataset.toPandas()
x.isna().sum().sum()#toPandas()

0

#### View the data types for all the columns
Since they have all been loaded as Strings, we need to convert the numeric fields to Float

In [60]:
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, Education: string, EducationNum: string, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [61]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

dataset = dataset.withColumn('Age', 
                             dataset['Age'].cast(FloatType()))
dataset = dataset.withColumn('EducationNum', 
                             dataset['EducationNum'].cast(FloatType()))
dataset = dataset.withColumn('CapitalGain', 
                             dataset['CapitalGain'].cast(FloatType()))
dataset = dataset.withColumn('CapitalLoss', 
                             dataset['CapitalLoss'].cast(FloatType()))
dataset = dataset.withColumn('HoursPerWeek', 
                             dataset['HoursPerWeek'].cast(FloatType()))

dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


#### Transform categorical fields
First use StringIndexer to convert categorical values to indices

In [62]:
from pyspark.ml.feature import StringIndexer

indexedDF = StringIndexer(
    inputCol='WorkClass', 
    outputCol='WorkClass_index').fit(dataset).transform(dataset)

#### A new column called WorkClass_index is created
This stores the indexed values of WorkClass

In [63]:
indexedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0


#### OneHotEncoding
Use the new indexed field to obtain a one-hot-encoded field

In [64]:
from pyspark.ml.feature import OneHotEncoder

encodedDF = OneHotEncoder(
    inputCol="WorkClass_index", 
    outputCol="WorkClass_encoded").transform(indexedDF)

#### A WorkClass_encoded field is created 
* This contains the one-hot-encoding for WorkClass
* This cannot operate directly on a column with string values - values need to be numeric. Hence we use the WorkClass_index as input

In [65]:
encodedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index,WorkClass_encoded
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


#### View the original and transformed fields together

In [66]:
encodedDF.select('WorkClass', 'WorkClass_index', 'WorkClass_encoded')\
         .toPandas()\
         .head()

Unnamed: 0,WorkClass,WorkClass_index,WorkClass_encoded
0,State-gov,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
1,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)"
2,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


### Transform the entire dataset
* So far we have only transformed a single column
* We need to perform this transformation for every categorical and non-numeric column
* This will be simplified by using a Pipeline (a feature of Spark ML)

####  First, split the data into training and test sets

In [67]:
(trainingData, testData) = dataset.randomSplit([0.8,0.2])

#### Encode all the categorical fields in the dataset
We begin by listing all the categorical fields

In [68]:
categoricalFeatures = [
               'WorkClass',
               'Education',
               'MaritalStatus',
               'Occupation',
               'Relationship',
               'Race',
               'Gender',
               'NativeCountry'
]

#### Create an array of StringIndexers to convert the categorical values to indices

**`keep`** helps to create a new index if we come across a new value in the test data that has not been seen in training

In [69]:
indexers = [StringIndexer(
    inputCol=column, 
    outputCol=column + '_index', 
    handleInvalid='keep') for column in categoricalFeatures]

#### Create an array of OneHotEncoders to encode the categorical values

In [70]:
encoders = [OneHotEncoder(
    inputCol=column + '_index', 
    outputCol= column + '_encoded') for column in categoricalFeatures]

#### Index the Label field

In [72]:
# Target shouldn't be OHE
labelIndexer = [StringIndexer(
    inputCol='Label', outputCol='Label_index')]

#### Create a pipeline
The pipeline contains the array of StringIndexers and OneHotEncoders

In [73]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers + encoders + labelIndexer)

#### View the result of the transformations performed by this pipeline
This pipeline can transform our dataset into a format which can be used by our model

In [74]:
transformedDF = pipeline.fit(trainingData).transform(trainingData)
transformedDF.toPandas().tail()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,NativeCountry_index,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index
24109,90.0,Private,Some-college,10.0,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0.0,...,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24110,90.0,Private,Some-college,10.0,Separated,Adm-clerical,Own-child,White,Female,0.0,...,7.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",0.0
24111,90.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,10566.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24112,90.0,Self-emp-not-inc,HS-grad,9.0,Never-married,Exec-managerial,Not-in-family,White,Male,2964.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24113,90.0,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


#### Select the required features
At this point the dataset contains a lot of additional columns. We select the features needed by our model

In [75]:
# Select numerical features + OHE features
requiredFeatures = [
    'Age',
    'EducationNum',
    'CapitalGain',
    'CapitalLoss',
    'HoursPerWeek',
    'WorkClass_encoded',
    'Education_encoded',
    'MaritalStatus_encoded',
    'Occupation_encoded',
    'Relationship_encoded',
    'Race_encoded',
    'Gender_encoded',
    'NativeCountry_encoded'
]

#### VectorAssembler
VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector
* We had previously written our own function to create such a vector

In [76]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=requiredFeatures,
                                    outputCol='features')

In [77]:
transformedDF = assembler.transform(transformedDF)
transformedDF.toPandas().tail(2)

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features
24112,90.0,Self-emp-not-inc,HS-grad,9.0,Never-married,Exec-managerial,Not-in-family,White,Male,2964.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 9.0, 2964.0, 0.0, 12.0, 0.0, 1.0, 0.0, ..."
24113,90.0,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0...."


In [78]:
transformedDF.select('features').toPandas().tail(2)

Unnamed: 0,features
24112,"(90.0, 9.0, 2964.0, 0.0, 12.0, 0.0, 1.0, 0.0, ..."
24113,"(90.0, 10.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0...."


#### Specify our estimator

In [79]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Label_index', 
                            featuresCol='features',
                            maxDepth=5)

#### Final Pipeline
* The pipeline we built previously only transformed the feature columns
* We re-create the pipeline to include the VectorAssembler and the estimator

The pipeline to be used to build the model contains all the transformers and ends with the estimator

In [80]:
pipeline = Pipeline(
    stages=indexers + encoders + labelIndexer + [assembler, rf]
)

#### Train the model

In [81]:
model = pipeline.fit(trainingData)

#### Use the test data for predictions

In [82]:
predictions = model.transform(testData)
predictionsDF = predictions.toPandas()
predictionsDF.head(2)

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
0,17.0,Local-gov,11th,7.0,Never-married,Craft-repair,Own-child,White,Male,0.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 35.0, 0.0, 0.0, 1.0, 0.0...","[18.89038715509818, 1.1096128449018263]","[0.9445193577549087, 0.055480642245091305]",0.0
1,17.0,Local-gov,11th,7.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 20.0, 0.0, 0.0, 1.0, 0.0...","[19.03257139151009, 0.9674286084899089]","[0.9516285695755047, 0.04837143042449545]",0.0


#### Select the correct label and predictions to evaluate the model

In [83]:
predictions = predictions.select(
    'Label_index',
    'prediction'
)

#### Create an evaluator for our model

In [84]:
# metricNames: precision, recall, weightedPrecision, weightedRecall and f1
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol='Label_index', 
    predictionCol='prediction', 
    metricName='accuracy')

#### Check the accuracy

In [85]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.8250661375661376


#### Examine incorrect predictions
we can see the probability of classes are very close when  we have misclassification

In [86]:
predictionsDF.loc[
    predictionsDF['Label_index'] != predictionsDF['prediction']
].head(4)

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
164,19.0,Private,7th-8th,4.0,Never-married,Other-service,Not-in-family,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(19.0, 4.0, 0.0, 0.0, 60.0, 1.0, 0.0, 0.0, 0.0...","[18.02692244853072, 1.9730775514692793]","[0.901346122426536, 0.09865387757346397]",0.0
500,21.0,Private,Some-college,10.0,Never-married,Protective-serv,Not-in-family,Black,Female,99999.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(21.0, 10.0, 99999.0, 0.0, 40.0, 1.0, 0.0, 0.0...","[10.8760607951167, 9.123939204883298]","[0.543803039755835, 0.4561969602441649]",0.0
778,23.0,Private,Some-college,10.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(23.0, 10.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0....","[15.33801164872073, 4.661988351279271]","[0.7669005824360365, 0.23309941756396352]",0.0
866,24.0,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 13.0, 0.0, 0.0, 55.0, 1.0, 0.0, 0.0, 0....","[10.312564115059882, 9.687435884940117]","[0.5156282057529941, 0.4843717942470058]",0.0
