In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Get the active SparkContext / SparkSession, creating them if none exist yet,
# so this cell can be re-run without starting a second context.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [2]:
def toDoubleSafe(v):
    """Return ``v`` as a float when it parses as one, otherwise as a string.

    Only ``ValueError`` from ``float(v)`` is handled; any other exception
    raised by the conversion propagates to the caller.
    """
    try:
        as_float = float(v)
    except ValueError:
        # Not a numeric value: keep it in its string form.
        return str(v)
    else:
        return as_float

## Create an RDD

In [3]:
# Load the raw census file (minimum of 4 partitions), split every line on ", "
# and convert each field to float where possible (other fields stay strings).
census_raw = sc.textFile("../Data/adult.raw", 4).map(
    lambda line: [toDoubleSafe(field) for field in line.split(", ")]
)

## Convert the RDD to DataFrame.


In [4]:
from pyspark.sql.types import *
# Schema of the census records: one nullable field per column, in file order.
adultschema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("age", DoubleType),
        ("workclass", StringType),
        ("fnlwgt", DoubleType),
        ("education", StringType),
        ("marital_status", StringType),
        ("occupation", StringType),
        ("relationship", StringType),
        ("race", StringType),
        ("sex", StringType),
        ("capital_gain", DoubleType),
        ("capital_loss", DoubleType),
        ("hours_per_week", DoubleType),
        ("native_country", StringType),
        ("income", StringType),
    ]
])


In [5]:
# Build the raw DataFrame from the parsed RDD, applying the explicit schema.
dfraw = ss.createDataFrame(census_raw, adultschema)

In [6]:
# Preview the first 10 rows of the raw DataFrame.
dfraw.show(10)

+----+----------------+--------+---------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| age|       workclass|  fnlwgt|education|      marital_status|       occupation| relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+----------------+--------+---------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|39.0|       State-gov| 77516.0|Bachelors|       Never-married|     Adm-clerical|Not-in-family|White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0|Self-emp-not-inc| 83311.0|Bachelors|  Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|         Private|215646.0|  HS-grad|            Divorced|Handlers-cleaners|Not-in-family|White|  Male|         0.0|         0.0|      

In [7]:
# Check the most common values: per-value row counts, most frequent first,
# for each of the three columns inspected below.
for column_name in ("workclass", "occupation", "native_country"):
    dfraw.groupBy(column_name).count().orderBy("count", ascending=False).show()

+----------------+-----+
|       workclass|count|
+----------------+-----+
|         Private|33906|
|Self-emp-not-inc| 3862|
|       Local-gov| 3136|
|               ?| 2799|
|       State-gov| 1981|
|    Self-emp-inc| 1695|
|     Federal-gov| 1432|
|     Without-pay|   21|
|    Never-worked|   10|
+----------------+-----+

+-----------------+-----+
|       occupation|count|
+-----------------+-----+
|   Prof-specialty| 6172|
|     Craft-repair| 6112|
|  Exec-managerial| 6086|
|     Adm-clerical| 5611|
|            Sales| 5504|
|    Other-service| 4923|
|Machine-op-inspct| 3022|
|                ?| 2809|
| Transport-moving| 2355|
|Handlers-cleaners| 2072|
|  Farming-fishing| 1490|
|     Tech-support| 1446|
|  Protective-serv|  983|
|  Priv-house-serv|  242|
|     Armed-Forces|   15|
+-----------------+-----+

+------------------+-----+
|    native_country|count|
+------------------+-----+
|     United-States|43832|
|            Mexico|  951|
|                 ?|  857|
|       Philippin

## Clean the data. 

### Missing data imputation.


In [8]:
# Missing-data imputation: replace the "?" placeholder in each column with
# the most common value of that column (taken from the counts shown above).
dfrawrp = dfraw.replace("?", "Private", subset=["workclass"])
dfrawrpl = dfrawrp.replace("?", "Prof-specialty", subset=["occupation"])
dfrawnona = dfrawrpl.replace("?", "United-States", subset=["native_country"])

In [9]:
# Preview the DataFrame after the "?" placeholders have been replaced.
dfrawnona.show()

+----+----------------+--------+------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
| age|       workclass|  fnlwgt|   education|      marital_status|       occupation| relationship|              race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+----+----------------+--------+------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|39.0|       State-gov| 77516.0|   Bachelors|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|      2174.0|         0.0|          40.0| United-States| <=50K|
|50.0|Self-emp-not-inc| 83311.0|   Bachelors|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|         0.0|         0.0|          13.0| United-States| <=50K|
|38.0|         Private|215646.0|     HS-grad|            Div

### Convert strings to categorical values

In [10]:
#converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each column named in ``cols`` with its StringIndexer output.

    For every column, a StringIndexer is fitted on the current DataFrame and
    writes the indexed values to a temporary "<col>-num" column. The original
    column is then dropped and the temporary column renamed back to "<col>".

    Args:
        df: input DataFrame (not modified).
        cols: names of the columns to index.

    Returns:
        A new DataFrame in which every column in ``cols`` holds index values.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        # Fit on the current frame so earlier replacements are already applied.
        indexer_model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        indexed = (
            indexer_model.transform(indexed)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return indexed

# Index every string-typed column, including the "income" label.
dfnumeric = indexStringColumns(
    dfrawnona,
    [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native_country",
        "income",
    ],
)

In [11]:
# Preview the DataFrame after the string columns have been replaced by indices.
dfnumeric.show()

+----+--------+------------+------------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|workclass|education|marital_status|occupation|relationship|race|sex|native_country|income|
+----+--------+------------+------------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|      3.0|      2.0|           1.0|       3.0|         1.0| 0.0|0.0|           0.0|   0.0|
|50.0| 83311.0|         0.0|         0.0|          13.0|      1.0|      2.0|           0.0|       2.0|         0.0| 0.0|0.0|           0.0|   0.0|
|38.0|215646.0|         0.0|         0.0|          40.0|      0.0|      0.0|           2.0|       8.0|         1.0| 0.0|0.0|           0.0|   0.0|
|53.0|234721.0|         0.0|         0.0|          40.0|      0.0|      5.0|           0.0|       8.0|         0.0| 1.

In [12]:
from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df, cols):
    """One-hot encode each index column named in ``cols``, keeping its name.

    For every column, the encoded vector is written to a temporary
    "<col>-onehot" column. The original column is then dropped and the
    temporary column renamed back to "<col>".

    Works with both OneHotEncoder APIs: the Spark 2.x Transformer (only
    ``transform``) and the Spark 3.x Estimator (``fit`` returns the model
    that provides ``transform``).

    Args:
        df: input DataFrame (not modified).
        cols: names of the index columns to encode.

    Returns:
        A new DataFrame in which every column in ``cols`` holds a one-hot vector.
    """
    newdf = df
    for c in cols:
        onehot_name = c + "-onehot"
        # dropLast: whether to drop the last category in the encoded vector
        # (default: true). It is disabled here so every category keeps a slot.
        onehotenc = OneHotEncoder(inputCol=c, outputCol=onehot_name, dropLast=False)
        # Spark 3.x: OneHotEncoder is an Estimator and must be fitted first.
        # Spark 2.x: it is already a Transformer and has no fit().
        if hasattr(onehotenc, "fit"):
            encoder = onehotenc.fit(newdf)
        else:
            encoder = onehotenc
        newdf = encoder.transform(newdf).drop(c).withColumnRenamed(onehot_name, c)
    return newdf

# One-hot encode the multi-valued categorical columns; "sex" and "income" are
# not in the list and keep their index values.
dfhot = oneHotEncodeColumns(
    dfnumeric,
    [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "native_country",
    ],
)

In [13]:
# Preview the DataFrame after one-hot encoding.
dfhot.show()

+----+--------+------------+------------+--------------+---+------+-------------+---------------+--------------+--------------+-------------+-------------+---------------+
| age|  fnlwgt|capital_gain|capital_loss|hours_per_week|sex|income|    workclass|      education|marital_status|    occupation| relationship|         race| native_country|
+----+--------+------------+------------+--------------+---+------+-------------+---------------+--------------+--------------+-------------+-------------+---------------+
|39.0| 77516.0|      2174.0|         0.0|          40.0|0.0|   0.0|(8,[3],[1.0])| (16,[2],[1.0])| (7,[1],[1.0])|(14,[3],[1.0])|(6,[1],[1.0])|(5,[0],[1.0])| (41,[0],[1.0])|
|50.0| 83311.0|         0.0|         0.0|          13.0|0.0|   0.0|(8,[1],[1.0])| (16,[2],[1.0])| (7,[0],[1.0])|(14,[2],[1.0])|(6,[0],[1.0])|(5,[0],[1.0])| (41,[0],[1.0])|
|38.0|215646.0|         0.0|         0.0|          40.0|0.0|   0.0|(8,[0],[1.0])| (16,[0],[1.0])| (7,[2],[1.0])|(14,[8],[1.0])|(6,[1],[1.0])

### Create a feature vector

In [14]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
input_cols = [
    "age",
    "capital_gain",
    "capital_loss",
    "fnlwgt",
    "hours_per_week",
    "sex",
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "native_country",
    "race",
]

# VectorAssembler takes a list of column names (inputCols) and an output
# column name (outputCol), and assembles the input values into one vector.
va = VectorAssembler(inputCols=input_cols, outputCol="features")
# lpoints - labeled data: the feature vector plus "income" renamed to "label".
lpoints = (
    va.transform(dfhot)
    .select("features", "income")
    .withColumnRenamed("income", "label")
)

In [15]:
# Look at the first 5 labeled rows as Row objects (shows the full sparse vectors).
lpoints.rdd.take(5)

[Row(features=SparseVector(103, {0: 39.0, 1: 2174.0, 3: 77516.0, 4: 40.0, 9: 1.0, 16: 1.0, 31: 1.0, 40: 1.0, 52: 1.0, 57: 1.0, 98: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 50.0, 3: 83311.0, 4: 13.0, 7: 1.0, 16: 1.0, 30: 1.0, 39: 1.0, 51: 1.0, 57: 1.0, 98: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 38.0, 3: 215646.0, 4: 40.0, 6: 1.0, 14: 1.0, 32: 1.0, 45: 1.0, 52: 1.0, 57: 1.0, 98: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 53.0, 3: 234721.0, 4: 40.0, 6: 1.0, 19: 1.0, 30: 1.0, 45: 1.0, 51: 1.0, 57: 1.0, 99: 1.0}), label=0.0),
 Row(features=SparseVector(103, {0: 28.0, 3: 338409.0, 4: 40.0, 5: 1.0, 6: 1.0, 16: 1.0, 30: 1.0, 37: 1.0, 55: 1.0, 65: 1.0, 99: 1.0}), label=0.0)]

In [16]:
# Tabular preview of the labeled data.
lpoints.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(103,[0,1,3,4,9,1...|  0.0|
|(103,[0,3,4,7,16,...|  0.0|
|(103,[0,3,4,6,14,...|  0.0|
|(103,[0,3,4,6,19,...|  0.0|
|(103,[0,3,4,5,6,1...|  0.0|
|(103,[0,3,4,5,6,1...|  0.0|
|(103,[0,3,4,5,6,2...|  0.0|
|(103,[0,3,4,7,14,...|  1.0|
|(103,[0,1,3,4,5,6...|  1.0|
|(103,[0,1,3,4,6,1...|  1.0|
|(103,[0,3,4,6,15,...|  1.0|
|(103,[0,3,4,9,16,...|  1.0|
|(103,[0,3,4,5,6,1...|  0.0|
|(103,[0,3,4,6,20,...|  0.0|
|(103,[0,3,4,6,18,...|  1.0|
|(103,[0,3,4,6,22,...|  0.0|
|(103,[0,3,4,7,14,...|  0.0|
|(103,[0,3,4,6,14,...|  0.0|
|(103,[0,3,4,6,19,...|  0.0|
|(103,[0,3,4,5,7,1...|  1.0|
+--------------------+-----+
only showing top 20 rows



## Divide the dataset into training and testing sets.

In [17]:
# Divide the dataset into training (~80%) and validation (~20%) sets.
# The fixed seed keeps the split, and every metric computed from it,
# the same from one run of the notebook to the next.
splits = lpoints.randomSplit([0.8, 0.2], seed=42)

# cache(): the training algorithm is iterative, so the training and validation
# sets are going to be reused many times.
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

In [18]:
# Persist both splits as tables. mode("overwrite") replaces tables left over
# from a previous run; the default save mode raises an error when the table
# already exists, which makes this cell fail on re-run.
adulttrain.write.mode("overwrite").saveAsTable("adulttrain")
adultvalid.write.mode("overwrite").saveAsTable("adultvalid")

## Train the model.

In [19]:
#Train the model.
from pyspark.ml.classification import LogisticRegression
# Configure the estimator, then fit it on the cached training set.
lr = LogisticRegression(
    fitIntercept=True,
    maxIter=1000,
    regParam=0.01,
)
lrmodel = lr.fit(adulttrain)
# The same can be written by setting the parameters after construction:
#lr = LogisticRegression()
#lrmodel = lr.setParams(regParam=0.01, maxIter=1000, fitIntercept=True).fit(adulttrain)

## Interpret the model parameters.

In [20]:
# Interpret the model parameters: print the fitted coefficient vector
# and the intercept term of the model.
print(lrmodel.coefficients)
print(lrmodel.intercept)

[0.018845931873333527,0.00014117726300889692,0.0005420090732477371,6.811362299080358e-07,0.027553596470840415,-0.533435453996313,0.036908507162697786,-0.3426115532340778,0.01162573033965854,-0.1647756175230354,0.22177430572982113,0.60712304893352,-0.1837908831874839,-1.3301213639036116,-0.367063257414429,-0.012340396730868503,0.7529391088145894,1.1523632498878413,0.15751034567132394,-0.9167908987223913,0.2131295966879849,-1.1427752161860065,-1.384072882749018,1.6212286567831247,-1.2396112642183577,-0.5708447990501982,1.693083714799912,-1.260741574143656,-1.5216556881734835,-1.790991276225644,0.8271481184965022,-0.6847713286332966,-0.2748867157641572,-0.34659892656222585,-0.2823935121164496,-0.32135025306078585,1.089205792851293,0.21943430605459455,0.06589209152828732,0.6635146447085448,-0.05093329858721947,0.24031746437995777,-0.7523053400899289,-0.3447147058043936,-0.15028002517001668,-0.686867732437323,-0.8309230588870046,0.48994556166268904,0.3998027455124984,-1.0587781417554925,0.3

In [21]:
# Evaluate the model on the held-out validation set.
# First, transform the validation set: this adds the rawPrediction,
# probability and prediction columns next to features and label.
validpredicts = lrmodel.transform(adultvalid)
validpredicts.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(103,[0,1,3,4,5,6...|  1.0|[-0.4097835424099...|[0.39896402475665...|       1.0|
|(103,[0,1,3,4,5,6...|  1.0|[-0.0257375403803...|[0.49356597007068...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[4.03361344555393...|[0.98259797440904...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[3.01148801659483...|[0.95309042702372...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[3.14598666348974...|[0.95875029359034...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[4.66221281531713...|[0.99064284533211...|       0.0|
|(103,[0,1,3,4,5,6...|  1.0|[0.12901501630603...|[0.53220909011043...|       0.0|
|(103,[0,1,3,4,5,6...|  1.0|[-0.3969319268858...|[0.40204970010218...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[0.11666520955840...|[0.52913326603171...|       0.0|
|(103,[0,1,3,4,5

## Output
rawPrediction : a vector of two values - the raw scores (log-odds) that a sample does not and does belong to the category (making > 50,000). The second value is the negative of the first.

probability : a vector of two values - the probability that the sample is not in the category, followed by the probability that it is.

prediction : the predicted class label - 1.0 if the sample is predicted to belong to the category, 0.0 otherwise.

In [22]:
# Inspect a sample of the raw scores. take(20) brings back only the first
# 20 rows; collect() would pull the whole column to the driver and flood the output.
validpredicts.select("rawPrediction").take(20)

[Row(rawPrediction=DenseVector([-0.4098, 0.4098])),
 Row(rawPrediction=DenseVector([-0.0257, 0.0257])),
 Row(rawPrediction=DenseVector([4.0336, -4.0336])),
 Row(rawPrediction=DenseVector([3.0115, -3.0115])),
 Row(rawPrediction=DenseVector([3.146, -3.146])),
 Row(rawPrediction=DenseVector([4.6622, -4.6622])),
 Row(rawPrediction=DenseVector([0.129, -0.129])),
 Row(rawPrediction=DenseVector([-0.3969, 0.3969])),
 Row(rawPrediction=DenseVector([0.1167, -0.1167])),
 Row(rawPrediction=DenseVector([1.6962, -1.6962])),
 Row(rawPrediction=DenseVector([4.697, -4.697])),
 Row(rawPrediction=DenseVector([5.6833, -5.6833])),
 Row(rawPrediction=DenseVector([-14.281, 14.281])),
 Row(rawPrediction=DenseVector([-0.2612, 0.2612])),
 Row(rawPrediction=DenseVector([1.9357, -1.9357])),
 Row(rawPrediction=DenseVector([2.1555, -2.1555])),
 Row(rawPrediction=DenseVector([1.8844, -1.8844])),
 Row(rawPrediction=DenseVector([0.0135, -0.0135])),
 Row(rawPrediction=DenseVector([-0.4008, 0.4008])),
 Row(rawPrediction

In [23]:
# Inspect a sample of the class probabilities. take(20) brings back only the
# first 20 rows; collect() would pull the whole column to the driver and flood the output.
validpredicts.select("probability").take(20)

[Row(probability=DenseVector([0.399, 0.601])),
 Row(probability=DenseVector([0.4936, 0.5064])),
 Row(probability=DenseVector([0.9826, 0.0174])),
 Row(probability=DenseVector([0.9531, 0.0469])),
 Row(probability=DenseVector([0.9588, 0.0412])),
 Row(probability=DenseVector([0.9906, 0.0094])),
 Row(probability=DenseVector([0.5322, 0.4678])),
 Row(probability=DenseVector([0.402, 0.598])),
 Row(probability=DenseVector([0.5291, 0.4709])),
 Row(probability=DenseVector([0.845, 0.155])),
 Row(probability=DenseVector([0.991, 0.009])),
 Row(probability=DenseVector([0.9966, 0.0034])),
 Row(probability=DenseVector([0.0, 1.0])),
 Row(probability=DenseVector([0.4351, 0.5649])),
 Row(probability=DenseVector([0.8739, 0.1261])),
 Row(probability=DenseVector([0.8962, 0.1038])),
 Row(probability=DenseVector([0.8681, 0.1319])),
 Row(probability=DenseVector([0.5034, 0.4966])),
 Row(probability=DenseVector([0.4011, 0.5989])),
 Row(probability=DenseVector([0.8609, 0.1391])),
 Row(probability=DenseVector([0.17

## Evaluate the model.

In [24]:
#Evaluate the model. default metric : Area Under ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# A freshly constructed evaluator uses its default metric; print the metric
# name together with its value on the validation predictions.
bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(validpredicts)))

areaUnderROC:0.901769284628039


In [25]:
# Evaluate the model again with the metric switched to area under the PR curve.
# NOTE: setMetricName changes `bceval` in place, so any later use of `bceval`
# computes areaUnderPR as well.
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(validpredicts)))

areaUnderPR:0.7441970509619391


### n-fold validation and the results.

In [26]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
# ParamGridBuilder() - combinations of parameters and their values to try:
# maxIter is fixed at 1000, regParam is searched over seven values.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [1000])
    .addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5])
    .build()
)

# Model selection uses its own evaluator with an explicit metric, rather than
# `bceval`, whose metric depends on which earlier cells have been executed.
# areaUnderPR is the metric `bceval` carries when the notebook runs top to bottom.
cv_evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

# 5-fold cross-validation; the seed fixes the fold assignment so the selected
# model is the same from run to run.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=cv_evaluator,
    numFolds=5,
    seed=42,
)
cvmodel = cv.fit(adulttrain)

In [27]:
# Parameters of the best model found by cross-validation.
print(cvmodel.bestModel.coefficients)
print(cvmodel.bestModel.intercept)
# NOTE(review): the two lines below read the hyper-parameters through the
# private `_java_obj` wrapper, which is not public API and may break between
# PySpark versions - confirm whether public getters are available.
print(cvmodel.bestModel._java_obj.getMaxIter())
print(cvmodel.bestModel._java_obj.getRegParam())

[0.021010418025111575,0.00031113117841078845,0.0006366158462145024,8.027898761803586e-07,0.031113237920317213,-0.7380206277809533,-0.40784482078278894,-0.8679958909258415,-0.4631789674644593,-0.6504061833023447,-0.28017925176292285,0.1923882331162061,-0.6367680902368058,-4.795772875310172,-0.6200014619084054,-0.21910621219260204,0.6085969256179298,1.038421461162468,-0.059012493836538726,-1.3578824956750293,0.02792072873604022,-1.6084929059047857,-1.8982466976944345,1.5797216112416999,-1.7059331760886811,-0.9114244814608924,1.6895008768161632,-1.7782854668110104,-2.307980465653427,-5.774520085045608,1.2362370528182014,-1.446473691886886,-0.9780983768826562,-1.0424515544860655,-0.9551354050315146,-1.0862725600177865,1.5639381065787028,-0.0008397921014162724,-0.11005969616405281,0.4912586632813959,-0.24097150966284112,0.05242779400191432,-1.1076323158484185,-0.5588343294940479,-0.3374981743764733,-0.9728025850923292,-1.173850924112299,0.33751686953910687,0.2615223260505519,-2.618892799013

In [28]:
# Score the best cross-validated model on the validation set with the
# evaluator's default metric.
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))

0.9039037104278022

In [29]:
# Same model and data, scored by area under the precision-recall curve.
BinaryClassificationEvaluator().setMetricName("areaUnderPR").evaluate(cvmodel.bestModel.transform(adultvalid))

0.7571472121238862

In [30]:
# Same model and data, scored by area under the ROC curve with the metric
# named explicitly.
BinaryClassificationEvaluator().setMetricName("areaUnderROC").evaluate(cvmodel.bestModel.transform(adultvalid))

0.9039037104277987