# PySpark Cookbook

### Tomasz Drabas, Denny Lee
#### Version: 0.1
#### Date: 2/28/2018

# Loading the data

In [62]:
import pyspark.sql.functions as func
census_path = '../data/census_income.csv'

census = spark.read.csv(
    census_path
    , header=True
    , inferSchema=True
)

for col, typ in census.dtypes:
    if typ == 'string':
        census = census.withColumn(
            col
            , func.ltrim(func.rtrim(census[col]))
        )
census.count()

32561

In [63]:
census.show()

+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+-----+
|age|       workclass|fnlwgt|   education|education-num|      marital-status|       occupation| relationship|              race|   sex|capital-gain|capital-loss|hours-per-week|native-country|label|
+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+-----+
| 39|       State-gov| 77516|   Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|        2174|           0|            40| United-States|<=50K|
| 50|Self-emp-not-inc| 83311|   Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|           0|           0|            13| United-States|<=50K|
| 38|     

In [64]:
census.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- label: string (nullable = true)

# Exploring data

## Data prep

List of columns to keep

In [65]:
cols_to_keep = census.dtypes

cols_to_keep = (
    ['label','age'
     ,'capital-gain'
     ,'capital-loss'
     ,'hours-per-week'
    ] + [
        e[0] for e in cols_to_keep[:-1] 
        if e[1] == 'string'
    ]
)

cols_to_keep

['label', 'age', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Get numeric and categorical columns

In [66]:
import pyspark.mllib.stat as st
import numpy as np

census_subset = census.select(cols_to_keep)

cols_num = [
    e[0] for e in census_subset.dtypes 
    if e[1] == 'int'
]
cols_cat = [
    e[0] for e in census_subset.dtypes[1:] 
    if e[1] == 'string'
]
cols_num, cols_cat

(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])

## Numerical data

In [67]:
rdd_num = (
    census_subset
    .select(cols_num)
    .rdd
    .map(lambda row: [e for e in row])
)

stats_num = st.Statistics.colStats(rdd_num)

for col, min_, mean_, max_, var_ in zip(
      cols_num
    , stats_num.min()
    , stats_num.mean()
    , stats_num.max()
    , stats_num.variance()
):
    print('{0}: min->{1:.1f}, mean->{2:.1f}, max->{3:.1f}, stdev->{4:.1f}'
          .format(col, min_, mean_, max_, np.sqrt(var_)))

age: min->17.0, mean->38.6, max->90.0, stdev->13.6
capital-gain: min->0.0, mean->1077.6, max->99999.0, stdev->7385.3
capital-loss: min->0.0, mean->87.3, max->4356.0, stdev->403.0
hours-per-week: min->1.0, mean->40.4, max->99.0, stdev->12.3

## Categorical data

In [68]:
rdd_cat = (
    census_subset
    .select(cols_cat + ['label'])
    .rdd
    .map(lambda row: [e for e in row])
)

results_cat = {}

for i, col in enumerate(cols_cat + ['label']):
    results_cat[col] = (
        rdd_cat
        .groupBy(lambda row: row[i])
        .map(lambda el: (el[0], len(el[1])))
        .collect()
    )

for k in results_cat:
    print(
        k
        , sorted(
            results_cat[k]
            , key=lambda el: el[1]
            , reverse=True)
        , '\n')

sex [('Male', 21790), ('Female', 10771)] 

race [('White', 27816), ('Black', 3124), ('Asian-Pac-Islander', 1039), ('Amer-Indian-Eskimo', 311), ('Other', 271)] 

label [('<=50K', 24720), ('>50K', 7841)] 

native-country [('United-States', 29170), ('Mexico', 643), ('?', 583), ('Philippines', 198), ('Germany', 137), ('Canada', 121), ('Puerto-Rico', 114), ('El-Salvador', 106), ('India', 100), ('Cuba', 95), ('England', 90), ('Jamaica', 81), ('South', 80), ('China', 75), ('Italy', 73), ('Dominican-Republic', 70), ('Vietnam', 67), ('Guatemala', 64), ('Japan', 62), ('Poland', 60), ('Columbia', 59), ('Taiwan', 51), ('Haiti', 44), ('Iran', 43), ('Portugal', 37), ('Nicaragua', 34), ('Peru', 31), ('France', 29), ('Greece', 29), ('Ecuador', 28), ('Ireland', 24), ('Hong', 20), ('Trinadad&Tobago', 19), ('Cambodia', 19), ('Laos', 18), ('Thailand', 18), ('Yugoslavia', 16), ('Outlying-US(Guam-USVI-etc)', 14), ('Hungary', 13), ('Honduras', 13), ('Scotland', 12), ('Holand-Netherlands', 1)] 

marital-statu

## Correlations

In [69]:
correlations = st.Statistics.corr(rdd_num)
correlations

array([[ 1.        ,  0.0776745 ,  0.05777454,  0.06875571],
       [ 0.0776745 ,  1.        , -0.03161506,  0.07840862],
       [ 0.05777454, -0.03161506,  1.        ,  0.05425636],
       [ 0.06875571,  0.07840862,  0.05425636,  1.        ]])

In [70]:
for i, el_i in enumerate(abs(correlations) > 0.05):
    print(cols_num[i])
    
    for j, el_j in enumerate(el_i):
        if el_j and j != i:
            print(
                '    '
                , cols_num[j]
                , correlations[i][j]
            )
            
    print()

age
     capital-gain 0.077674498166
     capital-loss 0.057774539479
     hours-per-week 0.0687557075095

capital-gain
     age 0.077674498166
     hours-per-week 0.0784086153901

capital-loss
     age 0.057774539479
     hours-per-week 0.0542563622727

hours-per-week
     age 0.0687557075095
     capital-gain 0.0784086153901
     capital-loss 0.0542563622727

# Statistical testing

In [71]:
import pyspark.mllib.linalg as ln

census_occupation = (
    census
    .groupby('occupation')
    .pivot('label')
    .count()
)

census_occupation_coll = (
    census_occupation
    .rdd
    .map(lambda row: (row[1:]))
    .flatMap(lambda row: row)
    .collect()
)

len_row = len(census_occupation.collect())
dense_mat = ln.DenseMatrix(
    len_row
    , 2
    , census_occupation_coll
    , True
)

chi_sq = st.Statistics.chiSqTest(dense_mat)

print(chi_sq.pValue)
print(chi_sq.nullHypothesis)

0.0
the occurrence of the outcomes is statistically independent.

In [72]:
dense_mat.toArray()

array([[  2.66700000e+03,   9.83000000e+02],
       [  2.09800000e+03,   1.96800000e+03],
       [  2.28100000e+03,   1.85900000e+03],
       [  1.28400000e+03,   8.60000000e+01],
       [  8.79000000e+02,   1.15000000e+02],
       [  3.17000000e+03,   9.29000000e+02],
       [  1.27700000e+03,   3.20000000e+02],
       [  1.48000000e+02,   1.00000000e+00],
       [  4.38000000e+02,   2.11000000e+02],
       [  3.15800000e+03,   1.37000000e+02],
       [  6.45000000e+02,   2.83000000e+02],
       [  1.75200000e+03,   2.50000000e+02],
       [  8.00000000e+00,   1.00000000e+00],
       [  1.65200000e+03,   1.91000000e+02],
       [  3.26300000e+03,   5.07000000e+02]])

# Transforming the data

Number of distinct values

In [73]:
len_ftrs = []

for col in cols_cat:
    (
        len_ftrs
        .append(
            (col
             , census
                 .select(col)
                 .distinct()
                 .count()
            )
        )
    )
    
len_ftrs = dict(len_ftrs)
len_ftrs

{'sex': 2, 'race': 5, 'native-country': 42, 'marital-status': 7, 'workclass': 9, 'education': 16, 'occupation': 15, 'relationship': 6}

Using hashing trick

In [74]:
import pyspark.mllib.feature as feat

final_data = (
    census
    .select(cols_to_keep)
    .rdd
    .map(lambda row: [
        list(
            feat.HashingTF(int(len_ftrs[col] / 2.0))
            .transform(row[i])
            .toArray()
        ) if i >= 5
        else [row[i]] 
        for i, col in enumerate(cols_to_keep)]
    )
)

final_data.take(3)

[[['<=50K'], [39], [2174], [0], [40], [1.0, 2.0, 1.0, 5.0], [3.0, 3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0], [2.0, 3.0, 8.0], [0.0, 3.0, 3.0, 1.0, 4.0, 1.0, 0.0], [5.0, 5.0, 3.0], [3.0, 2.0], [4.0], [1.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]], [['<=50K'], [50], [0], [0], [13], [4.0, 3.0, 1.0, 8.0], [3.0, 3.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0], [5.0, 5.0, 8.0], [0.0, 1.0, 2.0, 2.0, 8.0, 1.0, 1.0], [4.0, 2.0, 1.0], [3.0, 2.0], [4.0], [1.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]], [['<=50K'], [38], [0], [0], [40], [2.0, 2.0, 0.0, 3.0], [2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0], [3.0, 2.0, 3.0], [2.0, 3.0, 1.0, 3.0, 7.0, 0.0, 1.0], [5.0, 5.0, 3.0], [3.0, 2.0], [4.0], [1.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0]]]

Encode label

In [75]:
def labelEncode(label):
    return [int(label[0] == '>50K')]

final_data = (
    final_data
    .map(lambda row: labelEncode(row[0]) 
         + [item 
            for sublist in row[1:] 
            for item in sublist]
        )
)

# Standardizing data

In [76]:
standardizer = feat.StandardScaler(True, True)
sModel = standardizer.fit(final_data.map(lambda row: row[1:]))
final_data_scaled = sModel.transform(final_data.map(lambda row: row[1:]))

final_data = (
    final_data
    .map(lambda row: row[0])
    .zipWithIndex()
    .map(lambda row: (row[1], row[0]))
    .join(
        final_data_scaled
        .zipWithIndex()
        .map(lambda row: (row[1], row[0]))
    )
    .map(lambda row: row[1])
)

final_data.take(3)

[(0, DenseVector([0.0307, 0.1485, -0.2167, -0.0354, -1.2635, 0.008, 1.7796, 1.0001, 0.83, 0.5743, -0.3473, -0.443, 0.6826, -0.4007, -0.3862, -0.4685, -1.1369, -0.4555, 0.4551, -1.1329, 2.0776, 1.8713, -1.0381, -0.3381, -0.2381, -0.775, 0.9805, 1.5207, 0.3083, -0.0634, -0.2574, -0.7031, 0.3208, -0.0901, -0.1263, 0.3355, -0.1223, 0.3378, -0.0853, -0.0937, -0.0887, -0.2104, 0.0, 0.1286, -0.1976, -0.1433, -0.1419, 0.1895, 0.298, 0.2896, 0.1638, 0.1221, 0.0])), (0, DenseVector([-0.0426, -0.1459, -0.2167, -0.0354, -0.2368, 0.008, -0.5593, -0.2503, -0.2076, -0.1937, -0.3473, -0.443, -0.9076, -0.4007, 1.1933, -0.4685, -0.4505, -1.1655, -2.1192, 0.935, 2.0776, -0.5029, 0.383, 1.0477, -1.306, 1.0328, 0.9805, 1.5207, 0.3083, -0.0634, -0.2574, -0.7031, 0.3208, -0.0901, -0.1263, 0.3355, -0.1223, 0.3378, -0.0853, -0.0937, -0.0887, -0.2104, 0.0, 0.1286, -0.1976, -0.1433, -0.1419, 0.1895, 0.298, 0.2896, 0.1638, 0.1221, 0.0])), (0, DenseVector([-0.7758, -0.1459, -0.2167, -0.0354, -0.2368, 0.008, -0.559

In [77]:
final_data.take(1)

[(0, DenseVector([0.0307, 0.1485, -0.2167, -0.0354, -1.2635, 0.008, 1.7796, 1.0001, 0.83, 0.5743, -0.3473, -0.443, 0.6826, -0.4007, -0.3862, -0.4685, -1.1369, -0.4555, 0.4551, -1.1329, 2.0776, 1.8713, -1.0381, -0.3381, -0.2381, -0.775, 0.9805, 1.5207, 0.3083, -0.0634, -0.2574, -0.7031, 0.3208, -0.0901, -0.1263, 0.3355, -0.1223, 0.3378, -0.0853, -0.0937, -0.0887, -0.2104, 0.0, 0.1286, -0.1976, -0.1433, -0.1419, 0.1895, 0.298, 0.2896, 0.1638, 0.1221, 0.0]))]

In [78]:
sModel.mean, sModel.std

(DenseVector([38.5816, 1077.6488, 87.3038, 40.4375, 2.2307, 1.9942, 0.2391, 3.4004, 2.2001, 2.2522, 0.1836, 0.2528, 0.5707, 0.1384, 1.2445, 1.5914, 3.6564, 3.6415, 7.1161, 1.0957, 0.838, 1.4236, 2.461, 4.7319, 1.223, 0.4287, 3.8206, 2.8271, 2.472, 3.0508, 2.4882, 4.6616, 0.9066, 0.008, 0.0157, 2.7153, 0.0157, 0.8967, 0.0072, 0.0108, 0.0078, 0.0432, 0.0, 1.9416, 0.0445, 0.0238, 0.0197, 0.9444, 0.9136, 1.8534, 0.9618, 0.964, 0.0]), DenseVector([13.6404, 7385.2921, 402.9602, 12.3474, 0.974, 0.722, 0.4276, 1.5994, 0.9637, 1.3021, 0.5285, 0.5706, 0.6288, 0.3453, 0.6331, 1.2623, 1.457, 1.4084, 1.9423, 0.9671, 1.0407, 0.8424, 1.4074, 2.1649, 0.9365, 0.5532, 1.2029, 1.4289, 1.7124, 0.8013, 1.8969, 0.941, 0.2911, 0.0893, 0.1243, 0.8486, 0.1281, 0.3057, 0.0846, 0.1151, 0.088, 0.2055, 0.0, 0.4543, 0.2251, 0.1658, 0.1391, 0.2935, 0.29, 0.5062, 0.2331, 0.2945, 0.0]))

# Creating an RDD for training

In [79]:
import pyspark.mllib.regression as reg

final_data_income = (
    final_data
    .map(lambda row: reg.LabeledPoint(
        row[0]
        , row[1]
        )
    )
)
final_data_income.take(2)

[LabeledPoint(0.0, [0.03067008638,0.148450615588,-0.216656200028,-0.0354289029213,-1.26349550696,0.00799647373215,1.77958633578,1.0001208369,0.830011803082,0.574318091225,-0.347301166352,-0.442992960562,0.682621408268,-0.400707851562,-0.386227594906,-0.46854980554,-1.13687358292,-0.455511851733,0.455078852767,-1.13291606435,2.07757549683,1.87129751533,-1.03808682171,-0.338080942511,-0.238127083048,-0.775013916071,0.980455418851,1.52071905613,0.308340936417,-0.0633904612925,-0.257367850576,-0.703060548727,0.320779847155,-0.0900636281825,-0.126266971505,0.335477418132,-0.122303578935,0.337810645307,-0.085261168002,-0.0936707494514,-0.0886669711795,-0.210415412747,0.0,0.12864878811,-0.197570816433,-0.143327008532,-0.141932215584,0.189518952773,0.297980479635,0.289626591195,0.16375112207,0.122095826183,0.0]), LabeledPoint(0.0, [-0.042641371748,-0.145918242817,-0.216656200028,-0.0354289029213,-0.236826580974,0.00799647373215,-0.559267778422,-0.250332636089,-0.207614485797,-0.193695846447,-0

In [80]:
mu, std = sModel.mean[3], sModel.std[3]

final_data_hours = (
    final_data
    .map(lambda row: reg.LabeledPoint(
        row[1][3] * std + mu
        , ln.Vectors.dense([row[0]] + list(row[1][0:3]) + list(row[1][4:]))
        )
    )
)
final_data_hours.take(2)

[LabeledPoint(40.0, [0.0,0.03067008638,0.148450615588,-0.216656200028,-1.26349550696,0.00799647373215,1.77958633578,1.0001208369,0.830011803082,0.574318091225,-0.347301166352,-0.442992960562,0.682621408268,-0.400707851562,-0.386227594906,-0.46854980554,-1.13687358292,-0.455511851733,0.455078852767,-1.13291606435,2.07757549683,1.87129751533,-1.03808682171,-0.338080942511,-0.238127083048,-0.775013916071,0.980455418851,1.52071905613,0.308340936417,-0.0633904612925,-0.257367850576,-0.703060548727,0.320779847155,-0.0900636281825,-0.126266971505,0.335477418132,-0.122303578935,0.337810645307,-0.085261168002,-0.0936707494514,-0.0886669711795,-0.210415412747,0.0,0.12864878811,-0.197570816433,-0.143327008532,-0.141932215584,0.189518952773,0.297980479635,0.289626591195,0.16375112207,0.122095826183,0.0]), LabeledPoint(40.0, [0.0,-0.042641371748,-0.145918242817,-0.216656200028,-0.236826580974,0.00799647373215,-0.559267778422,-0.250332636089,-0.207614485797,-0.193695846447,-0.347301166352,-0.4429929

# Splitting into training and testing

In [81]:
(
    final_data_income_train
    , final_data_income_test
) = (
    final_data_income.randomSplit([0.7, 0.3])
)

In [82]:
(
    final_data_hours_train
    , final_data_hours_test
) = (
    final_data_hours.randomSplit([0.7, 0.3])
)

# Predicting hours of work for census respondents

Linear regression (benchmark)

In [83]:
workhours_model_lm = reg.LinearRegressionWithSGD.train(final_data_hours_train)



In [84]:
small_sample_hours = sc.parallelize(final_data_hours_test.take(10))

for t,p in zip(
    small_sample_hours
        .map(lambda row: row.label)
        .collect()
    , workhours_model_lm.predict(
        small_sample_hours
            .map(lambda row: row.features)
    ).collect()):
    print(t,p)

50.0 45.9023510548
30.0 0.314537517465
50.0 -2.48376636865
15.0 1.53025028946
40.0 8.01134007899
43.0 1.54798295675
50.0 -7.27390674472
38.0 -0.808751734677
48.0 -1.42157623665
40.0 -9.26148064215

In [85]:
workhours_model_lm.weights

DenseVector([53.793, -3.4514, -3.271, -1.9501, 0.1113, -0.8569, 0.0895, 1.5276, -2.755, 1.5095, 1.2773, -1.2846, -0.795, 1.5652, 3.5236, 1.5256, -0.8454, -1.1562, -1.0639, 0.5737, -0.4507, -0.1431, -1.7297, 1.4213, 1.5768, -1.5884, 4.26, -0.1054, 4.0529, -0.6732, 0.7465, 0.0164, -0.0078, -0.2661, 0.3984, -0.0136, -0.0142, 0.0518, -0.441, -0.0189, 0.1313, 0.9807, 0.0, -0.301, 1.1183, -0.3137, 0.1066, 0.1851, 0.1046, 0.2166, 0.2525, 0.2499, 0.0])

# Forecasting income levels of census respondents

Logistic regression

In [86]:
import pyspark.mllib.classification as cl

income_model_lr = cl.LogisticRegressionWithSGD.train(final_data_income_train)



In [87]:
small_sample_income = sc.parallelize(final_data_income_test.take(10))

for t,p in zip(
    small_sample_income
        .map(lambda row: row.label)
        .collect()
    , income_model_lr.predict(
        small_sample_income
            .map(lambda row: row.features)
    ).collect()):
    print(t,p)

0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 0
0.0 1
0.0 0
0.0 1

In [88]:
income_model_lr.threshold

0.5

In [89]:
income_model_lr.weights

DenseVector([0.236, 0.7628, 0.2321, 0.2129, 0.0594, 0.1149, -0.0103, -0.1193, 0.2547, -0.0603, -0.1274, 0.0879, 0.0011, -0.0314, -0.2971, -0.1015, 0.2153, 0.2254, 0.019, -0.02, 0.0304, -0.0272, 0.058, -0.0337, -0.0732, 0.1469, -0.1255, -0.0943, -0.164, 0.0358, -0.0447, -0.0872, -0.0007, 0.0, -0.016, -0.0023, 0.0018, -0.0056, 0.0046, -0.0052, 0.0117, -0.0255, 0.0, 0.011, -0.0537, -0.0049, -0.0549, 0.0066, -0.0076, 0.0079, 0.0305, -0.0332, 0.0])

Support Vector Machines

In [90]:
income_model_svm = cl.SVMWithSGD.train(
    final_data_income
#     , step=0.95
    , miniBatchFraction=1/2.0
)

In [91]:
for t,p in zip(
    small_sample_income
        .map(lambda row: row.label)
        .collect()
    , income_model_svm.predict(
        small_sample_income
            .map(lambda row: row.features)
    ).collect()):
    print(t,p)

0.0 1
0.0 0
0.0 0
0.0 0
0.0 1
0.0 0
0.0 1
0.0 1
0.0 0
0.0 1

In [92]:
income_model_svm.weights

DenseVector([0.1301, 1.0672, 0.1607, 0.0979, 0.0426, 0.084, -0.0115, -0.0772, 0.1667, -0.1179, -0.0835, 0.0359, -0.0265, -0.0687, -0.1805, -0.0576, 0.4046, 0.2522, 0.0732, -0.0664, 0.0039, -0.0032, 0.0656, -0.0723, -0.0431, 0.0966, -0.1211, -0.04, -0.1584, 0.0206, -0.0343, -0.0386, 0.0121, -0.0044, 0.0074, 0.0041, 0.0119, 0.0001, 0.0066, -0.0049, 0.0065, -0.0118, 0.0, 0.0028, -0.0656, 0.0057, -0.1175, 0.0053, -0.0114, 0.0086, 0.0019, -0.0241, 0.0])

# Building clustering models

In [93]:
import pyspark.mllib.clustering as clu

model_km = clu.KMeans.train(
    final_data.map(lambda row: row[1])
    , 2
    , initializationMode='random'
    , seed=666
)

In [94]:
import sklearn.metrics as m

predicted = (
    model_km
        .predict(
            final_data.map(lambda row: row[1])
        )
)
predicted = predicted.collect()

true = final_data.map(lambda row: row[0]).collect()

print(m.homogeneity_score(true, predicted))
print(m.completeness_score(true, predicted))

0.153472872815
0.122339061021

# Computing performance statistics

In [95]:
import pyspark.mllib.evaluation as ev

Regression metrics

In [96]:
true_pred_reg = (
    final_data_hours_test
    .map(lambda row: (
         float(workhours_model_lm.predict(row.features))
         , row.label))
)

metrics_lm = ev.RegressionMetrics(true_pred_reg)

In [97]:
print('R^2: ', metrics_lm.r2)
print('Explained Variance: ', metrics_lm.explainedVariance)
print('meanAbsoluteError: ', metrics_lm.meanAbsoluteError)

R^2:  -6.754451242767173
Explained Variance:  1145.7421086452416
meanAbsoluteError:  29.866629018908615

Classification metrics

In [104]:
true_pred_class_lr = (
    final_data_income_test
    .map(lambda row: (
        float(income_model_lr.predict(row.features))
        , row.label))
)

metrics_lr = ev.BinaryClassificationMetrics(true_pred_class_lr)

print('areaUnderPR: ', metrics_lr.areaUnderPR)
print('areaUnderROC: ', metrics_lr.areaUnderROC)

areaUnderPR:  0.7050195379236808
areaUnderROC:  0.7951114398791014

In [99]:
trainErr = (
    true_pred_class_lr
    .filter(lambda lp: lp[0] != lp[1]).count() 
    / float(true_pred_class_lr.count())
)
print("Training Error = " + str(trainErr))

Training Error = 0.26766639276910437

In [105]:
true_pred_class_svm = (
    final_data_income_test
    .map(lambda row: (
        float(income_model_svm.predict(row.features))
        , row.label))
)

metrics_svm = ev.BinaryClassificationMetrics(true_pred_class_svm)

print('areaUnderPR: ', metrics_svm.areaUnderPR)
print('areaUnderROC: ', metrics_svm.areaUnderROC)

areaUnderPR:  0.6911561138639338
areaUnderROC:  0.7794154787088152

In [101]:
trainErr = (
    true_pred_class_svm
    .filter(lambda lp: lp[0] != lp[1]).count() 
    / float(true_pred_class_svm.count())
)

print("Training Error = " + str(trainErr))

Training Error = 0.28358668857847164

Confusion matrix

In [102]:
(
    true_pred_class_lr
    .map(lambda el: ((el), 1))
    .reduceByKey(lambda x,y: x+y)
    .take(4)
)

[((0.0, 0.0), 4967), ((1.0, 1.0), 2163), ((0.0, 1.0), 196), ((1.0, 0.0), 2410)]

In [103]:
(
    true_pred_class_svm
    .map(lambda el: ((el), 1))
    .reduceByKey(lambda x,y: x+y)
    .take(4)
)

[((0.0, 0.0), 4848), ((1.0, 1.0), 2127), ((0.0, 1.0), 232), ((1.0, 0.0), 2529)]