Import and initiate findspark
Then import pyspark

In [1]:
import findspark
findspark.init("/usr/local/spark")

In [2]:
import pyspark

Instatiate SparkSession with Hive support

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Check point 3") \
        .config("spark.sql.warehouse.dir", "hdfs://localhost:54310/user/hive/warehouse") \
        .enableHiveSupport() \
        .getOrCreate()

# On some clusters the following config setting may be requied
#         .config("hive.metastore.uris", "<value>") 

spark.sql('create database capstone')

In [4]:
# Testing Hive integration
spark.sql('show databases').show()

+------------+
|databaseName|
+------------+
|    capstone|
|     default|
|        nyse|
|      office|
+------------+



In [5]:
df_cars = spark.read.load("/home/hduser/Downloads/sharedfolder/df_cars_merged.csv", format="csv", sep=",", inferSchema="true", header="true")
df_cars.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- selling_price: integer (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- StateorProvince: string (nullable = true)
 |-- City: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- mileage: double (nullable = true)
 |-- engine: integer (nullable = true)
 |-- max_power: double (nullable = true)
 |-- seats: integer (nullable = true)
 |-- sold: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- brand: string (nullable = true)



In [6]:
spark.sql('use capstone')

DataFrame[]

In [7]:
df_cars.write.mode("append").saveAsTable("capstone.dfcars")

In [8]:
spark.sql('select * from capstone.dfcars').show(truncate=False)

+---+----+-------------+---------+--------------------+----------+------+-----------+------------+------------+-------+------+---------+-----+----+------+----------+
|_c0|year|selling_price|km_driven|StateorProvince     |City      |fuel  |seller_type|transmission|owner       |mileage|engine|max_power|seats|sold|Region|brand     |
+---+----+-------------+---------+--------------------+----------+------+-----------+------------+------------+-------+------+---------+-----+----+------+----------+
|0  |2014|450000       |145500   |District of Columbia|Washington|Diesel|Individual |Manual      |First Owner |23.4   |1248  |74.0     |5    |Y   |East  |MARUTI    |
|1  |2019|1149000      |5000     |District of Columbia|Washington|Petrol|Individual |Manual      |First Owner |17.0   |1591  |121.3    |5    |Y   |East  |HYUNDAI   |
|2  |2017|600000       |25000    |District of Columbia|Washington|Petrol|Individual |Manual      |Third Owner |18.16  |1196  |86.8     |5    |Y   |East  |FORD      |
|3  

In [9]:
spark.sql('show tables from capstone').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|capstone|  df_cars|      false|
|capstone|   dfcars|      false|
+--------+---------+-----------+



In [10]:
spark.sql('use capstone')

DataFrame[]

### Profiling

In [11]:
df_cars.dtypes

[('_c0', 'int'),
 ('year', 'int'),
 ('selling_price', 'int'),
 ('km_driven', 'int'),
 ('StateorProvince', 'string'),
 ('City', 'string'),
 ('fuel', 'string'),
 ('seller_type', 'string'),
 ('transmission', 'string'),
 ('owner', 'string'),
 ('mileage', 'double'),
 ('engine', 'int'),
 ('max_power', 'double'),
 ('seats', 'int'),
 ('sold', 'string'),
 ('Region', 'string'),
 ('brand', 'string')]

In [12]:
numerical_features=['year','selling_price','km_driven','mileage','engine','max_power','seats']

In [13]:
df_cars.describe(numerical_features).show()

+-------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+
|summary|              year|    selling_price|        km_driven|           mileage|            engine|        max_power|             seats|
+-------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+
|  count|              7906|             7906|             7906|              7906|              7906|             7906|              7906|
|   mean|2013.9839362509485| 649813.720844928|69188.65975208703|19.419860865165695|1458.7088287376675|91.58737351378637|5.4163926132051605|
| stddev|3.8636953387034967|813582.7483541325|56792.29634331763| 4.036263200758886|  503.893056850139|35.74721608448376|0.9592082121984603|
|    min|              1994|            29999|                1|               0.0|               624|             32.8|                 2|
|    max|           

### Checking for missing values

In [14]:
from pyspark.sql.functions import isnan, when, count, col
df_cars.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_cars.columns]).show()

+---+----+-------------+---------+---------------+----+----+-----------+------------+-----+-------+------+---------+-----+----+------+-----+
|_c0|year|selling_price|km_driven|StateorProvince|City|fuel|seller_type|transmission|owner|mileage|engine|max_power|seats|sold|Region|brand|
+---+----+-------------+---------+---------------+----+----+-----------+------------+-----+-------+------+---------+-----+----+------+-----+
|  0|   0|            0|        0|              0|   0|   0|          0|           0|    0|      0|     0|        0|    0|   0|     0|    0|
+---+----+-------------+---------+---------------+----+----+-----------+------------+-----+-------+------+---------+-----+----+------+-----+



In [15]:
def calculate_bounds(df):
    bounds = {
        c: dict(
            zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0))
        )
        for c,d in zip(df.columns, df.dtypes) if d[1] == "int" or d[1]=="double"
    }

    for c in bounds:
        iqr = bounds[c]['q3'] - bounds[c]['q1']
        bounds[c]['min'] = bounds[c]['q1'] - (iqr * 1.5)
        bounds[c]['max'] = bounds[c]['q3'] + (iqr * 1.5)

    return bounds

In [16]:
calculate_bounds(df_cars)

{'_c0': {'max': 11859.5, 'min': -3952.5, 'q1': 1977.0, 'q3': 5930.0},
 'engine': {'max': 2159.5, 'min': 619.5, 'q1': 1197.0, 'q3': 1582.0},
 'km_driven': {'max': 187500.0, 'min': -56500.0, 'q1': 35000.0, 'q3': 96000.0},
 'max_power': {'max': 152.925,
  'min': 17.124999999999993,
  'q1': 68.05,
  'q3': 102.0},
 'mileage': {'max': 30.63, 'min': 8.470000000000002, 'q1': 16.78, 'q3': 22.32},
 'seats': {'max': 5.0, 'min': 5.0, 'q1': 5.0, 'q3': 5.0},
 'selling_price': {'max': 1320000.0,
  'min': -360000.0,
  'q1': 270000.0,
  'q3': 690000.0},
 'year': {'max': 2024.5, 'min': 2004.5, 'q1': 2012.0, 'q3': 2017.0}}

In [17]:
import pyspark.sql.functions as f
def flag_outliers(df, id_col):
    bounds = calculate_bounds(df)
    outliers = {}

    return df.select(c, id_col,
        *[
            f.when(
                ~f.col(c).between(bounds[c]['min'], bounds[c]['max']),"yes"
            ).otherwise("no").alias(c+'_outlier')
        ]
    )

In [18]:
for c in numerical_features:
    flag_outliers(df_cars,c).show()

+----+----+------------+
|year|year|year_outlier|
+----+----+------------+
|2014|2014|          no|
|2019|2019|          no|
|2017|2017|          no|
|2016|2016|          no|
|2015|2015|          no|
|2018|2018|          no|
|2009|2009|          no|
|2010|2010|          no|
|2019|2019|          no|
|2014|2014|          no|
|2015|2015|          no|
|2019|2019|          no|
|2013|2013|          no|
|2018|2018|          no|
|2010|2010|          no|
|2016|2016|          no|
|2011|2011|          no|
|2016|2016|          no|
|2015|2015|          no|
|1999|1999|         yes|
+----+----+------------+
only showing top 20 rows

+-------------+-------------+---------------------+
|selling_price|selling_price|selling_price_outlier|
+-------------+-------------+---------------------+
|       450000|       450000|                   no|
|      1149000|      1149000|                   no|
|       600000|       600000|                   no|
|       540000|       540000|                   no|
|       63

### We observe outliers in the dataset and therefore they need to be handled

### Correlation co-efficients

In [19]:
from pyspark.ml.stat import Correlation
print('KMs driven :',df_cars.stat.corr('selling_price','km_driven'))
print('year :',df_cars.stat.corr('selling_price','year'))
print('engine :',df_cars.stat.corr('selling_price','engine'))
print('seats :',df_cars.stat.corr('selling_price','seats'))
print('max_power :',df_cars.stat.corr('selling_price','max_power'))
print('mileage :',df_cars.stat.corr('selling_price','mileage'))

KMs driven : -0.22215847533483693
year : 0.41230155817117
engine : 0.4556818000356126
seats : 0.041616693830263764
max_power : 0.7496737800444876
mileage : -0.12627994951355292


### Covariance co-efficients

In [20]:
#from pyspark.ml.stat import Covariance
print('KMs driven :',df_cars.stat.cov('selling_price','km_driven'))
print('year :',df_cars.stat.cov('selling_price','year'))
print('engine :',df_cars.stat.cov('selling_price','engine'))
print('seats :',df_cars.stat.cov('selling_price','seats'))
print('max_power :',df_cars.stat.cov('selling_price','max_power'))
print('mileage :',df_cars.stat.cov('selling_price','mileage'))

KMs driven : -10264884014.541874
year : 1296043.508228653
engine : 186810717.476202
seats : 32477.47033251092
max_power : 21803001.172212917
mileage : -414682.4053633221


### CROSS-TABULATION FOR BETTER ANALYSIS

In [21]:
print(df_cars.stat.crosstab('selling_price','fuel').show())

+------------------+---+------+---+------+
|selling_price_fuel|CNG|Diesel|LPG|Petrol|
+------------------+---+------+---+------+
|            978999|  0|     1|  0|     0|
|             68000|  0|     0|  0|     1|
|            600000|  0|   124|  0|    87|
|             31504|  0|     0|  0|     1|
|            105000|  0|     1|  1|     5|
|            537000|  0|     0|  0|     1|
|            499000|  0|     2|  0|     2|
|            715000|  0|     5|  0|     0|
|            403000|  0|     1|  0|     0|
|            541000|  0|     1|  0|     0|
|           1789999|  0|     1|  0|     0|
|            515000|  0|    10|  0|     7|
|            565000|  0|     3|  0|     7|
|            151000|  0|     1|  0|     0|
|           1025000|  0|     7|  0|     2|
|           1898999|  0|     1|  0|     0|
|            785000|  0|     0|  0|     1|
|           1051000|  0|     1|  0|     0|
|           1040000|  0|     0|  0|     1|
|           3300000|  0|     1|  0|     0|
+----------

In [22]:
print(df_cars.stat.crosstab('selling_price','seller_type').show())

+-------------------------+------+----------+----------------+
|selling_price_seller_type|Dealer|Individual|Trustmark Dealer|
+-------------------------+------+----------+----------------+
|                   978999|     0|         1|               0|
|                    68000|     0|         1|               0|
|                   600000|    43|       167|               1|
|                    31504|     0|         1|               0|
|                   105000|     0|         7|               0|
|                   537000|     0|         1|               0|
|                   499000|     1|         3|               0|
|                   715000|     0|         5|               0|
|                   403000|     0|         1|               0|
|                   541000|     0|         1|               0|
|                  1789999|     1|         0|               0|
|                   515000|     2|        15|               0|
|                   565000|     2|         8|          

In [23]:
print(df_cars.stat.crosstab('selling_price','transmission').show())

+--------------------------+---------+------+
|selling_price_transmission|Automatic|Manual|
+--------------------------+---------+------+
|                    978999|        0|     1|
|                     68000|        0|     1|
|                    600000|       44|   167|
|                     31504|        0|     1|
|                    105000|        0|     7|
|                    537000|        0|     1|
|                    499000|        1|     3|
|                    715000|        0|     5|
|                    403000|        0|     1|
|                    541000|        0|     1|
|                   1789999|        0|     1|
|                    515000|        1|    16|
|                    565000|        2|     8|
|                    151000|        0|     1|
|                   1025000|        6|     3|
|                   1898999|        1|     0|
|                    785000|        0|     1|
|                   1051000|        0|     1|
|                   1040000|      

In [24]:
print(df_cars.stat.crosstab('selling_price','owner').show())

+-------------------+-----------+--------------------+------------+--------------+-----------+
|selling_price_owner|First Owner|Fourth & Above Owner|Second Owner|Test Drive Car|Third Owner|
+-------------------+-----------+--------------------+------------+--------------+-----------+
|             978999|          1|                   0|           0|             0|          0|
|              68000|          0|                   0|           0|             0|          1|
|             600000|        170|                   1|          31|             0|          9|
|              31504|          0|                   0|           0|             0|          1|
|             105000|          1|                   1|           3|             0|          2|
|             537000|          1|                   0|           0|             0|          0|
|             499000|          3|                   0|           1|             0|          0|
|             715000|          4|                 

In [25]:
print(df_cars.stat.crosstab('selling_price','sold').show())

+------------------+---+---+
|selling_price_sold|  N|  Y|
+------------------+---+---+
|            978999|  1|  0|
|             68000|  1|  0|
|            600000|159| 52|
|             31504|  0|  1|
|            105000|  4|  3|
|            537000|  1|  0|
|            499000|  4|  0|
|            715000|  3|  2|
|            403000|  0|  1|
|            541000|  1|  0|
|           1789999|  1|  0|
|            515000| 14|  3|
|            565000|  7|  3|
|            151000|  1|  0|
|           1025000|  7|  2|
|           1898999|  1|  0|
|            785000|  1|  0|
|           1051000|  1|  0|
|           1040000|  0|  1|
|           3300000|  1|  0|
+------------------+---+---+
only showing top 20 rows

None


In [26]:
df_cars.stat.crosstab('selling_price','Region').show()

+--------------------+-------+----+-----+----+
|selling_price_Region|Central|East|South|West|
+--------------------+-------+----+-----+----+
|              978999|      0|   0|    1|   0|
|               68000|      0|   0|    0|   1|
|              600000|     75|  46|   38|  52|
|               31504|      0|   0|    1|   0|
|              105000|      1|   2|    2|   2|
|              537000|      0|   0|    0|   1|
|              499000|      0|   0|    1|   3|
|              715000|      0|   3|    2|   0|
|              403000|      1|   0|    0|   0|
|              541000|      0|   0|    0|   1|
|             1789999|      0|   0|    1|   0|
|              515000|      2|   6|    3|   6|
|              565000|      2|   2|    1|   5|
|              151000|      1|   0|    0|   0|
|             1025000|      1|   4|    2|   2|
|             1898999|      0|   0|    0|   1|
|              785000|      0|   1|    0|   0|
|             1051000|      1|   0|    0|   0|
|            

In [27]:
print(df_cars.stat.crosstab('selling_price','brand').show())

+-------------------+----------+-----+----+---+---------+------+------+----+-----+----+-----+-------+-----+------+----+---+----+-----+--------+------+-------------+---+----------+------+----+-------+-----+----+------+----------+-----+
|selling_price_brand|AMBASSADOR|ASHOK|AUDI|BMW|CHEVROLET|DAEWOO|DATSUN|FIAT|FORCE|FORD|HONDA|HYUNDAI|ISUZU|JAGUAR|JEEP|KIA|LAND|LEXUS|MAHINDRA|MARUTI|MERCEDES-BENZ| MG|MITSUBISHI|NISSAN|OPEL|RENAULT|SKODA|TATA|TOYOTA|VOLKSWAGEN|VOLVO|
+-------------------+----------+-----+----+---+---------+------+------+----+-----+----+-----+-------+-----+------+----+---+----+-----+--------+------+-------------+---+----------+------+----+-------+-----+----+------+----------+-----+
|             978999|         0|    0|   0|  0|        0|     0|     0|   0|    0|   0|    0|      1|    0|     0|   0|  0|   0|    0|       0|     0|            0|  0|         0|     0|   0|      0|    0|   0|     0|         0|    0|
|              68000|         0|    0|   0|  0|        0|   

In [28]:
features = ["year", "km_driven", "StateorProvince", "City","fuel", "seller-type", "transmission", "owner","mileage", "engine", "max_power", "seats", "sold","Region", "brand"]

In [30]:
#lr_data = df_cars.select(col("selling_price").alias("label"), *features)

In [None]:
lr_data = df_cars.select(col("selling_price").alias("label"), *features)
lr_data.printSchema()

### ENCODING OF DATA

In [43]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=column, outputCol=column+'_index').fit(df_cars) for column in list(set(df_cars.columns)-set(['selling_price','km_driven','year','_c0','mileage','engine','max_power','seats'])) ]

pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(df_cars).transform(df_cars)

df_r.show()

+---+----+-------------+---------+--------------------+----------+------+-----------+------------+------------+-------+------+---------+-----+----+------+----------+-----------+------------+----------+---------------------+----------+-----------+----------+-----------------+------------------+
|_c0|year|selling_price|km_driven|     StateorProvince|      City|  fuel|seller_type|transmission|       owner|mileage|engine|max_power|seats|sold|Region|     brand|brand_index|Region_index|fuel_index|StateorProvince_index|sold_index|owner_index|City_index|seller_type_index|transmission_index|
+---+----+-------------+---------+--------------------+----------+------+-----------+------------+------------+-------+------+---------+-----+----+------+----------+-----------+------------+----------+---------------------+----------+-----------+----------+-----------------+------------------+
|  0|2014|       450000|   145500|District of Columbia|Washington|Diesel| Individual|      Manual| First Owner|   2

In [46]:
df_r=df_r.drop("_c0","brand", "Region", "fuel", "StateorProvince", "sold", "owner" ,"City" ,"seller_type","transmission")

In [47]:
df_r.printSchema()

root
 |-- year: integer (nullable = true)
 |-- selling_price: integer (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- mileage: double (nullable = true)
 |-- engine: integer (nullable = true)
 |-- max_power: double (nullable = true)
 |-- seats: integer (nullable = true)
 |-- brand_index: double (nullable = true)
 |-- Region_index: double (nullable = true)
 |-- fuel_index: double (nullable = true)
 |-- StateorProvince_index: double (nullable = true)
 |-- sold_index: double (nullable = true)
 |-- owner_index: double (nullable = true)
 |-- City_index: double (nullable = true)
 |-- seller_type_index: double (nullable = true)
 |-- transmission_index: double (nullable = true)



### MODEL BUILDING

In [48]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [56]:
features=['year','km_driven','mileage','engine','max_power','seats','brand_index','Region_index','fuel_index','StateorProvince_index','sold_index','owner_index','City_index','seller_type_index','transmission_index']

In [57]:
lr_data = df_r.select(col("selling_price").alias("label"), *features)
lr_data.printSchema()

root
 |-- label: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- mileage: double (nullable = true)
 |-- engine: integer (nullable = true)
 |-- max_power: double (nullable = true)
 |-- seats: integer (nullable = true)
 |-- brand_index: double (nullable = true)
 |-- Region_index: double (nullable = true)
 |-- fuel_index: double (nullable = true)
 |-- StateorProvince_index: double (nullable = true)
 |-- sold_index: double (nullable = true)
 |-- owner_index: double (nullable = true)
 |-- City_index: double (nullable = true)
 |-- seller_type_index: double (nullable = true)
 |-- transmission_index: double (nullable = true)



In [58]:
lr_data.show()

+-------+----+---------+-------+------+---------+-----+-----------+------------+----------+---------------------+----------+-----------+----------+-----------------+------------------+
|  label|year|km_driven|mileage|engine|max_power|seats|brand_index|Region_index|fuel_index|StateorProvince_index|sold_index|owner_index|City_index|seller_type_index|transmission_index|
+-------+----+---------+-------+------+---------+-----+-----------+------------+----------+---------------------+----------+-----------+----------+-----------------+------------------+
| 450000|2014|   145500|   23.4|  1248|     74.0|    5|        0.0|         2.0|       0.0|                 33.0|       1.0|        0.0|       5.0|              0.0|               0.0|
|1149000|2019|     5000|   17.0|  1591|    121.3|    5|        1.0|         2.0|       1.0|                 33.0|       1.0|        0.0|       5.0|              0.0|               0.0|
| 600000|2017|    25000|  18.16|  1196|     86.8|    5|        6.0|        

### Linear Regressor

In [80]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [121]:
(trainingData, testData) = lr_data.randomSplit([0.8, 0.2])

#### VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [122]:
vectorAssembler = VectorAssembler(inputCols=features, outputCol="features")
train_a = vectorAssembler.transform(trainingData)
#train_a.show()
train_b = train_a.select("features",train_a.label.alias('label'))
train_b.show(truncate= False)

+-----------------------------------------------------------------------------+-----+
|features                                                                     |label|
+-----------------------------------------------------------------------------+-----+
|[1997.0,80000.0,16.1,796.0,37.0,4.0,0.0,0.0,1.0,6.0,0.0,2.0,249.0,0.0,0.0]   |29999|
|[2001.0,10000.0,17.3,993.0,60.0,5.0,0.0,0.0,1.0,1.0,0.0,0.0,1081.0,0.0,0.0]  |30000|
|[2000.0,56194.0,16.1,796.0,37.0,4.0,0.0,0.0,1.0,6.0,0.0,3.0,593.0,0.0,0.0]   |31000|
|[2004.0,90000.0,18.9,998.0,67.1,5.0,0.0,2.0,1.0,11.0,0.0,2.0,1162.0,0.0,0.0] |33351|
|[1998.0,40000.0,16.1,796.0,37.0,4.0,0.0,3.0,1.0,9.0,0.0,1.0,639.0,0.0,0.0]   |35000|
|[2003.0,35000.0,16.1,796.0,37.0,4.0,0.0,1.0,1.0,21.0,1.0,1.0,762.0,0.0,0.0]  |35000|
|[2001.0,42108.0,16.1,796.0,37.0,4.0,0.0,0.0,1.0,1.0,0.0,0.0,422.0,0.0,0.0]   |39000|
|[1996.0,32000.0,16.1,796.0,37.0,4.0,0.0,1.0,1.0,30.0,0.0,1.0,177.0,0.0,0.0]  |40000|
|[1997.0,120000.0,16.1,796.0,37.0,4.0,0.0,3.0,1.0,9.0,

In [123]:
lr = LinearRegression()
model = lr.fit(train_b)
test_a = vectorAssembler.transform(testData)
test_b = test_a.select('features', test_a.label.alias('label'))
test_c = model.transform(test_b)
test_c.show(truncate=False)

+----------------------------------------------------------------------------+-----+-------------------+
|features                                                                    |label|prediction         |
+----------------------------------------------------------------------------+-----+-------------------+
|[2004.0,110000.0,16.1,796.0,37.0,4.0,0.0,3.0,1.0,4.0,1.0,2.0,724.0,0.0,0.0] |31504|-629600.7332796678 |
|[2007.0,10000.0,16.1,796.0,37.0,4.0,0.0,2.0,1.0,11.0,0.0,0.0,1248.0,0.0,0.0]|35000|-454552.260469839  |
|[2002.0,80000.0,16.1,796.0,37.0,4.0,0.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0]    |40000|-682258.409262687  |
|[1996.0,70000.0,16.1,796.0,37.0,4.0,0.0,3.0,1.0,31.0,0.0,2.0,174.0,0.0,0.0] |42000|-863181.513095215  |
|[2003.0,62000.0,16.1,796.0,37.0,4.0,0.0,3.0,1.0,4.0,0.0,0.0,617.0,0.0,0.0]  |42000|-635431.1577419862 |
|[1998.0,80000.0,16.1,796.0,37.0,4.0,0.0,2.0,1.0,5.0,0.0,3.0,296.0,0.0,0.0]  |45000|-829615.7728456482 |
|[1999.0,100000.0,17.3,993.0,60.0,5.0,0.0,3.0,1.0,4.0,0

In [125]:
evaluator = RegressionEvaluator()
print(evaluator.evaluate(test_c,{evaluator.metricName: "r2"}))

0.6753329119351326
