# ML task on Cars dataset
**Target:**

    Predict how many days `duration_listed` it will take to sell a car on a web platform.

![cars.jpg](attachment:cars.jpg)

In [1]:
from pyspark.sql import SparkSession

In [2]:
appName = "ML pipline"
master = "local[4]"

spark = SparkSession.builder.master(master).appName(appName).getOrCreate()

21/12/13 01:23:06 WARN Utils: Your hostname, Dmitrijs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.100 instead (on interface en0)
21/12/13 01:23:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/13 01:23:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

# 0. Reading Data

In [4]:
df_cars = spark.read.csv('cars.csv', header='True', inferSchema=True)
TARGET = 'duration_listed'

                                                                                

In [5]:
df_cars = df_cars.dropna()

In [7]:
df_cars.printSchema()

root
 |-- manufacturer_name: string (nullable = true)
 |-- model_name: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- odometer_value: integer (nullable = true)
 |-- year_produced: integer (nullable = true)
 |-- engine_capacity: double (nullable = true)
 |-- body_type: string (nullable = true)
 |-- price_usd: double (nullable = true)
 |-- location_region: string (nullable = true)
 |-- number_of_photos: integer (nullable = true)
 |-- up_counter: integer (nullable = true)
 |-- duration_listed: integer (nullable = true)



### Separating different data types

In [9]:
str_cols = [item[0] for item in df_cars.dtypes if item[1].startswith('string')]

float_cols = [item[0] for item in df_cars.dtypes if item[1].startswith('double')]

int_cols = [item[0] for item in df_cars.dtypes if item[1].startswith('int')]

bool_cols = [item[0] for item in df_cars.dtypes if item[1].startswith('boolean')]

# 1. Splitting the data into train and test

### 1.1 Simple splitting

In [10]:
# Split the data into train and test sets
train, test = df_cars.randomSplit([.75, .25], seed=1234)

In [11]:
print(f'Train size: {train.count()}\nTest size:  {test.count()}')

                                                                                

Train size: 28854
Test size:  9667


**Note**

Stratified split need to be done by hands

### 1.2 Stratified splitting

In [12]:
df_cars.groupby('transmission').count().show()

+------------+-----+
|transmission|count|
+------------+-----+
|   automatic|12888|
|  mechanical|25633|
+------------+-----+



In [13]:
zeros = df_cars.filter("transmission = 'automatic'")
ones = df_cars.filter("transmission = 'mechanical'")
# split datasets into training and testing
train0, test0 = zeros.randomSplit([.75, .25], seed=1234)
train1, test1 = ones.randomSplit([.75, .25], seed=1234)
# stack datasets back together
train = train0.union(train1)
test = test0.union(test1)

In [14]:
train.count(), test.count()



(28900, 9621)

# 2. Encoding Categorical variables

### 2.1 Label encoding (string indexing)

In [15]:
from pyspark.ml.feature import StringIndexer

In [16]:
str_cols_ind =[col +'_ind' for col in str_cols]

str_indexer = StringIndexer(inputCols=str_cols, outputCols=str_cols_ind)

train = str_indexer.fit(train).transform(train)

                                                                                

In [18]:
train.select('manufacturer_name', 'manufacturer_name_ind').distinct().sort('manufacturer_name_ind').show(5)

+-----------------+---------------------+
|manufacturer_name|manufacturer_name_ind|
+-----------------+---------------------+
|       Volkswagen|                  0.0|
|             Opel|                  1.0|
|              BMW|                  2.0|
|             Ford|                  3.0|
|             Audi|                  4.0|
+-----------------+---------------------+
only showing top 5 rows



### 2.2 One hot encoding

In [20]:
from pyspark.ml.feature import OneHotEncoder

In [22]:
# Create a OneHotEncoder

str_dummys = [col + '_dummy' for col in str_cols]

onehot = OneHotEncoder(inputCols=str_cols_ind, outputCols=str_dummys)

In [23]:
train = onehot.fit(train).transform(train)

In [24]:
train.select('manufacturer_name',
             'manufacturer_name_ind',
             'manufacturer_name_dummy')\
            .distinct()\
            .sort('manufacturer_name_ind')\
            .show()

+-----------------+---------------------+-----------------------+
|manufacturer_name|manufacturer_name_ind|manufacturer_name_dummy|
+-----------------+---------------------+-----------------------+
|       Volkswagen|                  0.0|         (54,[0],[1.0])|
|             Opel|                  1.0|         (54,[1],[1.0])|
|              BMW|                  2.0|         (54,[2],[1.0])|
|             Ford|                  3.0|         (54,[3],[1.0])|
|             Audi|                  4.0|         (54,[4],[1.0])|
|          Renault|                  5.0|         (54,[5],[1.0])|
|    Mercedes-Benz|                  6.0|         (54,[6],[1.0])|
|          Peugeot|                  7.0|         (54,[7],[1.0])|
|          Citroen|                  8.0|         (54,[8],[1.0])|
|           Nissan|                  9.0|         (54,[9],[1.0])|
|            Mazda|                 10.0|        (54,[10],[1.0])|
|           Toyota|                 11.0|        (54,[11],[1.0])|
|         

[Stage 26:>                                                         (0 + 2) / 2]                                                                                

### `Note` Dense vs Sparse vectors

In [20]:
from pyspark.ml.linalg import SparseVector, DenseVector

DenseVector([1, 0, 0, 0, 0, 0, 7, 0, 0])

DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 0.0])

In [21]:
SparseVector(9, [0, 6], [1, 7])

SparseVector(9, {0: 1.0, 6: 7.0})

# 4. Vectorizing predictors

In [25]:
from pyspark.ml.feature import VectorAssembler

In [27]:
train = train.select(int_cols + float_cols + str_dummys)

vec_assembler = VectorAssembler(inputCols=train.columns, outputCol='features')

train = vec_assembler.transform(train)

train.select('features').show()

+--------------------+
|            features|
+--------------------+
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
|(1150,[0,1,2,3,4,...|
+--------------------+
only showing top 20 rows



# 5. Baseline model

In [31]:
import pyspark.ml

In [34]:
from pyspark.ml.regression import LinearRegression

In [35]:
reg = LinearRegression(labelCol=TARGET, featuresCol='features')

In [36]:
reg = reg.fit(train)

21/12/13 01:32:56 WARN Instrumentation: [db42c6cd] regParam is zero, which might cause numerical instability and overfitting.
21/12/13 01:32:56 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
21/12/13 01:32:56 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
21/12/13 01:32:57 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
21/12/13 01:32:57 WARN Instrumentation: [db42c6cd] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
21/12/13 01:32:58 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/12/13 01:32:58 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [37]:
predictions = reg.transform(train)

In [38]:
predictions.select('prediction', TARGET).show()

+------------------+---------------+
|        prediction|duration_listed|
+------------------+---------------+
|171.99966937750236|            172|
|152.00043232862572|            152|
| 61.00039985877073|             61|
|228.00881627833198|            228|
|21.004959882561277|             21|
| 124.0051284774795|            124|
|1181.9997296909503|           1182|
|186.00255264048337|            186|
|  118.998348281514|            119|
|40.000285416312664|             40|
|2078.9957483423136|           2079|
| 4.998532476567573|              5|
|11.005056153951378|             11|
|265.00008605794403|            265|
|154.00028038763838|            154|
|  789.997760545205|            790|
|  90.9983982943395|             91|
| 7.998422849330582|              8|
| 66.99337051761036|             67|
|251.99684822765752|            252|
+------------------+---------------+
only showing top 20 rows



In [39]:
from pyspark.ml.evaluation import RegressionEvaluator

In [42]:
lr_evaluator = RegressionEvaluator(
    labelCol=TARGET, predictionCol="prediction", metricName="r2")
r2 = lr_evaluator.evaluate(predictions)
print(f'R2 on train data = {r2}')

R2 on train data = 0.9999999943022991


# 6. Pipline

![Screenshot%202021-12-12%20at%202.24.36%20AM.png](attachment:Screenshot%202021-12-12%20at%202.24.36%20AM.png)

In [46]:
from pyspark.ml import Pipeline

from pyspark.ml.regression import GBTRegressor

stages = []

#depend on categorical columns: country and types of emission

for categoricalCol in str_cols:
    stringIndexer = StringIndexer(inputCol = categoricalCol,
                                  outputCol = categoricalCol + '_Index', handleInvalid = 'keep')
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                            outputCol=categoricalCol + "_classVec")
    stages.extend([stringIndexer, encoder])

assemblerInputs = [c + "_classVec" for c in str_cols] + float_cols + int_cols

# transfrom columns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [48]:
# train/test split (25% test)
trainingData, testData = df_cars.randomSplit([0.75, 0.25])

# Train Gradient Boosting on trees
gbt = GBTRegressor(labelCol=TARGET, featuresCol="features", maxIter=100)
stages.append(gbt)

In [49]:
# build pipline
pipeline = Pipeline(stages=stages)

In [54]:
pipeline.extractParamMap()

{Param(parent='Pipeline_1fd8d846cf1b', name='stages', doc='a list of pipeline stages'): [StringIndexer_46a997a566aa,
  OneHotEncoder_342ff370ea24,
  StringIndexer_58601d195e0a,
  OneHotEncoder_3303777956f9,
  StringIndexer_fca568c1d7e8,
  OneHotEncoder_e560b9b14308,
  StringIndexer_2abe2635b353,
  OneHotEncoder_209af54ed4b3,
  StringIndexer_52110707a4b0,
  OneHotEncoder_9d06dbedee65,
  VectorAssembler_a8ec130d6683,
  GBTRegressor_098d7ccf4bcb]}

In [55]:
# тренируем модель
model = pipeline.fit(trainingData)

                                                                                

In [56]:
# делаем предсказания на тестовой выборке
predictions = model.transform(testData)

# 7. Validating the model

In [57]:
predictions.select(TARGET, "prediction").show()

+---------------+------------------+
|duration_listed|        prediction|
+---------------+------------------+
|             61|  56.5959557738575|
|             60|56.695858824669536|
|            228|232.73776149858264|
|             21|16.447366722911127|
|           1182| 740.3030007843715|
|           2079|   739.29906660045|
|             91|   90.746617552623|
|              8| 6.163956306848992|
|             67| 66.83515809275464|
|            252|254.63637917809243|
|             90| 90.86865791178727|
|            135|140.12313964458912|
|             73| 68.51555197762012|
|            121|124.38333204517578|
|             30|29.853760519507322|
|             54| 56.65851776522103|
|            153|157.85222590661158|
|             95|  90.9685609625993|
|             75| 68.34412694404726|
|             62|56.695858824669536|
+---------------+------------------+
only showing top 20 rows



In [58]:
gbt_evaluator = RegressionEvaluator(
    labelCol=TARGET, predictionCol="prediction", metricName="r2")
r2 = gbt_evaluator.evaluate(predictions)
print(f"R2 on test data = {r2:.2f}")

[Stage 1057:>                                                       (0 + 1) / 1]

R2 on test data = 0.74


                                                                                

# 8. Save model

In [60]:
model.save('model_GBT')

In [39]:
# spark.stop()

---