# Part 5

This section includes:
* assemble data
* transform data
* train model (LinearRegression)

In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('Part 5')
    .getOrCreate()
)

In [3]:
# Load the semicolon-separated CSV; the first row is the header and column
# types are inferred from the data rather than declared.
data_path = './data/Part_5/data_1.csv'
basic_dataframe = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv(data_path)
)
basic_dataframe.show()

+---+------+----------------+
|Age|Income|Customer_Segment|
+---+------+----------------+
| 25| 50000|              10|
| 32| 60000|              20|
| 22| 45000|              20|
| 45| 80000|              30|
| 36| 75000|              30|
| 28| 52000|              20|
| 40| 70000|              20|
| 21| 48000|              10|
| 30| 55000|              10|
| 38| 73000|              20|
+---+------+----------------+



In [6]:
# Combine the two predictor columns into one vector column; that column is
# later passed to LinearRegression as its featuresCol.
featureAssembler = VectorAssembler(
    inputCols=['Age', 'Customer_Segment'],
    outputCol='[Age, Customer_Segment]',
)

In [7]:
# Apply the assembler: the result keeps the original columns and adds the
# assembled '[Age, Customer_Segment]' vector column.
transformed_dataframe = featureAssembler.transform(basic_dataframe)
transformed_dataframe.show()

+---+------+----------------+-----------------------+
|Age|Income|Customer_Segment|[Age, Customer_Segment]|
+---+------+----------------+-----------------------+
| 25| 50000|              10|            [25.0,10.0]|
| 32| 60000|              20|            [32.0,20.0]|
| 22| 45000|              20|            [22.0,20.0]|
| 45| 80000|              30|            [45.0,30.0]|
| 36| 75000|              30|            [36.0,30.0]|
| 28| 52000|              20|            [28.0,20.0]|
| 40| 70000|              20|            [40.0,20.0]|
| 21| 48000|              10|            [21.0,10.0]|
| 30| 55000|              10|            [30.0,10.0]|
| 38| 73000|              20|            [38.0,20.0]|
+---+------+----------------+-----------------------+



In [9]:
# Keep only what the model needs: the label (Income) and the assembled
# feature vector.
prepared_data = transformed_dataframe.select(
    'Income',
    '[Age, Customer_Segment]',
)
prepared_data.show()

+------+-----------------------+
|Income|[Age, Customer_Segment]|
+------+-----------------------+
| 50000|            [25.0,10.0]|
| 60000|            [32.0,20.0]|
| 45000|            [22.0,20.0]|
| 80000|            [45.0,30.0]|
| 75000|            [36.0,30.0]|
| 52000|            [28.0,20.0]|
| 70000|            [40.0,20.0]|
| 48000|            [21.0,10.0]|
| 55000|            [30.0,10.0]|
| 73000|            [38.0,20.0]|
+------+-----------------------+



In [13]:
# Split the prepared rows into training and test sets with 0.75 / 0.25
# weights. The split is random, so the resulting sizes are only approximately
# proportional to the weights. A fixed seed is passed so that re-running the
# notebook on the same data yields the same split, and therefore the same
# coefficients and error metrics, instead of different numbers on every run.
train_data, test_data = prepared_data.randomSplit([0.75, 0.25], seed=42)

In [15]:
# Configure the estimator (Income as label, assembled vector as features) and
# fit it on the training split. `regressor` ends up bound to the fitted model,
# which the following cells inspect and evaluate.
linear_regression = LinearRegression(
    labelCol='Income',
    featuresCol='[Age, Customer_Segment]',
)
regressor = linear_regression.fit(train_data)

In [17]:
# Fitted coefficients, one per entry of the assembled feature vector
# (Age, Customer_Segment, in that order).
regressor.coefficients

DenseVector([1230.4703, 374.3879])

In [18]:
# Intercept term of the fitted linear model.
regressor.intercept

15383.987563155559

In [22]:
# Evaluate the fitted model on the held-out split. The returned summary
# exposes a `predictions` DataFrame (label, features, prediction) as well as
# the error metrics read in the next cell.
prediction_data = regressor.evaluate(test_data)
predictions = prediction_data.predictions
predictions.show()

+------+-----------------------+-----------------+
|Income|[Age, Customer_Segment]|       prediction|
+------+-----------------------+-----------------+
| 45000|            [22.0,20.0]|49942.09094442267|
| 70000|            [40.0,20.0]|72090.55577147311|
+------+-----------------------+-----------------+



In [24]:
# Show the test-set error metrics as a (mean absolute error, mean squared
# error) pair.
mae = prediction_data.meanAbsoluteError
mse = prediction_data.meanSquaredError
(mae, mse)

(3516.323357947891, 14397343.168292053)