## PySpark Session-6:

## PySpark MLlib: Machine Learning Package
    Implementing Regression Model

In [2]:
# Create spark session and import the dataset

from pyspark.sql import SparkSession

In [4]:
# Start a Spark session (or reuse one that is already running) under the app name "Mlib".
spark = (
    SparkSession.builder
    .appName("Mlib")
    .getOrCreate()
)

In [5]:
# Load the CSV: the first row supplies the column names and Spark infers each column's type.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Test6.csv")
)

In [6]:
# Preview the loaded rows to confirm the CSV was parsed as expected.
df_pyspark.show()

+----+---+---+------+
|Name|Age|Exp|Salary|
+----+---+---+------+
|   A| 21|  3| 30000|
|   B| 23|  4| 45000|
|   C| 22|  3| 38000|
|   D| 24|  4| 55000|
|   E| 25|  5| 60000|
|   F| 24|  3| 58000|
|   G| 25|  5| 62000|
|   H| 22|  2| 28000|
+----+---+---+------+



In [7]:
# Check the inferred column types; Age, Exp and Salary need to be numeric for the regression.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Exp: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# Fitting regression model:
# Age & Exp as Independent Variables/features
# Salary: Dependent/predicted or target variable

#  [Age, Exp] --- Independent_Features
#  [Salary]   --- Dependent Feature

In [10]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the Age and Exp columns into one vector column,
# which is the column later passed to the regressor as its features.
feature_as_assmb = VectorAssembler(
    inputCols=["Age", "Exp"],
    outputCol="Independent_Features",
)

In [13]:
# Combine the Age and Exp columns into a single vector column, "Independent_Features".
# transform() returns a new DataFrame with that vector column appended to the original ones.

trans_dataset = feature_as_assmb.transform(df_pyspark)
trans_dataset.show()

+----+---+---+------+--------------------+
|Name|Age|Exp|Salary|Independent_Features|
+----+---+---+------+--------------------+
|   A| 21|  3| 30000|          [21.0,3.0]|
|   B| 23|  4| 45000|          [23.0,4.0]|
|   C| 22|  3| 38000|          [22.0,3.0]|
|   D| 24|  4| 55000|          [24.0,4.0]|
|   E| 25|  5| 60000|          [25.0,5.0]|
|   F| 24|  3| 58000|          [24.0,3.0]|
|   G| 25|  5| 62000|          [25.0,5.0]|
|   H| 22|  2| 28000|          [22.0,2.0]|
+----+---+---+------+--------------------+



In [14]:
# List the column names of the assembled dataset (original columns plus the feature vector).
trans_dataset.columns

['Name', 'Age', 'Exp', 'Salary', 'Independent_Features']

In [16]:
# Keep only the two columns the regression needs:
#   Independent_Features -> independent variable (model input)
#   Salary               -> dependent variable (target)

Final_Reg_Data = trans_dataset.select(["Independent_Features", "Salary"])
Final_Reg_Data.show()

+--------------------+------+
|Independent_Features|Salary|
+--------------------+------+
|          [21.0,3.0]| 30000|
|          [23.0,4.0]| 45000|
|          [22.0,3.0]| 38000|
|          [24.0,4.0]| 55000|
|          [25.0,5.0]| 60000|
|          [24.0,3.0]| 58000|
|          [25.0,5.0]| 62000|
|          [22.0,2.0]| 28000|
+--------------------+------+



In [19]:
from pyspark.ml.regression import LinearRegression

# Train-test split.
# A fixed seed keeps the split repeatable, so the coefficients, predictions and error
# metrics in the following cells can be reproduced on a re-run. Without it every run
# trains and tests on different rows, which matters a lot with only 8 rows in total.
# The 0.75/0.25 weights are targets, not exact sizes: the recorded (unseeded) run below
# placed 4 of the 8 rows in the test set.
SPLIT_SEED = 42
train_data, test_data = Final_Reg_Data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [22]:
# Configure the estimator: features come from the assembled vector column, the label is Salary.
# It gets its own name so that `regressor` is never bound to two different kinds of object
# (unfitted estimator first, fitted model afterwards).
lr_estimator = LinearRegression(featuresCol="Independent_Features", labelCol="Salary")

# fit() returns the fitted model. It is stored as `regressor` because the following cells
# read its coefficients, intercept and evaluation results through that name.
regressor = lr_estimator.fit(train_data)

In [23]:
# Fitted regression coefficients, one per input feature in assembler order (Age, Exp).
regressor.coefficients

DenseVector([9428.5714, -3285.7143])

In [24]:
# Fitted intercept (constant term) of the regression model.
regressor.intercept

-158714.28571428492

In [25]:
# Evaluate the fitted model on the held-out test rows.
# The returned summary object carries the per-row predictions and the error metrics read below.

pred_res = regressor.evaluate(test_data)

In [26]:
# Show each test row's actual Salary next to the model's prediction.
pred_res.predictions.show()

+--------------------+------+------------------+
|Independent_Features|Salary|        prediction|
+--------------------+------+------------------+
|          [22.0,2.0]| 28000|42142.857142857654|
|          [24.0,4.0]| 55000| 54428.57142857116|
|          [25.0,5.0]| 60000| 60571.42857142794|
|          [25.0,5.0]| 62000| 60571.42857142794|
+--------------------+------+------------------+



In [30]:
# Mean absolute error and mean squared error on the test rows, displayed as a tuple (MAE, MSE).
# NOTE(review): with so few test rows these metrics are dominated by individual rows — treat as illustrative.

pred_res.meanAbsoluteError, pred_res.meanSquaredError

(4178.571428571624, 50678571.428575404)