# Linear Regression using Spark Mllib

Data Preparation 

Import the necessary PySpark SQL functions. If you are using `Row`, make sure you are using Spark version 1.4 or above.

In [1]:
from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors


Read the dataset as an RDD and store it as `rdd1`. Make sure your dataset has a numeric prediction variable.

In [2]:
if __name__ == "__main__":

    # Obtain the SparkSession for this example, creating it if none exists yet.
    # The warehouse-dir config entry is only needed on Windows; see the note
    # in the original comment.
    spark = (
        SparkSession.builder
        .config("spark.sql.warehouse.dir", "file:///C:/temp")
        .appName("LinearRegression")
        .getOrCreate()
    )

In [3]:
    # Read the raw text file, then turn every comma-separated line into the
    # (label, feature-vector) pair that the regression code consumes.
    inputLines = spark.sparkContext.textFile('regression.txt')

    def _to_labeled_pair(line):
        # Field 0 is used as the label, field 1 as the single feature.
        fields = line.split(",")
        return (float(fields[0]), Vectors.dense(float(fields[1])))

    data = inputLines.map(_to_labeled_pair)

In [4]:
    # Give the (label, features) tuples column names and wrap them in a DataFrame.
    colNames = ["label", "features"]
    df = spark.createDataFrame(data, schema=colNames)

    # The RDD -> DataFrame hop is often unnecessary: data imported from a real
    # database, or arriving through structured streaming, can be read as a
    # DataFrame directly.

In [5]:
    # Split the data into a training half and a testing half.
    # No seed is passed to randomSplit here.
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our linear regression model
    lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Train the model using our training data
    model = lir.fit(trainingDF)

    # These imports were pasted in with mixed indentation: the column-0 lines
    # ended the `if __name__` body, making everything after them an
    # IndentationError. They are kept, but indented consistently so the block
    # parses. Nothing in the rest of this block uses these names.
    import findspark
    findspark.init()
    import pyspark as ps
    import warnings
    from pyspark.sql import SQLContext

    # Now see if we can predict values in our test data.
    # Generate predictions using our linear regression model for all features in our
    # test dataframe. The result is cached because it is read twice below.
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Zip them together into (prediction, label) pairs and bring them to the driver.
    predictionAndLabel = predictions.zip(labels).collect()

    # Print out the predicted and actual values for each point
    for prediction in predictionAndLabel:
        print(prediction)

(-2.65764264664974, -3.74)
(-1.5482369599878176, -2.17)
(-1.3917823118688286, -2.09)
(-1.398893886783328, -1.94)
(-1.3348897125528325, -1.88)
(-1.2211045139208405, -1.79)
(-1.171323489519344, -1.77)
(-1.0290919912293541, -1.67)
(-1.1499887647758456, -1.65)
(-1.1499887647758456, -1.6)
(-1.1428771898613461, -1.59)
(-1.100207740374349, -1.57)
(-1.1855466393483431, -1.53)
(-1.0290919912293541, -1.47)
(-0.9295299424263611, -1.4)
(-0.9935341166568565, -1.36)
(-0.915306792597362, -1.34)
(-0.829967893623368, -1.3)
(-1.0362035661438536, -1.29)
(-0.8370794685378675, -1.27)
(-0.8228563187088684, -1.26)
(-0.8441910434523671, -1.25)
(-0.9295299424263611, -1.25)
(-0.7659637193928726, -1.24)
(-0.8441910434523671, -1.22)
(-0.8441910434523671, -1.2)
(-0.872637343110365, -1.17)
(-0.8797489180248645, -1.17)
(-0.8086331688798695, -1.14)
(-0.8655257681958656, -1.11)
(-0.744628994649374, -1.09)
(-0.829967893623368, -1.09)
(-0.7872984441363711, -1.08)
(-0.701959545162377, -1.05)
(-0.5952859214448845, -1.03)


We can use take() to have a look at our data. Calling take(5) shows the first 5 rows of our CSV file.

In [6]:
# Peek at the first five elements of rdd1.
# NOTE(review): rdd1 is never assigned in any visible cell, so this raises
# NameError (as the recorded output shows). Load the dataset into rdd1 first.
rdd1.take(5)

NameError: name 'rdd1' is not defined

In [None]:
# Break every raw text record into its comma-separated fields.
rdd1 = rdd1.map(lambda record: record.split(","))

In [None]:
# Show the first two records to check the result of the split.
rdd1.take(2)

Remove the headers

In [None]:
# The first record is taken to be the header row; keep every record that
# differs from it.
header = rdd1.first()
rdd1 = rdd1.filter(lambda record: record != header)

In [None]:
# Show the first two records again; the header row should no longer appear.
rdd1.take(2)

Convert to a DataFrame

In [None]:
# Row is not imported by any visible cell, so bring it into scope here;
# without this import the cell fails with NameError.
from pyspark.sql import Row

# Name each positional field of a record and convert the RDD to a DataFrame.
# NOTE(review): the values are presumably still strings (they come from
# split(",")); cast them if numeric column types are needed.
df1 = rdd1.map(
    lambda line: Row(
        Price=line[0],
        Age=line[1],
        KM=line[2],
        FuelType=line[3],
        HP=line[4],
        MetColor=line[5],
        Automatic=line[6],
        CC=line[7],
        Doors=line[8],
        Weight=line[9],
    )
).toDF()

In [None]:
# Print the first five rows of the DataFrame.
df1.show(5)

View as a Pandas DataFrame. All these steps help us understand the data better.

In [None]:
# Convert to a Pandas DataFrame and display its first rows.
df1.toPandas().head()

In [None]:
# Show summary statistics for the four columns of interest.
df1.describe(['Price','CC','Age','KM']).show()

# Linear Regression with Spark mllib

In [None]:
import pyspark.mllib
import pyspark.mllib.regression
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import * 



Select only the columns that you consider important for predicting the output variable. You can use describe() to show the summary statistics.

In [None]:
# Keep the target (Price) plus the three predictors, then print summary
# statistics for exactly those columns.
keyColumns = ['Price', 'CC', 'Age', 'KM']
df1 = df1.select(*keyColumns)
df1.describe(keyColumns).show()

In [None]:
# Look at the first two rows of the reduced DataFrame.
df1.take(2)

In [None]:
# Check what kind of object df1 is.
type(df1)

In [None]:
# Print the column names and types of df1.
df1.printSchema()

# Labeled Points 

An important thing to note about MLlib is that it requires our features to be expressed as LabeledPoints. Use map on the DataFrame's rows to return an RDD of LabeledPoints.

In [None]:
# Build an RDD of LabeledPoints: column 0 is the label, the remaining
# columns are the features.
# - DataFrame has no .map() from Spark 2.0 on, so map over the underlying RDD
#   (.rdd also exists on Spark 1.x DataFrames).
# - The features are passed as the flat slice line[1:]; wrapping it in another
#   list would give a nested, two-dimensional feature vector.
temp = df1.rdd.map(lambda line: LabeledPoint(line[0], line[1:]))

In [None]:
# Inspect the first five LabeledPoints.
temp.take(5)

Use the StandardScaler in Spark to scale the data

In [None]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

Use an RDD to scale the data

In [None]:
# Keep every column except the first (the label) as the feature values.
# DataFrame has no .map() from Spark 2.0 on, so map over the underlying RDD.
features = df1.rdd.map(lambda row: row[1:])

In [None]:
# Look at the first feature record.
features.take(1)

In [None]:
# Fit a scaler on the feature RDD and apply it to the same data.
# withMean/withStd are written out at their default values.
# Note that `model` now refers to the fitted scaler.
standardizer = StandardScaler(withMean=False, withStd=True)
model = standardizer.fit(features)
features_transform = model.transform(features)

# The scaled features : 

In [None]:
# Show the first five scaled feature vectors.
features_transform.take(5)

# Add labels together with the features

In [None]:
# The first column of each row is the label.
# DataFrame has no .map() from Spark 2.0 on, so map over the underlying RDD.
labels = df1.rdd.map(lambda row: row[0])
labels.take(5)

In [None]:
# Pair each label with its scaled feature vector, position by position.
# NOTE(review): this relies on labels and features_transform having the same
# ordering and partitioning. Both appear to derive from df1; confirm.
transformedData = labels.zip(features_transform)
transformedData.take(5)

In [None]:
# Turn every (label, scaled-features) pair into a LabeledPoint and preview
# the first five.
transformedData = transformedData.map(
    lambda pair: LabeledPoint(label=pair[0], features=pair[1])
)
transformedData.take(5)

# Build the Model 

In [None]:
# 80/20 train/test split with a fixed seed.
trainingData, testingData = transformedData.randomSplit(
    weights=[0.8, 0.2],
    seed=1234,
)

In [None]:
from pyspark.mllib.regression import LinearRegressionWithSGD


In [None]:
# Train on the training split with 1000 iterations and a step of 0.2.
linearModel = LinearRegressionWithSGD.train(
    trainingData,
    iterations=1000,
    step=0.2,
)


In [None]:
# Display the weights of the trained model.
linearModel.weights

In [None]:
# Look at ten records from the test split.
testingData.take(10)

In [None]:
# Predict the label for a single hand-written feature vector of three values.
# NOTE(review): presumably these are already-scaled CC, Age and KM values, in
# the same order as the training features; confirm.
linearModel.predict([8.5478,1.5054,0.8591])