# PySpark for Linear Regression

### 1. Set up the Spark context and SparkSession

In [1]:
from pyspark.sql import SparkSession

# Create the SparkSession used by every later cell (getOrCreate reuses a
# session that is already running instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Python Spark Regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

### 2. Load dataset

In [2]:
# Read the advertising data with Spark's built-in CSV reader: the first line
# supplies the column names and the column types are inferred from the values.
df = spark.read.csv("../data/Advertising.csv", header=True, inferSchema=True)

In [3]:
# Preview the first five rows (long values truncated), then print the
# inferred column types.
df.show(n=5, truncate=True)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



### 3. Convert the data to dense vectors

- Check the dataset and the schema

In [4]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [5]:
def transData(data, label_col=None):
    """Convert a DataFrame into the two-column ``features`` / ``label`` layout.

    Args:
        data: Spark DataFrame holding the predictor columns and the response
            column.
        label_col: Name of the response column. When omitted, the last column
            of ``data`` is used, which is what this function always did before
            the parameter existed.

    Returns:
        A DataFrame with a ``features`` column (dense vector built from every
        column other than the label, in their original order) and a ``label``
        column.

    Raises:
        ValueError: If ``label_col`` is given but is not a column of ``data``.
    """
    columns = data.columns
    if label_col is None:
        label_col = columns[-1]
    elif label_col not in columns:
        # Fail here, on the driver, rather than inside a Spark task.
        raise ValueError(
            "label_col %r is not a column of data (columns: %s)" % (label_col, columns)
        )
    feature_cols = [name for name in columns if name != label_col]

    def to_features_and_label(row):
        # Rows are read by column name so the label may sit at any position.
        return [Vectors.dense([row[name] for name in feature_cols]), row[label_col]]

    return data.rdd.map(to_features_and_label).toDF(['features', 'label'])

### 4. Transform the dataset to DataFrame

In [6]:
# Build the modelling frame (`features` vector column plus `label` column)
# from the raw advertising data.
transformed = transData(df)

In [7]:
# Preview the first five rows of the features/label frame.
transformed.show(n=5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



### 6. Split the data into training and test sets

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [9]:
# Split the data into training and test sets (40% held out for testing).
# The seed is fixed so that re-running the notebook on the same input yields
# the same split, and therefore the same fitted model and metrics.
SPLIT_SEED = 42
trainingData, testData = transformed.randomSplit([0.6, 0.4], seed=SPLIT_SEED)

In [10]:
# Preview the first five rows of each split: training first, then test.
trainingData.show(n=5)
testData.show(n=5)

+---------------+-----+
|       features|label|
+---------------+-----+
| [0.7,39.6,8.7]|  1.6|
| [4.1,11.6,5.7]|  3.2|
|[7.3,28.1,41.4]|  5.5|
| [8.4,27.2,2.1]|  5.7|
|  [8.6,2.1,1.0]|  4.8|
+---------------+-----+
only showing top 5 rows

+----------------+-----+
|        features|label|
+----------------+-----+
|  [5.4,29.9,9.4]|  5.3|
| [7.8,38.9,50.6]|  6.6|
| [8.7,48.9,75.0]|  7.2|
|[11.7,36.9,45.2]|  7.3|
|[16.9,43.7,89.4]|  8.7|
+----------------+-----+
only showing top 5 rows



### 5. Fit Ordinary Least Squares Regression Model

In [11]:
# Import LinearRegression class
from pyspark.ml.regression import LinearRegression

# Configure the estimator. The column names are LinearRegression's defaults,
# spelled out here to show they match the frame produced by transData.
lr = LinearRegression(featuresCol="features", labelCol="label")

In [15]:
# Fit the regression on the training split only.
model = lr.fit(dataset=trainingData)

### 6. Summary of the Model

In [16]:
def modelsummary(model):
    """Print a coefficient table and fit statistics for a fitted model.

    Each printed line is built as one string, so the text is the same whether
    ``print`` is a statement or a function, and no indentation from the source
    leaks into the output.

    Args:
        model: Fitted regression model exposing ``coefficients``,
            ``intercept`` and a ``summary`` object with ``pValues``,
            ``tValues``, ``coefficientStandardErrors``, ``meanSquaredError``,
            ``rootMeanSquaredError``, ``r2`` and ``totalIterations``.

    Returns:
        None. The report is written to standard output.
    """
    summary = model.summary
    # The intercept is appended last; the note printed below tells the reader
    # that the final row of the table belongs to it.
    estimates = list(model.coefficients) + [model.intercept]

    print("Note: the last rows are the information for Intercept")
    print("## -------------------------------------------------")
    print("##   Estimate   |   Std.Error | t Values  |  P-value")

    for i in range(len(summary.pValues)):
        print("## {:10.6f} {:10.6f} {:8.3f} {:10.6f}".format(
            estimates[i],
            summary.coefficientStandardErrors[i],
            summary.tValues[i],
            summary.pValues[i],
        ))

    print("## ---")
    print("## Mean squared error: % .6f, RMSE: % .6f"
          % (summary.meanSquaredError, summary.rootMeanSquaredError))
    print("## Multiple R-squared: %f, Total iterations: %i"
          % (summary.r2, summary.totalIterations))

In [17]:
# Print the coefficient table and fit statistics for the fitted model.
modelsummary(model=model)

Note: the last rows are the information for Intercept
('##', '-------------------------------------------------')
('##', '  Estimate   |   Std.Error | t Values  |  P-value')
('##', '  0.044053', '  0.001785', '  24.680', '  0.000000')
('##', '  0.174338', '  0.011428', '  15.255', '  0.000000')
('##', '  0.012286', '  0.008553', '   1.436', '  0.153497')
('##', '  3.302545', '  0.380453', '   8.681', '  0.000000')
('##', '---')
('##', 'Mean squared error:  2.643675', ', RMSE:  1.625938')
('##', 'Multiple R-squared: 0.901501', ',             Total iterations: 1')


### 7. Make predictions

In [18]:
# Score the held-out test split with the fitted model.
predictions = model.transform(dataset=testData)

In [20]:
# Show five example rows: the inputs, the observed label and the prediction.
(
    predictions
    .select("features", "label", "prediction")
    .show(n=5)
)

+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
| 8.868608434105395|  5.3|  [5.4,29.9,9.4]|
|11.049544466124647|  6.6| [7.8,38.9,50.6]|
|13.132338830689374|  7.2| [8.7,48.9,75.0]|
| 10.80633252274804|  7.3|[11.7,36.9,45.2]|
| 12.76393099323613|  8.7|[16.9,43.7,89.4]|
+------------------+-----+----------------+
only showing top 5 rows

