|Variable     |Description |
|:------------|:----------:|
|id           |A notation for a house           |
|date         |Date house was sold          |
|price        |Price is prediction target           |
|bedrooms     |Number of bedrooms           |
|bathrooms    |Number of bathrooms           |
|sqft_living  |Square footage of the home           |
|sqft_lot     |Square footage of the lot           |
|floors       |Total floors (levels) in house           |
|waterfront   |House which has a view to a waterfront           |
|view         |Has been viewed           |
|condition    |How good the condition is overall           |
|grade        |overall grade given to the housing unit, based on King County grading system           |
|sqft_above   |Square footage of house apart from basement           |
|sqft_basement|Square footage of the basement           |
|yr_built     |Built Year           |
|yr_renovated |Year when house was renovated           |
|zipcode      |Zip code           |
|lat          |Latitude coordinate           |
|long         |Longitude coordinate           |
|sqft_living15|Living room area in 2015(implies-- some renovations) This might or might not have affected the lotsize area|
|sqft_lot15   |LotSize area in 2015(implies-- some renovations)           |

## Data Load

## Creating Python File and Writing

In [2]:
%%writefile spark_job.py

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--bucket", help="bucket for input and output")
args = parser.parse_args()

BUCKET = args.bucket

Overwriting spark_job.py


In [3]:
%%writefile -a spark_job.py
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType, DateType
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import when

# Creating Spark Session
spark = SparkSession.builder.appName("LinearRegression_with_HouseData").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

gcs_bucket='dataproc-staging-us-central1-1012630253058-dtwj7n8q/notebooks/jupyter/'
data_file = "gs://" + gcs_bucket + "kc_house_data.csv"

# reading csv
house_df = spark.read.option("header", True).csv(data_file) 

# changing date rows 20141013T000000 ---> 20141013
house_df = house_df.withColumn("date"
                        ,when(house_df.date.endswith("T000000"), regexp_replace(house_df.date,"T000000","")))

# changing data types required
house_df = house_df.withColumn("id", col("id").cast(IntegerType())) \
                   .withColumn("price", col("price").cast(FloatType())) \
                   .withColumn("bedrooms", col("bedrooms").cast(IntegerType())) \
                   .withColumn("bathrooms", col("bathrooms").cast(FloatType())) \
                   .withColumn("sqft_living", col("sqft_living").cast(IntegerType())) \
                   .withColumn("sqft_lot", col("sqft_lot").cast(IntegerType())) \
                   .withColumn("floors", col("floors").cast(FloatType())) \
                   .withColumn("waterfront", col("waterfront").cast(IntegerType())) \
                   .withColumn("view", col("view").cast(IntegerType())) \
                   .withColumn("condition", col("condition").cast(IntegerType())) \
                   .withColumn("grade", col("grade").cast(IntegerType())) \
                   .withColumn("sqft_above", col("sqft_above").cast(IntegerType())) \
                   .withColumn("sqft_basement", col("sqft_basement").cast(IntegerType())) \
                   .withColumn("yr_built", col("yr_built").cast(DateType())) \
                   .withColumn("yr_renovated", col("yr_renovated").cast(DateType())) \
                   .withColumn("zipcode", col("zipcode").cast(IntegerType())) \
                   .withColumn("lat", col("lat").cast(FloatType())) \
                   .withColumn("long", col("long").cast(FloatType())) \
                   .withColumn("sqft_living15", col("sqft_living15").cast(IntegerType())) \
                   .withColumn("sqft_lot15", col("sqft_lot15").cast(IntegerType()))

Appending to spark_job.py


## Data exploration

In [4]:
%%writefile -a spark_job.py
# Data exploration
columns_for_lr = ["floors", "waterfront","lat" ,"bedrooms" ,"sqft_basement" ,"view" ,"bathrooms","sqft_living15","sqft_above","grade","sqft_living", "price"]
print(house_df.printSchema())
print(house_df.select(columns_for_lr).show(5, True))

Appending to spark_job.py


In [5]:
%%writefile -a spark_job.py
# Data Describe
house_df[columns_for_lr].describe().toPandas().T

Appending to spark_job.py


In [6]:
%%writefile -a spark_job.py
# Correlation between features and target 
import six

for i in house_df[columns_for_lr].columns:
    if not( isinstance(house_df.select(i).take(1)[0][0], six.string_types)):
        print("Correlation to price for", i, house_df.stat.corr('price',i))

Appending to spark_job.py


### Preparing data from Linear Regression

In [7]:
%%writefile -a spark_job.py
from pyspark.ml.feature import VectorAssembler

# Preparing to Linear Regression
features = ["floors", "waterfront","lat" ,"bedrooms" ,"sqft_basement" ,"view" ,"bathrooms","sqft_living15","sqft_above","grade","sqft_living"] 
vectorAssembler = VectorAssembler(inputCols = features, outputCol = 'Features')
vhouse_df = vectorAssembler.transform(house_df)
vhouse_df = vhouse_df.select(['Features', 'price'])
print("\nAfter Vector Assembling")
vhouse_df.show(5)

Appending to spark_job.py


## Traint Test Split

In [8]:
%%writefile -a spark_job.py
# Training, Test Split
(trainingData, testData) = vhouse_df.randomSplit([0.7, 0.3])

print("Training data")
print(trainingData.show(5))

print("\nTest data")
print(testData.show(5))

Appending to spark_job.py


In [9]:
%%writefile -a spark_job.py
print("Total trainingData instance :", trainingData.count(),"\nTotal testData instance :", testData.count())

Appending to spark_job.py


# Linear Regression

In [10]:
%%writefile -a spark_job.py
from pyspark.ml.regression import LinearRegression

print("*** Linear Regression ***")
lr = LinearRegression(featuresCol = 'Features', labelCol='price', maxIter = 20, regParam = 0.3, elasticNetParam = 0.8)
lr_model = lr.fit(trainingData)
print("Coefficients: " + str(lr_model.coefficients))
print("\nIntercept: " + str(lr_model.intercept))

Appending to spark_job.py


### Model Summary

In [11]:
%%writefile -a spark_job.py
trainingSummary = lr_model.summary
print("Model Summary for Linear Regression")
print("-----------------------------------")
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Appending to spark_job.py


### Prediction

In [12]:
%%writefile -a spark_job.py
print("Prediction for Linear Regression Model")
print("--------------------------------------")
lr_predictions = lr_model.transform(testData)
lr_predictions.select("features","price","prediction").show(5)

from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator for test data
lr_evaluator = RegressionEvaluator(labelCol="price",
                                   predictionCol="prediction",
                                   metricName="r2")

# R2 score for test data
r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))


Appending to spark_job.py


# Gradient-boosted Tree Regression

In [13]:
%%writefile -a spark_job.py
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted Tree Regression
gbt = GBTRegressor(featuresCol = 'Features', labelCol = 'price', maxIter=10)
gbt_model = gbt.fit(trainingData)
gbt_predictions = gbt_model.transform(testData)
print("\n*** Gradient-boosted Tree Regression *** ")
gbt_predictions.select('prediction', 'price', 'features').show(5)

Appending to spark_job.py


In [14]:
%%writefile -a spark_job.py
# Gradient-boosted Tree Regression Evaluator
gbt_evaluator = RegressionEvaluator(
    labelCol="price", predictionCol="prediction", metricName="r2")
r2 = gbt_evaluator.evaluate(gbt_predictions)
print("\n*** Gradient-boosted Tree Regression - R Squared (R2) Score *** ")
print("---------------------------------------------------------------")
print("R Squared (R2) on test data = %g" % r2)

Appending to spark_job.py


In [15]:
%%writefile -a spark_job.py

import google.cloud.storage as gcs
bucket = gcs.Client().get_bucket(BUCKET)
for blob in bucket.list_blobs(prefix='sparkml/'):
    blob.delete()

Appending to spark_job.py


In [16]:
BUCKET= 'projects-de' 
print('Writing to {}'.format(BUCKET))
!python spark_job.py --bucket=$BUCKET

Writing to projects-de
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/07 16:17:26 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/02/07 16:17:26 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/02/07 16:17:26 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/02/07 16:17:26 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator
root                                                                            
 |-- id: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- price: float (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: float (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: float (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true

In [18]:
!gsutil cp spark_job.py gs://projects-de/sparkml/spark_job.py

Copying file://spark_job.py [Content-Type=text/x-python]...
/ [1 files][  5.9 KiB/  5.9 KiB]                                                
Operation completed over 1 objects/5.9 KiB.                                      


In [19]:
!gsutil ls gs://$BUCKET/**

gs://projects-de/sparkml/spark_job.py
