In [86]:
import pandas as pd

from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [87]:
sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('/input/day.csv')
data.take(1)

[Row(instant=1, dteday='2011-01-01', season=1, yr=0, mnth=1, holiday=0, weekday=6, workingday=0, weathersit=2, temp=0.344167, atemp=0.363625, hum=0.805833, windspeed=0.160446, casual=331, registered=654, cnt=985)]

In [88]:
data.limit(5).toPandas()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [89]:
data.printSchema()

root
 |-- instant: integer (nullable = true)
 |-- dteday: string (nullable = true)
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- cnt: integer (nullable = true)



In [90]:
data.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
instant,731,366.0,211.16581162678773,1,731
dteday,731,,,2011-01-01,2012-12-31
season,731,2.496580027359781,1.1108070927726252,1,4
yr,731,0.5006839945280438,0.5003418803818265,0,1
mnth,731,6.519835841313269,3.451912787256252,1,12
holiday,731,0.028727770177838577,0.16715474262247393,0,1
weekday,731,2.997264021887825,2.004786917944481,0,6
workingday,731,0.6839945280437757,0.4652333866777039,0,1
weathersit,731,1.3953488372093024,0.5448943419593665,1,3


In [91]:
data = data.select(
data.season,
data.yr,
data.mnth,
data.holiday,
data.weekday,
data.workingday,
data.weathersit,
data.temp,
data.atemp,
data.hum,
data.windspeed,
data.cnt.alias('label'))

In [92]:
data.limit(5).toPandas()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,label
0,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600


In [93]:
vector = VectorAssembler(inputCols=data.columns, outputCol='features')
vector = vector.transform(data)
vector = vector.select(['features','label'])
vector.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,0.0,1.0,0.0,...|  985|
|[1.0,0.0,1.0,0.0,...|  801|
|[1.0,0.0,1.0,0.0,...| 1349|
|[1.0,0.0,1.0,0.0,...| 1562|
|[1.0,0.0,1.0,0.0,...| 1600|
+--------------------+-----+
only showing top 5 rows



In [94]:
train, test = vector.randomSplit([0.7, 0.3])

In [95]:
model = LinearRegression()
model = model.fit(train)

In [97]:
prediction = model.transform(test)
prediction.select("prediction","label", "features").show(5)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
|  986.000000000001|  986|[1.0,0.0,1.0,0.0,...|
|1321.0000000000005| 1321|[1.0,0.0,1.0,0.0,...|
|1263.0000000000018| 1263|[1.0,0.0,1.0,0.0,...|
| 1985.000000000004| 1985|[1.0,0.0,1.0,0.0,...|
|1000.0000000000066| 1000|[1.0,0.0,1.0,1.0,...|
+------------------+-----+--------------------+
only showing top 5 rows



In [98]:
evaluator = RegressionEvaluator()
print("R2 value is : ",evaluator.evaluate(prediction, {evaluator.metricName:'r2'}))
print("RMSE score value is : ",evaluator.evaluate(prediction, {evaluator.metricName:'rmse'}))

R2 value is :  1.0
RMSE score value is :  5.677284802656805e-12
