In [1]:
# findspark makes a local Spark install importable; per its documentation,
# init() is called before pyspark is imported.
import findspark

SPARK_HOME = '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'
findspark.init(SPARK_HOME)

import pyspark
from pyspark.sql import SparkSession

APP_NAME = 'Build_the_model_linear_regression'
HOURLY_CSV_PATH = "../Bike-Sharing-Dataset-hour_new.csv"

# Obtain the session for this notebook (getOrCreate returns an existing one
# if it is already running).
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)

# Read the CSV using its first row as the header and letting Spark infer
# each column's type.
df = spark.read.csv(HOURLY_CSV_PATH, inferSchema=True, header=True)

In [2]:
import pandas as pd

# Columns kept from the raw frame, in the order they are selected.
selected_columns = [
    'reinstant', 'mnth', 'hr', 'weekday', 'holiday', 'workingday',
    'hum', 'temp', 'atemp', 'windspeed', 'weathersit', 'cnt',
]
new_df = df.select(*selected_columns)

# Summary statistics pulled into pandas and flipped so that every selected
# column becomes one row of the displayed table.
summary_stats = new_df.describe().toPandas()
summary_stats.T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
reinstant,17544,8772.5,5064.66089684196,1,17544
mnth,17544,6.519835841313269,3.4496492053764083,1,12
hr,17544,11.5,6.922383841604305,0,23
weekday,17544,2.997264021887825,2.0034722846903734,0,6
holiday,17544,0.028499772001823985,0.1664004605297057,0,1
workingday,17544,0.6842225262197903,0.46483801154182497,0,1
hum,17544,0.6272285681714603,0.19202039360213635,0.0,1.0
temp,17544,0.49698634860921004,0.19164844257608665,0.02,1.0
atemp,17544,0.4757743958048299,0.17104014126878642,0.0,1.0


In [3]:
from pyspark.ml.feature import VectorAssembler

# Every selected column except `cnt` is packed into the `features` vector.
feature_columns = [
    'reinstant', 'mnth', 'hr', 'weekday', 'holiday', 'workingday',
    'hum', 'temp', 'atemp', 'windspeed', 'weathersit',
]
vector_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)

# transform() appends the assembled `features` column to the existing ones.
vector_output = vector_assembler.transform(new_df)

# Show the resulting schema, then display the first row as the cell output.
vector_output.printSchema()
vector_output.head(1)

root
 |-- reinstant: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- hum: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- cnt: integer (nullable = true)
 |-- features: vector (nullable = true)



[Row(reinstant=1, mnth=1, hr=0, weekday=6, holiday=0, workingday=0, hum=0.81, temp=0.24, atemp=0.2879, windspeed=0.0, weathersit=1, cnt=16, features=DenseVector([1.0, 1.0, 0.0, 6.0, 0.0, 0.0, 0.81, 0.24, 0.2879, 0.0, 1.0]))]

In [4]:
# Narrow the frame to the two columns the regression needs.
vector_output = vector_output.select('features', 'cnt')

# Print the first row as raw Row objects ...
first_row = vector_output.head(1)
print(first_row)

# ... and display the first ten rows as a pandas table.
preview_rows = vector_output.take(10)
pd.DataFrame(preview_rows, columns=vector_output.columns)

[Row(features=DenseVector([1.0, 1.0, 0.0, 6.0, 0.0, 0.0, 0.81, 0.24, 0.2879, 0.0, 1.0]), cnt=16)]


Unnamed: 0,features,cnt
0,"[1.0, 1.0, 0.0, 6.0, 0.0, 0.0, 0.81, 0.24, 0.2...",16
1,"[2.0, 1.0, 1.0, 6.0, 0.0, 0.0, 0.8, 0.22, 0.27...",40
2,"[3.0, 1.0, 2.0, 6.0, 0.0, 0.0, 0.8, 0.22, 0.27...",32
3,"[4.0, 1.0, 3.0, 6.0, 0.0, 0.0, 0.75, 0.24, 0.2...",13
4,"[5.0, 1.0, 4.0, 6.0, 0.0, 0.0, 0.75, 0.24, 0.2...",1
5,"[6.0, 1.0, 5.0, 6.0, 0.0, 0.0, 0.75, 0.24, 0.2...",1
6,"[7.0, 1.0, 6.0, 6.0, 0.0, 0.0, 0.8, 0.22, 0.27...",2
7,"[8.0, 1.0, 7.0, 6.0, 0.0, 0.0, 0.86, 0.2, 0.25...",3
8,"[9.0, 1.0, 8.0, 6.0, 0.0, 0.0, 0.75, 0.24, 0.2...",8
9,"[10.0, 1.0, 9.0, 6.0, 0.0, 0.0, 0.76, 0.32, 0....",14


In [5]:
# `new_data` is another name for the (features, cnt) frame; no copy is made.
new_data = vector_output

# NOTE: despite the `_pd` suffix this is still a Spark DataFrame; it is only
# converted to pandas for display on the last line.
new_data_pd = new_data.describe()

# Collect the first five summary rows and render them as a pandas table.
summary_rows = new_data_pd.take(5)
pd.DataFrame(summary_rows, columns=new_data_pd.columns)

Unnamed: 0,summary,cnt
0,count,17544.0
1,mean,189.4493274053808
2,stddev,180.53262210365807
3,min,1.0
4,max,977.0


In [6]:
from pyspark.ml.regression import LinearRegression

# Fit a linear regression of `cnt` on the assembled `features` vector.
lr = LinearRegression(featuresCol='features', labelCol='cnt')
lr_model = lr.fit(new_data)

# Report one coefficient per feature.
# The names come from the assembler that built the `features` vector, so the
# labels follow the vector's actual layout instead of relying on `new_df`
# happening to list its columns in the same order. The values are read
# directly from the coefficient vector rather than by parsing its string
# representation.
print("Coefficients: ")
feature_names = vector_assembler.getInputCols()
coefficient_values = lr_model.coefficients.toArray()
for feature_name, coefficient in zip(feature_names, coefficient_values):
    print("\t(cnt/" + feature_name + "): " + str(coefficient))

print("Intercept: " + str(lr_model.intercept) + "\n")

# `summary` describes the fit on the data passed to fit() above, so RMSE and
# R2 here are in-sample figures, not an estimate on held-out data.
data_summary = lr_model.summary
print("RMSE: " + str(data_summary.rootMeanSquaredError))
print("R2: " + str(data_summary.r2))
print("\n")

# Display the first ten residuals from the training summary.
data_resi = data_summary.residuals
pd.DataFrame(data_resi.take(10), columns=data_resi.columns)

Coefficients: 
	(cnt/reinstant): 0.008985296703428802
	(cnt/mnth): -1.7762927007440814
	(cnt/hr): 7.466474062818152
	(cnt/weekday): 1.6768272906942836
	(cnt/holiday): -24.673992462766645
	(cnt/workingday): 4.282528014616356
	(cnt/hum): -198.99433020602825
	(cnt/temp): 81.50636821217496
	(cnt/atemp): 251.6351680011257
	(cnt/windspeed): 35.45118412223945
	(cnt/weathersit): -2.6031848260582677
Intercept: -9.339395418210728

RMSE: 141.95819511751597
R2: 0.3816500737837255




Unnamed: 0,residuals
0,88.827038
1,108.816617
2,93.341158
3,51.461
4,31.985541
5,31.561386
6,33.439321
7,44.333339
8,9.083703
9,-12.171413
