# Linear Regression using PySpark

In [1]:
# Create (or reuse) the SparkSession; later cells refer to it as `spark`.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

ModuleNotFoundError: No module named 'pyspark'

In [0]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [0]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    '/FileStore/tables/Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Validate the size of the data: prints a (row count, column count) tuple.
print((df.count(), len(df.columns)))

In [0]:
# Explore the data: print each column's name and inferred type.
df.printSchema()

In [0]:
# Render the DataFrame as a table.
# NOTE(review): display() is not defined or imported in this notebook; presumably
# the Databricks notebook built-in - confirm before running elsewhere.
display(df)

var_1,var_2,var_3,var_4,var_5,output
734,688,81,0.328,0.259,0.418
700,600,94,0.32,0.247,0.389
712,705,93,0.311,0.247,0.417
734,806,69,0.315,0.26,0.415
613,759,61,0.302,0.24,0.378
748,676,85,0.318,0.255,0.422
669,588,97,0.315,0.251,0.411
667,845,68,0.324,0.251,0.381
758,890,64,0.33,0.274,0.436
726,670,88,0.335,0.268,0.422


In [0]:
# Summary statistics for the dataset, shown without truncating cell values.
df.describe().show(n=5, truncate=False)

In [0]:
# Peek at the first three rows of the dataset.
df.head(3)

In [0]:
#import corr function from pyspark functions
from pyspark.sql.functions import corr

In [0]:
# Check the correlation between the input column 'var_1' and the target column 'output'.
df.select(corr('var_1','output')).show()

In [0]:
#import vectorassembler to create dense vectors
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names to decide which ones go into the input feature vector.
df.columns

In [0]:
# Build the assembler that packs the five input columns into a single
# vector column named 'features'.
vec_assmebler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol='features',
)

In [0]:
# Apply the assembler: the result keeps the original columns and adds the
# 'features' vector column.
features_df=vec_assmebler.transform(df)

In [0]:
# Confirm that the 'features' vector column is now part of the schema.
features_df.printSchema()

In [0]:
# Render the assembled DataFrame, including the new 'features' column, as a table.
# NOTE(review): display() is not defined or imported in this notebook; presumably
# the Databricks notebook built-in - confirm before running elsewhere.
display(features_df)

var_1,var_2,var_3,var_4,var_5,output,features
734,688,81,0.328,0.259,0.418,"Map(vectorType -> dense, length -> 5, values -> List(734.0, 688.0, 81.0, 0.328, 0.259))"
700,600,94,0.32,0.247,0.389,"Map(vectorType -> dense, length -> 5, values -> List(700.0, 600.0, 94.0, 0.32, 0.247))"
712,705,93,0.311,0.247,0.417,"Map(vectorType -> dense, length -> 5, values -> List(712.0, 705.0, 93.0, 0.311, 0.247))"
734,806,69,0.315,0.26,0.415,"Map(vectorType -> dense, length -> 5, values -> List(734.0, 806.0, 69.0, 0.315, 0.26))"
613,759,61,0.302,0.24,0.378,"Map(vectorType -> dense, length -> 5, values -> List(613.0, 759.0, 61.0, 0.302, 0.24))"
748,676,85,0.318,0.255,0.422,"Map(vectorType -> dense, length -> 5, values -> List(748.0, 676.0, 85.0, 0.318, 0.255))"
669,588,97,0.315,0.251,0.411,"Map(vectorType -> dense, length -> 5, values -> List(669.0, 588.0, 97.0, 0.315, 0.251))"
667,845,68,0.324,0.251,0.381,"Map(vectorType -> dense, length -> 5, values -> List(667.0, 845.0, 68.0, 0.324, 0.251))"
758,890,64,0.33,0.274,0.436,"Map(vectorType -> dense, length -> 5, values -> List(758.0, 890.0, 64.0, 0.33, 0.274))"
726,670,88,0.335,0.268,0.422,"Map(vectorType -> dense, length -> 5, values -> List(726.0, 670.0, 88.0, 0.335, 0.268))"


In [0]:
# Inspect the first five feature vectors without truncating them.
features_df.select('features').show(n=5, truncate=False)

In [0]:
# Keep only what the model needs: the assembled feature vector and the target.
model_df = features_df.select(['features', 'output'])

In [0]:
# Preview the first five modelling rows without truncating the vectors.
model_df.show(n=5, truncate=False)

In [0]:
# Size of the modelling DataFrame as a (row count, column count) tuple.
print((model_df.count(), len(model_df.columns)))

### Split Data - Train & Test sets

In [0]:
# Split the data into a 70/30 ratio for training and testing.
# A fixed seed keeps the split - and therefore every metric computed below -
# the same from run to run; without it each execution gives a different partition.
train_df,test_df=model_df.randomSplit([0.7,0.3],seed=42)

In [0]:
# Size of the training split as a (row count, column count) tuple.
print((train_df.count(), len(train_df.columns)))

In [0]:
# Size of the test split as a (row count, column count) tuple.
print((test_df.count(), len(test_df.columns)))

In [0]:
# Summary statistics for the training split.
train_df.describe().show()

## Build Linear Regression Model

In [0]:
# Configure the linear regression estimator: inputs come from the 'features'
# vector column (the estimator's default, spelled out here) and the target is 'output'.
lin_Reg = LinearRegression(
    featuresCol='features',
    labelCol='output',
)

In [0]:
# Fit the linear regression model on the training split; the fitted model's
# intercept and coefficients are inspected in the next cells.
lr_model=lin_Reg.fit(train_df)

In [0]:
# Intercept term of the fitted model.
lr_model.intercept

In [0]:
# Coefficients of the fitted model, one per entry of the 'features' vector.
print(lr_model.coefficients)

In [0]:
# Evaluate the fitted model on the training split. Despite the variable name,
# the result is used as an evaluation summary (meanSquaredError and r2 are read
# from it in the following cells), not as a table of predictions.
training_predictions=lr_model.evaluate(train_df)

In [0]:
# Mean squared error on the training split.
training_predictions.meanSquaredError

In [0]:
# Coefficient of determination (R^2) on the training split.
training_predictions.r2

In [0]:
# Evaluate the fitted model on the held-out test split; the returned summary
# provides the residuals and error metrics read in the cells below.
test_results=lr_model.evaluate(test_df)

In [0]:
# Show the first ten residuals from the test-set evaluation.
test_results.residuals.show(n=10)

In [0]:
# Coefficient of determination (R^2) of the model on the test split.
test_results.r2

In [0]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [0]:
# Mean squared error on the test split.
test_results.meanSquaredError