In [None]:
!pip install pyspark

In [None]:
# --- Spark setup ---
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

# Local-mode configuration for the application named 'abc'.
conf=SparkConf().setAppName('abc').setMaster('local')
# getOrCreate() reuses an already-running context, so executing this cell a
# second time does not fail with "Cannot run multiple SparkContexts at once".
sc=SparkContext.getOrCreate(conf=conf)
sc.setLogLevel('ERROR')
spark=SparkSession.builder.appName('abc').getOrCreate()

# --- Vector representations ---
# NumPy dense vector
import numpy as np
v1=np.array([1,2,3,4,5])
print(v1)
# Plain Python list
v2=[1,2,3,4,5,6]
print(v2)
# Sparse & dense Spark vectors
from pyspark.mllib.linalg import Vectors
v3=Vectors.dense([3,4,5,6])
print(v3)
# Sparse vector of size 3 with values 1.0 and 3.0 at indices 0 and 2.
v4 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
print(v4)

In [None]:
# Mount Google Drive so the CSV stored under MyDrive can be read by Spark.
from google.colab import drive
drive.mount('/content/drive')

# Read the CSV, treating the first line as a header and inferring column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/ColabInputs/data.csv")
)
# Preview five rows without truncating cell contents.
df.show(5, truncate=False)

In [None]:
# Step 3: Exploratory Data Analysis

In [None]:
# Number of rows in the loaded DataFrame.
df.count()

In [None]:
# Number of distinct values found in the 'Grades' column.
df.select('Grades').distinct().count()

In [None]:
# Print the DataFrame's schema (columns and their types).
df.printSchema()

In [None]:
# Display the DataFrame's leading rows using show()'s default settings.
df.show()

In [None]:
# Display the summary table produced by describe() for the DataFrame's columns.
df.describe().show()

In [None]:
# Combine every column except the last into a single "features" vector column.
# NOTE(review): this presumes the final column is the label ("Grades") — confirm
# against the CSV layout.
from pyspark.ml.feature import VectorAssembler

feature_cols = df.columns[:-1]
vect_assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Apply the assembler to df to obtain the DataFrame carrying the "features" column.
data_w_features = vect_assembler.transform(df)

In [None]:
# Keep only the assembled feature vector and the 'Grades' label for modelling.
finalized_data = data_w_features.select(["features", "Grades"])
finalized_data.show()

In [None]:
# Train/test split with 70% / 30% weights.
# A fixed seed keeps the split, and therefore the fitted model and its
# metrics, repeatable across runs; without it each run draws a new random split.
train_dataset, test_dataset = finalized_data.randomSplit([0.7, 0.3], seed=42)
# Row counts: full data, training split, test split.
print(df.count())
print(train_dataset.count())
print(test_dataset.count())

In [None]:
# Linear regression estimator: inputs come from "features", the target is "Grades".
from pyspark.ml.regression import LinearRegression

LinReg = LinearRegression(
    labelCol="Grades",
    featuresCol="features",
)

In [None]:
# Model training and testing
# Fit the linear regression estimator on the training split.
model = LinReg.fit(train_dataset)
# Evaluate the fitted model on the test split; the result's `predictions`
# DataFrame is displayed and scored in the following cells.
pred = model.evaluate(test_dataset)

In [None]:
# Display the predictions DataFrame from the test-split evaluation.
pred.predictions.show()

In [None]:
# Read the fitted parameters off the model, then report them.
coefficient = model.coefficients
intercept = model.intercept

# Explicit one-element tuples make it unambiguous that each value is a single
# formatting argument.
print("The coefficient of the model is : %a" % (coefficient,))
print("The Intercept of the model is : %f" % (intercept,))

In [None]:
# Score the test-split predictions with RMSE, MSE, MAE and R-squared.
from pyspark.ml.evaluation import RegressionEvaluator

evaluation = RegressionEvaluator(predictionCol="prediction", labelCol="Grades")

# Each entry pairs an evaluator metric name with the format string used to report it.
metric_formats = (
    ("rmse", "RMSE: %.3f"),   # root mean square error
    ("mse", "MSE: %.3f"),     # mean square error
    ("mae", "MAE: %.3f"),     # mean absolute error
    ("r2", "r2: %.3f"),       # coefficient of determination
)

scores = {}
for metric, fmt in metric_formats:
    # The metric is selected through the param map passed to evaluate().
    scores[metric] = evaluation.evaluate(pred.predictions, {evaluation.metricName: metric})
    print(fmt % scores[metric])

# Expose each metric under its own name.
rmse = scores["rmse"]
mse = scores["mse"]
mae = scores["mae"]
r2 = scores["r2"]