## PySpark Linear Regression, Random Forest Regression and Decison Regression

In [8]:
## Importing required libraries
import pyspark
import findspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression,DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [10]:
spark = SparkSession.builder.appName("Pyspark_ML_Algorithms").getOrCreate()

In [11]:
dataframe = spark.read.csv("Admission_Prediction.csv",header=True)

In [12]:
type (dataframe)

pyspark.sql.dataframe.DataFrame

In [13]:
dataframe.show(5)

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|     null|        103|                2|  2|   3|8.21|       0|            0.65|
+---------+-----------+-----------------+---+----+----+--------+----------------+
only showing top 5 rows



In [14]:
dataframe.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR : string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit : string (nullable = true)



In [15]:
from pyspark.sql.functions import col
new_dataframe = dataframe.select(*(col(c).cast("float").alias(c) for c in dataframe.columns))

In [16]:
new_dataframe.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR : float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit : float (nullable = true)



In [17]:
from pyspark.sql.functions import col, count, isnan, when

In [18]:
for c in new_dataframe.columns:
    print(c)

GRE Score
TOEFL Score
University Rating
SOP
LOR 
CGPA
Research
Chance of Admit 


In [19]:
#checking for null ir nan type values in our columns
new_dataframe.select([count(when(col(c).isNull(), c)).alias(c) for c in new_dataframe.columns]).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        5|          3|                1|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



In [20]:
from pyspark.ml.feature import Imputer

In [22]:
imputer = Imputer(inputCols=["GRE Score", "TOEFL Score","University Rating"], 
                  outputCols=["GRE Score", "TOEFL Score","University Rating"])
model = imputer.fit(new_dataframe)

imputed_data = model.transform(new_dataframe)

In [23]:
imputed_data

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR : float, CGPA: float, Research: float, Chance of Admit : float]

In [24]:
#checking for null ir nan type values in our columns
imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in imputed_data.columns]).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



In [27]:
features = imputed_data.drop('Chance of Admit ')

In [28]:
features

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR : float, CGPA: float, Research: float]

In [29]:
#let's assemble our features together using vectorAssembler
assembler = VectorAssembler( inputCols=features.columns,outputCol="features")

In [30]:
output = assembler.transform(imputed_data)

## Linear Regressor

In [32]:
output= output.select("features", "Chance of Admit ")

In [33]:
output = assembler.transform(imputed_data)

In [34]:
output= output.select("features", "Chance of Admit ")

In [35]:
## spilt the data
train_df,test_df = output.randomSplit([0.7, 0.3])

In [37]:
# show the data after splitted
train_df.show(5)
test_df.show(5)

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,104.0,4.0,...|            0.45|
|[293.0,97.0,2.0,2...|            0.64|
|[294.0,93.0,1.0,1...|            0.46|
|[294.0,95.0,1.0,1...|            0.49|
|[295.0,93.0,1.0,2...|            0.46|
+--------------------+----------------+
only showing top 5 rows

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,100.0,1.0,...|            0.47|
|[295.0,101.0,2.0,...|            0.69|
|[295.0,107.207244...|            0.37|
|[296.0,97.0,2.0,1...|            0.49|
|[296.0,99.0,2.0,3...|            0.47|
+--------------------+----------------+
only showing top 5 rows



In [39]:
lin_reg = LinearRegression(featuresCol = 'features', labelCol='Chance of Admit ')
linear_model = lin_reg.fit(train_df)

In [40]:
print("Coefficients: " + str(linear_model.coefficients))
print("Intercept: " + str(linear_model.intercept))

Coefficients: [0.0021768745492511373,0.002714372367972509,0.0057548874077161085,0.004674616188140981,0.019685745474739436,0.11114848221913484,0.020644294613579795]
Intercept: -1.327026781210642


In [46]:
trainSummary = linear_model.summary
print("RMSE: %f" %  trainSummary.rootMeanSquaredError)
print("r2: %f" % trainSummary.r2)

RMSE: 0.062024
r2: 0.809393


In [49]:
# prediction

predictions = linear_model.transform(test_df)
predictions.select("prediction","Chance of Admit ","features").show(5)

+-------------------+----------------+--------------------+
|         prediction|Chance of Admit |            features|
+-------------------+----------------+--------------------+
|0.46812489672553736|            0.47|[290.0,100.0,1.0,...|
| 0.5254977113012538|            0.69|[295.0,101.0,2.0,...|
| 0.5399721896974903|            0.37|[295.0,107.207244...|
|  0.505473577617293|            0.49|[296.0,97.0,2.0,1...|
|0.48964565621359757|            0.47|[296.0,99.0,2.0,3...|
+-------------------+----------------+--------------------+
only showing top 5 rows



### Linear Regression Model Evaluation

In [50]:
from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Chance of Admit ",metricName="r2")
print("R Squared (R2) on test data =", pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.8396280668784868


# Random Forest Regressor

In [68]:
new_indexed_data = featureIndexer.select("features", "Chance of Admit ")

In [69]:
training, test = new_indexed_data.randomSplit([0.7, 0.3])

In [70]:
training.show(10)

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,100.0,1.0,...|            0.47|
|[290.0,104.0,4.0,...|            0.45|
|[293.0,97.0,2.0,2...|            0.64|
|[294.0,93.0,1.0,1...|            0.46|
|[294.0,95.0,1.0,1...|            0.49|
|[295.0,96.0,2.0,1...|            0.47|
|[295.0,101.0,2.0,...|            0.69|
|[295.0,107.207244...|            0.37|
|[296.0,95.0,2.0,3...|            0.44|
|[296.0,97.0,2.0,1...|            0.49|
+--------------------+----------------+
only showing top 10 rows



In [71]:
test.show(10,truncate=False)

+-----------------------------------------------+----------------+
|features                                       |Chance of Admit |
+-----------------------------------------------+----------------+
|[295.0,93.0,1.0,2.0,2.0,7.199999809265137,0.0] |0.46            |
|[295.0,99.0,2.0,2.5,3.0,7.650000095367432,0.0] |0.57            |
|[296.0,101.0,1.0,2.5,3.0,7.679999828338623,0.0]|0.6             |
|[297.0,98.0,2.0,2.5,3.0,7.670000076293945,0.0] |0.59            |
|[298.0,92.0,1.0,2.0,2.0,7.880000114440918,0.0] |0.51            |
|[298.0,98.0,2.0,1.5,2.5,7.5,1.0]               |0.44            |
|[298.0,100.0,3.0,2.5,4.0,7.949999809265137,1.0]|0.58            |
|[299.0,100.0,1.0,1.5,2.0,7.889999866485596,0.0]|0.59            |
|[299.0,100.0,2.0,3.0,3.5,7.880000114440918,0.0]|0.68            |
|[299.0,102.0,3.0,4.0,3.5,8.619999885559082,0.0]|0.56            |
+-----------------------------------------------+----------------+
only showing top 10 rows



In [72]:
random_forest_reg = RandomForestRegressor(featuresCol="features",labelCol="Chance of Admit " )

In [73]:
# Train model.  This also runs the indexer.
model = random_forest_reg.fit(train_df)

In [74]:
# Make predictions.
predictions = model.transform(test)

In [76]:
predictions.show(10)

+--------------------+----------------+-------------------+
|            features|Chance of Admit |         prediction|
+--------------------+----------------+-------------------+
|[295.0,93.0,1.0,2...|            0.46| 0.4834797187946397|
|[295.0,99.0,2.0,2...|            0.57| 0.5168903199651191|
|[296.0,101.0,1.0,...|             0.6| 0.5333596962431614|
|[297.0,98.0,2.0,2...|            0.59| 0.5018491108603806|
|[298.0,92.0,1.0,2...|            0.51|0.49783305617881696|
|[298.0,98.0,2.0,1...|            0.44| 0.4859081707896702|
|[298.0,100.0,3.0,...|            0.58| 0.5803707009639663|
|[299.0,100.0,1.0,...|            0.59| 0.5265990165871485|
|[299.0,100.0,2.0,...|            0.68| 0.5691238151141786|
|[299.0,102.0,3.0,...|            0.56| 0.6436917222118692|
+--------------------+----------------+-------------------+
only showing top 10 rows



### Randomforest Regression Model Evaluation

In [77]:
evaluator = RegressionEvaluator(labelCol="Chance of Admit ", predictionCol="prediction", metricName="rmse")
print ("Root Mean Squared Error (RMSE) on test data = ",evaluator.evaluate(predictions))

Root Mean Squared Error (RMSE) on test data =  0.05256958903202675


In [79]:
evaluator = RegressionEvaluator(labelCol="Chance of Admit ", predictionCol="prediction", metricName="r2")
print("R Squared (R2) on test data =", evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.8404575841219026


# DecissionTree Refression

In [82]:
decission_tree = DecisionTreeRegressor(featuresCol="features",labelCol="Chance of Admit " )

In [84]:
# Train model. 
tree_model = decission_tree.fit(train_df)

In [86]:
# Make predictions.
tree_predictions = tree_model.transform(test)

In [87]:
tree_predictions.show(10)

+--------------------+----------------+-------------------+
|            features|Chance of Admit |         prediction|
+--------------------+----------------+-------------------+
|[295.0,93.0,1.0,2...|            0.46|0.44733333587646484|
|[295.0,99.0,2.0,2...|            0.57| 0.5572727241299369|
|[296.0,101.0,1.0,...|             0.6| 0.5572727241299369|
|[297.0,98.0,2.0,2...|            0.59| 0.5572727241299369|
|[298.0,92.0,1.0,2...|            0.51| 0.5099999862057822|
|[298.0,98.0,2.0,1...|            0.44|0.44733333587646484|
|[298.0,100.0,3.0,...|            0.58| 0.5572727241299369|
|[299.0,100.0,1.0,...|            0.59| 0.5099999862057822|
|[299.0,100.0,2.0,...|            0.68| 0.5572727241299369|
|[299.0,102.0,3.0,...|            0.56| 0.6330769199591416|
+--------------------+----------------+-------------------+
only showing top 10 rows



### DecissionTree Regression Model Evaluation

In [88]:
evaluator = RegressionEvaluator(labelCol="Chance of Admit ", predictionCol="prediction", metricName="rmse")
print ("Root Mean Squared Error (RMSE) on test data = ",evaluator.evaluate(tree_predictions))

Root Mean Squared Error (RMSE) on test data =  0.06245197077916565


In [89]:
evaluator = RegressionEvaluator(labelCol="Chance of Admit ", predictionCol="prediction", metricName="r2")
print("R Squared (R2) on test data =", evaluator.evaluate(tree_predictions))

R Squared (R2) on test data = 0.7748358233347991
