In [52]:
from __future__ import print_function
import findspark
import os
import sys
os.environ['SPARK_HOME'] = r'/Users/subham/Downloads/spark-3.0.0-bin-hadoop2.7'
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [53]:
if __name__=="__main__":
    spark = SparkSession\
    .builder\
    .appName("LinearRegWithSpark")\
    .getOrCreate()

In [54]:
dataset = spark.read.csv("/Users/subham/Desktop/PySpark/Admission_Predict_Ver1.1.csv",header=True)

In [55]:
newdata = dataset.drop('Serial No.')

In [56]:
newdata.show(5)

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|      314|        103|                2|  2|   3|8.21|       0|            0.65|
+---------+-----------+-----------------+---+----+----+--------+----------------+
only showing top 5 rows



In [57]:
newdata.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR : string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit : string (nullable = true)



__Try out some sql functions__

In [58]:
from pyspark.sql.functions import col
# make all the columns float or numeric
new_data = newdata.select(*(col(c).cast("float").alias(c) for c in newdata.columns))

In [59]:
new_data.show(3)

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|
+---------+-----------+-----------------+---+----+----+--------+----------------+
only showing top 3 rows



In [60]:
new_data.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR : float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit : float (nullable = true)



In [74]:
new_data.describe().show()

+-------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|         GRE Score|      TOEFL Score|University Rating|               SOP|              LOR |              CGPA|          Research|   Chance of Admit |
+-------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440006256103|              0.56|  0.721739999115467|
| stddev|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.9254495738978191|0.6048127839150954|0.4968840786090358|0.14114040501251301|
|    min|             290.0|      

In [75]:
new_data.describe('Chance of Admit ').show()

+-------+-------------------+
|summary|   Chance of Admit |
+-------+-------------------+
|  count|                500|
|   mean|  0.721739999115467|
| stddev|0.14114040501251301|
|    min|               0.34|
|    max|               0.97|
+-------+-------------------+



In [64]:
from pyspark.mllib.stat import Statistics
from pyspark import SparkContext
from pyspark import SparkConf
import pandas as pd

###  We need to convert dataframe intp a RDD to check for correlation

In [62]:
col_names = new_data.columns
features_set = new_data.rdd.map(lambda row: row[0:])

### Checking correaltion using pearson method



In [65]:
corr_mat=Statistics.corr(features_set, method="pearson")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names

In [66]:
corr_df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [67]:
corr_df.index

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [68]:
corr_df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
GRE Score,1.0,0.8272,0.635376,0.613498,0.524679,0.825878,0.563398,0.810351
TOEFL Score,0.8272,1.0,0.649799,0.64441,0.541563,0.810574,0.467012,0.792228
University Rating,0.635376,0.649799,1.0,0.728024,0.608651,0.705254,0.427047,0.690132
SOP,0.613498,0.64441,0.728024,1.0,0.663707,0.712154,0.408116,0.684137
LOR,0.524679,0.541563,0.608651,0.663707,1.0,0.637469,0.372526,0.645365
CGPA,0.825878,0.810574,0.705254,0.712154,0.637469,1.0,0.501311,0.882413
Research,0.563398,0.467012,0.427047,0.408116,0.372526,0.501311,1.0,0.545871
Chance of Admit,0.810351,0.792228,0.690132,0.684137,0.645365,0.882413,0.545871,1.0


## Dataframe to Heatmap

In [72]:
corr_df.style.background_gradient(cmap='Blues')

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
GRE Score,1.0,0.8272,0.635376,0.613498,0.524679,0.825878,0.563398,0.810351
TOEFL Score,0.8272,1.0,0.649799,0.64441,0.541563,0.810574,0.467012,0.792228
University Rating,0.635376,0.649799,1.0,0.728024,0.608651,0.705254,0.427047,0.690132
SOP,0.613498,0.64441,0.728024,1.0,0.663707,0.712154,0.408116,0.684137
LOR,0.524679,0.541563,0.608651,0.663707,1.0,0.637469,0.372526,0.645365
CGPA,0.825878,0.810574,0.705254,0.712154,0.637469,1.0,0.501311,0.882413
Research,0.563398,0.467012,0.427047,0.408116,0.372526,0.501311,1.0,0.545871
Chance of Admit,0.810351,0.792228,0.690132,0.684137,0.645365,0.882413,0.545871,1.0


## Checking for Null values

In [79]:
from pyspark.sql.functions import col,count,isnan,when

In [80]:
# Let's check for missing values
newdata.select([count(when(col(c).isNull(),c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



In [81]:
from pyspark.ml.feature import Imputer


In [82]:
'''Imputer(
    strategy='mean',
    missingValue=nan,
    inputCols=None,
    outputCols=None,
    inputCol=None,
    outputCol=None,
    relativeError=0.001,
)'''
# imputer = Imputer(inputCols = ["GRE Score","TOEFL Score","University Rating"],  # select only those cols which have nans
#                  outputCols = ["GRE Score","TOEFL Score","University Rating"])
# model = imputer.fit(new_data)
# imputed_data = model.transform(new_data)
# imputed_data.select([count(when(col(c).isNull(),c)).alias(c) for c in imputed_data.columns]).show()

"Imputer(\n    strategy='mean',\n    missingValue=nan,\n    inputCols=None,\n    outputCols=None,\n    inputCol=None,\n    outputCol=None,\n    relativeError=0.001,\n)"

In [83]:
features = new_data.drop('Chance of Admit')

In [84]:
# Let's assemble our features together using VectorAssembler 
# It makes lists row wise and creates feature vector
assembler = VectorAssembler(
                inputCols= features.columns,
                outputCol = 'features')

In [85]:
output = assembler.transform(new_data)

In [86]:
output.show(5)

+---------+-----------+-----------------+---+----+----+--------+----------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |            features|
+---------+-----------+-----------------+---+----+----+--------+----------------+--------------------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|            0.92|[337.0,118.0,4.0,...|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|            0.76|[324.0,107.0,4.0,...|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|            0.72|[316.0,104.0,3.0,...|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|             0.8|[322.0,110.0,3.0,...|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|            0.65|[314.0,103.0,2.0,...|
+---------+-----------+-----------------+---+----+----+--------+----------------+--------------------+
only showing top 5 rows



In [87]:
# Select features and target
output = output.select("features", "Chance of Admit ")

In [88]:
# Train - Test split
train_df , test_df = output.randomSplit([0.7,0.3])

In [89]:
train_df.show(5)

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,104.0,4.0,...|            0.45|
|[293.0,97.0,2.0,2...|            0.64|
|[294.0,93.0,1.0,1...|            0.46|
|[295.0,99.0,1.0,2...|            0.37|
|[295.0,99.0,2.0,2...|            0.57|
+--------------------+----------------+
only showing top 5 rows



In [90]:
test_df.show(5)

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,100.0,1.0,...|            0.47|
|[294.0,95.0,1.0,1...|            0.49|
|[295.0,93.0,1.0,2...|            0.46|
|[295.0,96.0,2.0,1...|            0.47|
|[295.0,101.0,2.0,...|            0.69|
+--------------------+----------------+
only showing top 5 rows



## MinMaxScaler

In [91]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(train_df)
train_df_sc = scalerModel.transform(train_df)
test_df_sc = scalerModel.transform(test_df)

## Linear Regression

In [36]:
lin_reg = LinearRegression(featuresCol = "scaledFeatures", labelCol = "Chance of Admit ")
linear_model = lin_reg.fit(train_df_sc)

In [37]:
trainSummary = linear_model.summary
print('RMSE:' ,trainSummary.rootMeanSquaredError)
print('R2:' ,trainSummary.r2)

RMSE: 3.755278646595512e-15
R2: 1.0


In [38]:
#Predictions
predictions= linear_model.transform(test_df_sc)
# predictions.select("prediction" , "Chance of Admit ","features").show()

In [39]:
predictions.show()

+--------------------+----------------+--------------------+-------------------+
|            features|Chance of Admit |      scaledFeatures|         prediction|
+--------------------+----------------+--------------------+-------------------+
|[290.0,100.0,1.0,...|            0.47|[0.0,0.2857142857...| 0.4699999988079123|
|[294.0,95.0,1.0,1...|            0.49|[0.08,0.107142857...| 0.4900000095367446|
|[295.0,99.0,2.0,2...|            0.57|[0.1,0.25,0.25,0....| 0.5699999928474463|
|[296.0,97.0,2.0,1...|            0.49|[0.12,0.178571428...|0.49000000953674233|
|[297.0,96.0,2.0,2...|            0.34|[0.14,0.142857142...| 0.3400000035762716|
|[297.0,98.0,2.0,2...|            0.59|[0.14,0.214285714...| 0.5899999737739592|
|[297.0,101.0,3.0,...|            0.57|[0.14,0.321428571...| 0.5699999928474428|
|[298.0,99.0,2.0,4...|            0.46|[0.16,0.25,0.25,0...| 0.4600000083446496|
|[299.0,96.0,2.0,1...|            0.54|[0.18,0.142857142...| 0.5400000214576712|
|[299.0,100.0,1.0,...|      

In [40]:
from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol = "prediction",\
                                    labelCol ="Chance of Admit ",metricName ="r2")
print("R Squared on Test Data : ", pred_evaluator.evaluate(predictions))

R Squared on Test Data :  1.0


## Random Forest Regressor

In [42]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
random_forest_reg = RandomForestRegressor(featuresCol="scaledFeatures",labelCol="Chance of Admit " )
rfr = random_forest_reg.fit(train_df_sc)

In [45]:
#Predictions
predictions= rfr.transform(test_df_sc)

In [46]:
predictions.show(5)

+--------------------+----------------+--------------------+-------------------+
|            features|Chance of Admit |      scaledFeatures|         prediction|
+--------------------+----------------+--------------------+-------------------+
|[290.0,100.0,1.0,...|            0.47|[0.0,0.2857142857...| 0.4728511474419026|
|[294.0,95.0,1.0,1...|            0.49|[0.08,0.107142857...|0.45792404719046254|
|[295.0,99.0,2.0,2...|            0.57|[0.1,0.25,0.25,0....| 0.5737547253582364|
|[296.0,97.0,2.0,1...|            0.49|[0.12,0.178571428...|0.47014890617938426|
|[297.0,96.0,2.0,2...|            0.34|[0.14,0.142857142...|0.43394697487372075|
+--------------------+----------------+--------------------+-------------------+
only showing top 5 rows



In [48]:
evaluator = RegressionEvaluator(
    labelCol="Chance of Admit ", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0244232


## Gradient Boosting Regressor

In [92]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol="scaledFeatures",labelCol="Chance of Admit ", maxIter=10)
model = gbt.fit(train_df_sc)

In [93]:
#Predictions
predictions= model.transform(test_df_sc)

In [94]:
predictions.show(5)

+--------------------+----------------+--------------------+-------------------+
|            features|Chance of Admit |      scaledFeatures|         prediction|
+--------------------+----------------+--------------------+-------------------+
|[290.0,100.0,1.0,...|            0.47|[0.0,0.2592592592...|0.45654670564795935|
|[294.0,95.0,1.0,1...|            0.49|[0.08,0.074074074...| 0.4594239499691532|
|[295.0,93.0,1.0,2...|            0.46|[0.1,0.0,0.0,0.25...| 0.4538213249374192|
|[295.0,96.0,2.0,1...|            0.47|[0.1,0.1111111111...|0.45768106070898185|
|[295.0,101.0,2.0,...|            0.69|[0.1,0.2962962962...| 0.6852992292604841|
+--------------------+----------------+--------------------+-------------------+
only showing top 5 rows



In [95]:
evaluator = RegressionEvaluator(
    labelCol="Chance of Admit ", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0117592


In [49]:
spark.stop()