In [1]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=0621474738b365e751d1a9dd7ed8896c1d680009d4b2c4bb026fec8cecaca8fe
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [3]:
from __future__ import print_function
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
spark = SparkSession.builder.appName("Pyspark ML Algorithms").getOrCreate()

In [53]:
dataframe = spark.read.csv("/content/Admission_Prediction (1).csv",header=True)
type (dataframe)
dataframe.show()

+---------+-----------+-----------------+----+----+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating| SOP| LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+----+----+----+--------+---------------+
|   337.00|     118.00|                4|4.50|4.50|9.65|    1.00|           0.92|
|   324.00|     107.00|                4|4.00|4.50|8.87|    1.00|           0.76|
|     null|     104.00|                3|3.00|3.50|8.00|    1.00|           0.72|
|   322.00|     110.00|                3|3.50|2.50|8.67|    1.00|           0.80|
|   314.00|     103.00|                2|2.00|3.00|8.21|    0.00|           0.65|
|   330.00|     115.00|                5|4.50|3.00|9.34|    1.00|           0.90|
|   321.00|     109.00|             null|3.00|4.00|8.20|    1.00|           0.75|
|   308.00|     101.00|                2|3.00|4.00|7.90|    0.00|           0.68|
|   302.00|     102.00|                1|2.00|1.50|8.00|    0.00|           0.50|
|   323.00|     

In [56]:
from pyspark.sql.functions import col
new_dataframe = dataframe.select(*(col(c).cast("float").alias(c) for c in dataframe.columns))
new_dataframe.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR: float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit: float (nullable = true)



In [57]:
from pyspark.sql.functions import col, count, isnan, when
for c in new_dataframe.columns:
    print(c)

GRE Score
TOEFL Score
University Rating
SOP
LOR
CGPA
Research
Chance of Admit


In [58]:
#checking for null ir nan type values in our columns
new_dataframe.select([count(when(col(c).isNull(), c)).alias(c) for c in new_dataframe.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|       15|         10|               15|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [59]:
from pyspark.ml.feature import Imputer

In [60]:
imputer = Imputer(inputCols=["GRE Score", "TOEFL Score","University Rating"], 
                  outputCols=["GRE Score", "TOEFL Score","University Rating"])
model = imputer.fit(new_dataframe)

imputed_data = model.transform(new_dataframe)
imputed_data

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR: float, CGPA: float, Research: float, Chance of Admit: float]

In [61]:
#checking for null ir nan type values in our columns
imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in imputed_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [62]:
features = imputed_data.drop('Chance of Admit')
features

DataFrame[GRE Score: float, TOEFL Score: float, University Rating: float, SOP: float, LOR: float, CGPA: float, Research: float]

In [63]:
#let's assemble our features together using vectorAssembler
assembler = VectorAssembler( inputCols=features.columns,outputCol="features")
output = assembler.transform(imputed_data)

**Linear Regressor**


In [29]:
output = assembler.transform(imputed_data)

In [64]:
output= output.select("features", "Chance of Admit")

In [65]:
train_df,test_df = output.randomSplit([0.7, 0.3])

In [66]:
train_df,test_df = output.randomSplit([0.7, 0.3])
train_df.show()
test_df.show()

+--------------------+---------------+
|            features|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[290.0,104.0,4.0,...|           0.45|
|[293.0,97.0,2.0,2...|           0.64|
|[294.0,93.0,1.0,1...|           0.46|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,93.0,1.0,2...|           0.46|
|[295.0,96.0,2.0,1...|           0.47|
|[295.0,99.0,2.0,2...|           0.57|
|[296.0,95.0,2.0,3...|           0.44|
|[296.0,99.0,2.0,2...|           0.61|
|[296.0,101.0,1.0,...|            0.6|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,96.0,2.0,2...|           0.34|
|[297.0,98.0,2.0,2...|           0.59|
|[297.0,100.0,1.0,...|           0.52|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,98.0,2.0,1...|           0.44|
|[298.0,98.0,2.0,4...|           0.34|
|[298.0,99.0,1.0,1...|           0.53|
|[298.0,101.0,2.0,...|           0.54|
+--------------------+---------------+
only showing top 20 rows

+--------------------+---------------+

In [67]:
lin_reg = LinearRegression(featuresCol = 'features', labelCol='Chance of Admit')
linear_model = lin_reg.fit(train_df)


In [68]:
print("Coefficients: " + str(linear_model.coefficients))
print("Intercept: " + str(linear_model.intercept))

Coefficients: [0.0015564282988935548,0.0026856754290320545,0.0036382045911499384,0.006828546733391793,0.017664260612718686,0.12036397170073292,0.02605309485476319]
Intercept: -1.2022142433530705


In [69]:
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("r2: %f" % trainSummary.r2)

RMSE: 0.060226
r2: 0.823426


In [70]:

predictions = linear_model.transform(test_df)
predictions.select("prediction","Chance of Admit","features").show()

+-------------------+---------------+--------------------+
|         prediction|Chance of Admit|            features|
+-------------------+---------------+--------------------+
| 0.4777609477081064|           0.37|[295.0,99.0,1.0,2...|
| 0.5339224540320673|           0.69|[295.0,101.0,2.0,...|
| 0.5106858024666785|           0.49|[296.0,97.0,2.0,1...|
| 0.4902071013552878|           0.47|[296.0,99.0,2.0,3...|
| 0.5628328116972217|           0.54|[297.0,99.0,4.0,3...|
| 0.5097754592478676|           0.51|[298.0,92.0,1.0,2...|
| 0.4679432217484141|           0.45|[298.0,97.0,3.121...|
| 0.6117586025962025|           0.58|[298.0,100.0,3.0,...|
| 0.5341581917272091|           0.46|[298.0,107.187751...|
|0.42721389093184015|           0.42|[299.0,94.0,1.0,1...|
| 0.5364554955701675|           0.51|[299.0,100.0,2.0,...|
| 0.5023905292001778|           0.42|[299.0,100.0,3.0,...|
| 0.6069809056976478|           0.65|[300.0,97.0,2.0,3...|
| 0.4345766051832849|           0.36|[300.0,99.0,1.0,3..

In [71]:
from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Chance of Admit",metricName="r2")
print("R Squared (R2) on test data =", pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.8111305412835983


**Random forest regressor**


In [72]:
featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

In [76]:
featureIndexer = featureIndexer.transform(output)


In [84]:
new_indexed_data = featureIndexer.select("indexedFeatures", "Chance of Admit")

In [78]:
training, test = new_indexed_data.randomSplit([0.7, 0.3])
training.show()

+--------------------+---------------+
|     indexedFeatures|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[290.0,104.0,4.0,...|           0.45|
|[293.0,97.0,2.0,2...|           0.64|
|[294.0,93.0,1.0,1...|           0.46|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,93.0,1.0,2...|           0.46|
|[295.0,96.0,2.0,1...|           0.47|
|[295.0,99.0,1.0,2...|           0.37|
|[295.0,99.0,2.0,2...|           0.57|
|[295.0,101.0,2.0,...|           0.69|
|[296.0,95.0,2.0,3...|           0.44|
|[296.0,97.0,2.0,1...|           0.49|
|[296.0,99.0,2.0,2...|           0.61|
|[296.0,99.0,2.0,3...|           0.47|
|[296.0,101.0,1.0,...|            0.6|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,98.0,2.0,2...|           0.59|
|[297.0,99.0,4.0,3...|           0.54|
|[297.0,100.0,1.0,...|           0.52|
|[298.0,92.0,1.0,2...|           0.51|
+--------------------+---------------+
only showing top 20 rows



In [79]:
test.show()

+--------------------+---------------+
|     indexedFeatures|Chance of Admit|
+--------------------+---------------+
|[297.0,96.0,2.0,2...|           0.34|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,98.0,2.0,4...|           0.34|
|[298.0,99.0,1.0,1...|           0.53|
|[300.0,97.0,2.0,3...|           0.65|
|[300.0,100.0,3.12...|           0.62|
|[300.0,102.0,3.0,...|           0.63|
|[300.0,105.0,1.0,...|           0.58|
|[301.0,97.0,2.0,3...|           0.44|
|[301.0,98.0,1.0,2...|           0.67|
|[301.0,100.0,3.0,...|           0.67|
|[302.0,99.0,2.0,1...|           0.56|
|[303.0,102.0,3.0,...|           0.62|
|[304.0,100.0,4.0,...|           0.42|
|[304.0,101.0,2.0,...|           0.38|
|[304.0,105.0,2.0,...|           0.54|
|[305.0,96.0,4.0,3...|           0.54|
|[305.0,102.0,2.0,...|           0.59|
|[305.0,104.0,2.0,...|           0.53|
|[305.0,106.0,2.0,...|           0.64|
+--------------------+---------------+
only showing top 20 rows



In [85]:
random_forest_reg = RandomForestRegressor(featuresCol="indexedFeatures",labelCol="Chance of Admit" )

In [89]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['features'], outputCol='indexedFeatures')
train_df = assembler.transform(train_df)
model = random_forest_reg.fit(train_df)


In [90]:
predictions = model.transform(test)

In [91]:
predictions.show()

+--------------------+---------------+------------------+
|     indexedFeatures|Chance of Admit|        prediction|
+--------------------+---------------+------------------+
|[297.0,96.0,2.0,2...|           0.34|0.4332513699299728|
|[297.0,101.0,3.0,...|           0.57|0.5616086643719378|
|[298.0,98.0,2.0,4...|           0.34|0.4637240589996696|
|[298.0,99.0,1.0,1...|           0.53|0.5223848062682424|
|[300.0,97.0,2.0,3...|           0.65|0.6045113355648417|
|[300.0,100.0,3.12...|           0.62|0.6487530882331868|
|[300.0,102.0,3.0,...|           0.63| 0.642103054930381|
|[300.0,105.0,1.0,...|           0.58| 0.569069962809628|
|[301.0,97.0,2.0,3...|           0.44|0.5802533747593472|
|[301.0,98.0,1.0,2...|           0.67|0.6054317311525524|
|[301.0,100.0,3.0,...|           0.67|0.6082205812535834|
|[302.0,99.0,2.0,1...|           0.56|0.5510958374935878|
|[303.0,102.0,3.0,...|           0.62| 0.655119967362249|
|[304.0,100.0,4.0,...|           0.42|0.5149999431663272|
|[304.0,101.0,

In [92]:
evaluator = RegressionEvaluator(labelCol="Chance of Admit", predictionCol="prediction", metricName="rmse")
print ("Root Mean Squared Error (RMSE) on test data = ",evaluator.evaluate(predictions))

Root Mean Squared Error (RMSE) on test data =  0.05782598126387316


In [93]:
evaluator = RegressionEvaluator(labelCol="Chance of Admit", predictionCol="prediction", metricName="r2")
print("R Squared (R2) on test data =", evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.8458305439898357
