In [1]:
import pyspark
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Admit_predict").getOrCreate()

In [3]:
data=spark.read.csv("Admission_Predict.csv",header=True,inferSchema=True)
data.show()

+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|         2|      324|        107|                4|4.0| 4.5|8.87|       1|            0.76|
|         3|      316|        104|                3|3.0| 3.5| 8.0|       1|            0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|         5|      314|        103|                2|2.0| 3.0|8.21|       0|            0.65|
|         6|      330|        115|                5|4.5| 3.0|9.34|       1|             0.9|
|         7|      321|        109|                3|3.0| 4.0| 8.2|       1|            0.75|
|         8|      308|        101|                2|3.0| 4.0| 7.9|    

In [4]:
data.printSchema()

root
 |-- Serial No.: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR : double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit : double (nullable = true)



In [5]:
data_selected = data.select(['GRE Score','TOEFL Score','University Rating','CGPA','Chance of Admit '])
data_selected.show()

+---------+-----------+-----------------+----+----------------+
|GRE Score|TOEFL Score|University Rating|CGPA|Chance of Admit |
+---------+-----------+-----------------+----+----------------+
|      337|        118|                4|9.65|            0.92|
|      324|        107|                4|8.87|            0.76|
|      316|        104|                3| 8.0|            0.72|
|      322|        110|                3|8.67|             0.8|
|      314|        103|                2|8.21|            0.65|
|      330|        115|                5|9.34|             0.9|
|      321|        109|                3| 8.2|            0.75|
|      308|        101|                2| 7.9|            0.68|
|      302|        102|                1| 8.0|             0.5|
|      323|        108|                3| 8.6|            0.45|
|      325|        106|                3| 8.4|            0.52|
|      327|        111|                4| 9.0|            0.84|
|      328|        112|                4

In [6]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['GRE Score','TOEFL Score','University Rating','CGPA'],outputCol='Independent_Features')
output = assembler.transform(data_selected)
output.show()

+---------+-----------+-----------------+----+----------------+--------------------+
|GRE Score|TOEFL Score|University Rating|CGPA|Chance of Admit |Independent_Features|
+---------+-----------+-----------------+----+----------------+--------------------+
|      337|        118|                4|9.65|            0.92|[337.0,118.0,4.0,...|
|      324|        107|                4|8.87|            0.76|[324.0,107.0,4.0,...|
|      316|        104|                3| 8.0|            0.72|[316.0,104.0,3.0,...|
|      322|        110|                3|8.67|             0.8|[322.0,110.0,3.0,...|
|      314|        103|                2|8.21|            0.65|[314.0,103.0,2.0,...|
|      330|        115|                5|9.34|             0.9|[330.0,115.0,5.0,...|
|      321|        109|                3| 8.2|            0.75|[321.0,109.0,3.0,...|
|      308|        101|                2| 7.9|            0.68|[308.0,101.0,2.0,...|
|      302|        102|                1| 8.0|             0.5|[3

In [7]:
output=output.select(['Independent_Features','Chance of Admit '])
output.show()

+--------------------+----------------+
|Independent_Features|Chance of Admit |
+--------------------+----------------+
|[337.0,118.0,4.0,...|            0.92|
|[324.0,107.0,4.0,...|            0.76|
|[316.0,104.0,3.0,...|            0.72|
|[322.0,110.0,3.0,...|             0.8|
|[314.0,103.0,2.0,...|            0.65|
|[330.0,115.0,5.0,...|             0.9|
|[321.0,109.0,3.0,...|            0.75|
|[308.0,101.0,2.0,...|            0.68|
|[302.0,102.0,1.0,...|             0.5|
|[323.0,108.0,3.0,...|            0.45|
|[325.0,106.0,3.0,...|            0.52|
|[327.0,111.0,4.0,...|            0.84|
|[328.0,112.0,4.0,...|            0.78|
|[307.0,109.0,3.0,...|            0.62|
|[311.0,104.0,3.0,...|            0.61|
|[314.0,105.0,3.0,...|            0.54|
|[317.0,107.0,3.0,...|            0.66|
|[319.0,106.0,3.0,...|            0.65|
|[318.0,110.0,3.0,...|            0.63|
|[303.0,102.0,3.0,...|            0.62|
+--------------------+----------------+
only showing top 20 rows



In [8]:
from pyspark.ml.regression import LinearRegression
train_data,test_data=output.randomSplit([0.70,0.30])
train_data.show()

+--------------------+----------------+
|Independent_Features|Chance of Admit |
+--------------------+----------------+
|[290.0,104.0,4.0,...|            0.45|
|[293.0,97.0,2.0,7.8]|            0.64|
|[294.0,93.0,1.0,7...|            0.46|
|[294.0,95.0,1.0,7...|            0.49|
|[295.0,96.0,2.0,7...|            0.47|
|[295.0,99.0,2.0,7...|            0.57|
|[295.0,101.0,2.0,...|            0.69|
|[296.0,97.0,2.0,7.8]|            0.49|
|[296.0,99.0,2.0,7...|            0.47|
|[296.0,99.0,2.0,8...|            0.61|
|[296.0,101.0,1.0,...|             0.6|
|[297.0,96.0,2.0,7...|            0.34|
|[297.0,100.0,1.0,...|            0.52|
|[298.0,98.0,2.0,8...|            0.34|
|[298.0,99.0,1.0,7...|            0.53|
|[298.0,101.0,2.0,...|            0.54|
|[298.0,105.0,3.0,...|            0.69|
|[299.0,96.0,2.0,7...|            0.54|
|[299.0,100.0,1.0,...|            0.59|
|[300.0,97.0,2.0,8.1]|            0.65|
+--------------------+----------------+
only showing top 20 rows



In [22]:
lr=LinearRegression(featuresCol='Independent_Features',labelCol='Chance of Admit ')
regressor=lr.fit(train_data)

In [23]:
regressor.coefficients

DenseVector([0.0019, 0.0034, 0.0073, 0.1364])

In [11]:
regressor.intercept

-1.4283104872419898

In [12]:
pred = regressor.evaluate(test_data)


<pyspark.ml.regression.LinearRegressionSummary at 0x254e99c3400>

In [14]:
pred.predictions.show()

+--------------------+----------------+-------------------+
|Independent_Features|Chance of Admit |         prediction|
+--------------------+----------------+-------------------+
|[290.0,100.0,1.0,...|            0.47|0.49725923322448895|
|[295.0,93.0,1.0,7.2]|            0.46| 0.4342031720928894|
|[296.0,95.0,2.0,7...|            0.44|0.49648113593196075|
|[297.0,96.0,2.0,7...|            0.43| 0.5494732033983953|
|[297.0,98.0,2.0,7...|            0.59| 0.5261737456816389|
|[298.0,92.0,1.0,7...|            0.51| 0.5293034656519182|
|[298.0,98.0,2.0,7.5]|            0.44| 0.5048906786823946|
|[298.0,99.0,2.0,7.6]|            0.46| 0.5218825709320354|
|[299.0,94.0,1.0,7...|            0.42|0.46426264014447516|
|[299.0,97.0,3.0,7...|            0.38| 0.5325596064056974|
|[299.0,100.0,2.0,...|            0.68| 0.5653274313624637|
|[299.0,100.0,3.0,...|            0.63| 0.5917186034711965|
|[299.0,102.0,3.0,...|            0.56| 0.6802578567476572|
|[299.0,106.0,2.0,...|            0.64| 

In [32]:
lr.write().save("/Users/Hipparagi/webapp") 

Py4JJavaError: An error occurred while calling o502.save.
: java.io.IOException: Path /Users/Hipparagi/webapp already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
