# TASK 1 : Install Dependencies & Run a SparkSession


In [1]:
#install pyspark
! pip install pyspark



In [3]:
# Start (or reuse) the Spark session used by every later cell
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Admission_Prediction")
    .getOrCreate()
)

In [18]:
from pyspark.sql.functions import col,when,count

# TASK 2 : Clone & Explore dataset

In [4]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), 5.60 KiB | 5.60 MiB/s, done.


In [5]:
# List the cloned repository directory to confirm the clone succeeded;
# the CSV file read by a later cell should appear in the listing.
! ls admission_dataset

Admission_Predict_Ver1.1.csv


In [6]:
# Load the admissions CSV into a Spark DataFrame.
# The path is resolved from the notebook's current working directory (where the
# `git clone` cell created `admission_dataset`) instead of being hard-coded to
# Colab's /content folder, so the notebook also runs outside Colab.
from pathlib import Path

path = str(Path('admission_dataset', 'Admission_Predict_Ver1.1.csv').resolve())
# header=True: first line holds column names; inferSchema=True: let Spark detect column types
df = spark.read.format('csv').option("inferSchema",True).option("header",True).load(path)
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [7]:
# Preview the DataFrame (first 20 rows, long cell values truncated — the defaults)
df.show(n=20, truncate=True)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [8]:
# Report the dataset shape as "<rows> <columns>"
n_rows = df.count()
n_cols = len(df.columns)
print(n_rows, n_cols)

500 9


In [9]:
# Print each column's name, data type and nullability as a tree
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [10]:
# Summary statistics for every column of df
stats_df = df.describe()
stats_df.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [12]:
# Remove the 'Serial No' column, which none of the later cells use
unused_column = 'Serial No'
df = df.drop(unused_column)
df.show(n=20, truncate=True)

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [None]:
#display the dataframe

In [20]:
# Check for null values.
# All columns are counted in ONE aggregation (a single Spark job) instead of
# running a separate filter + count job per column. `when(...)` yields 1 for a
# null cell and NULL otherwise, and `count` only counts non-NULL values, so each
# aggregate is that column's number of nulls.
null_counts = df.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]
).first()

for c in df.columns:
    print(f"Null values in {c}: {null_counts[c]}")

Null values in GRE Score: 0
Null values in TOEFL Score: 0
Null values in University Rating: 0
Null values in SOP: 0
Null values in LOR: 0
Null values in CGPA: 0
Null values in Research: 0
Null values in Chance of Admit: 0


In [21]:
# Alternative null check: filter the rows where a column is null, then count them
for column_name in df.columns:
    rows_with_null = df.filter(df[column_name].isNull())
    print(column_name + ":", rows_with_null.count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# TASK 4 : Correlation Analysis & Feature Selection

In [26]:
# Correlation of every column with the target column.
# The target itself is included in the loop, so its line prints 1.0.
target = "Chance of Admit"
for feature in df.columns:
    print(feature, df.corr(target, feature))

GRE Score 0.8103506354632598
TOEFL Score 0.7922276143050823
University Rating 0.6901323687886892
SOP 0.6841365241316723
LOR 0.6453645135280112
CGPA 0.882412574904574
Research 0.5458710294711379
Chance of Admit 1.0


In [29]:
# Feature selection: pack the three chosen columns into one vector column.
from pyspark.ml.feature import VectorAssembler
features = ['GRE Score','CGPA','TOEFL Score']
assembler = VectorAssembler(inputCols = features,outputCol = 'num_features')
# Drop any 'num_features' column left by a previous run of this cell; otherwise
# transform() fails because the output column already exists. drop() is a
# no-op when the column is absent, so the first run is unaffected.
df = assembler.transform(df.drop('num_features'))
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|      num_features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,9.65,118.0]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,8.87,107.0]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,8.0,104.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,8.67,110.0]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,8.21,103.0]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,9.34,115.0]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,8.2,109.0]|
|      308

In [30]:
# Standardize the assembled feature vector (subtract the mean, divide by the std-dev).
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol = 'num_features',outputCol='Scaled_features',withStd=True,withMean=True)
# Drop any 'Scaled_features' column left by a previous run of this cell so the
# cell can be re-executed; drop() is a no-op when the column is absent.
unscaled = df.drop('Scaled_features')
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later, so test rows influence the scaling statistics.
# Consider fitting on the training split only.
df = scaler.fit(unscaled).transform(unscaled)
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|      num_features|     Scaled_features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+--------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,9.65,118.0]|[1.81741747193359...|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,8.87,107.0]|[0.66648084220166...|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,8.0,104.0]|[-0.0417878530179...|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,8.67,110.0]|[0.48941366839675...|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,8.21,103.0]|[-0.2188550268228...|
|      330|        115|                5

# TASK 5 : Build the Linear Regression Model

In [31]:
# Import the estimator and keep only the model inputs: scaled features + label
from pyspark.ml.regression import LinearRegression

final_data = df.select('Scaled_features', 'Chance of Admit')
final_data.show(n=20, truncate=True)

+--------------------+---------------+
|     Scaled_features|Chance of Admit|
+--------------------+---------------+
|[1.81741747193359...|           0.92|
|[0.66648084220166...|           0.76|
|[-0.0417878530179...|           0.72|
|[0.48941366839675...|            0.8|
|[-0.2188550268228...|           0.65|
|[1.19768236361640...|            0.9|
|[0.40088008149429...|           0.75|
|[-0.7500565482376...|           0.68|
|[-1.2812580696523...|            0.5|
|[0.57794725529920...|           0.45|
|[0.75501442910412...|           0.52|
|[0.93208160290903...|           0.84|
|[1.02061518981148...|           0.78|
|[-0.8385901351400...|           0.62|
|[-0.4844557875302...|           0.61|
|[-0.2188550268228...|           0.54|
|[0.04674573388447...|           0.66|
|[0.22381290768938...|           0.65|
|[0.13527932078692...|           0.63|
|[-1.1927244827499...|           0.62|
+--------------------+---------------+
only showing top 20 rows



In [32]:
# Print the column names and types of final_data (the feature-vector column and the label)
final_data.printSchema()

root
 |-- Scaled_features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [34]:
# 70/30 random split into training and test sets; the fixed seed keeps the split repeatable
train, test = final_data.randomSplit(weights=[0.7, 0.3], seed=100)

In [35]:
# Configure a linear regression on the scaled features and fit it to the training split
lr = LinearRegression(
    labelCol='Chance of Admit',
    featuresCol="Scaled_features",
)
model = lr.fit(train)

In [36]:
# Show the fitted coefficients (one per entry of the feature vector) and the intercept
(model.coefficients, model.intercept)

(DenseVector([0.0281, 0.0892, 0.0134]), 0.7247677263352919)

In [43]:
# Training summary of the fitted model
summary = model.summary

# Use f-strings so the metrics print as plain numbers. The original passed
# `{value}` to print(), which is a set literal and printed the number wrapped
# in braces, e.g. "RMSE : {0.0613...}".
print(f"RMSE : {summary.rootMeanSquaredError}")  # the closer to 0, the better the fit
print(f"R2 : {summary.r2}")  # the closer to 1, the better the fit

RMSE : {0.06134803995512053}
R2 : {0.7988259529177988}


# TASK 6 : Evaluate & Save the Model

In [44]:
# Apply the fitted model to the held-out test split; this adds a 'prediction' column
predicted_test = model.transform(test)
predicted_test.show(n=20, truncate=True)

+--------------------+---------------+-------------------+
|     Scaled_features|Chance of Admit|         prediction|
+--------------------+---------------+-------------------+
|[-1.9895267648720...|           0.46| 0.4583370937675276|
|[-1.8124595910671...|           0.47|0.46471004496280316|
|[-1.8124595910671...|           0.44| 0.4942501118313623|
|[-1.8124595910671...|           0.61| 0.5753130493456146|
|[-1.7239260041646...|           0.34|0.48271248636119657|
|[-1.7239260041646...|           0.43| 0.5505489957159876|
|[-1.7239260041646...|           0.52| 0.5608260104252407|
|[-1.6353924172621...|           0.53| 0.5345413177040335|
|[-1.6353924172621...|           0.34|  0.578079615369463|
|[-1.5468588303597...|           0.54| 0.5510920185605608|
|[-1.5468588303597...|           0.51| 0.5628437399949179|
|[-1.5468588303597...|           0.59|  0.564318446720022|
|[-1.5468588303597...|           0.63|  0.583489634146376|
|[-1.5468588303597...|           0.56| 0.676373191644699

In [45]:
# Show the actual label next to the model's prediction
actual_vs_predicted = predicted_test.select('Chance of Admit', 'prediction')
actual_vs_predicted.show()

+---------------+-------------------+
|Chance of Admit|         prediction|
+---------------+-------------------+
|           0.46| 0.4583370937675276|
|           0.47|0.46471004496280316|
|           0.44| 0.4942501118313623|
|           0.61| 0.5753130493456146|
|           0.34|0.48271248636119657|
|           0.43| 0.5505489957159876|
|           0.52| 0.5608260104252407|
|           0.53| 0.5345413177040335|
|           0.34|  0.578079615369463|
|           0.54| 0.5510920185605608|
|           0.51| 0.5628437399949179|
|           0.59|  0.564318446720022|
|           0.63|  0.583489634146376|
|           0.56| 0.6763731916446996|
|           0.36|0.40385840819757474|
|           0.71| 0.6154214077919258|
|           0.64| 0.6803544360629847|
|           0.68| 0.6120061524014518|
|           0.68| 0.6090797051344818|
|           0.68|  0.649668589349703|
+---------------+-------------------+
only showing top 20 rows



In [46]:
# Compute R² on the test-set predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol='Chance of Admit',
    predictionCol='prediction',
    metricName='r2',
)
evaluator.evaluate(predicted_test)

0.8121660701107926

In [47]:
# Save the model.
# overwrite() lets the cell be re-run: a plain save() raises an error when the
# "model" directory already exists from an earlier run.
model.write().overwrite().save("model")

In [48]:
# Reload the persisted model from disk.
# Note: this rebinds `model`, replacing the freshly trained model object.
from pyspark.ml.regression import LinearRegressionModel

saved_model_dir = 'model'
model = LinearRegressionModel.load(saved_model_dir)