<a href="https://colab.research.google.com/github/Supakrit65/Predicting_Admissions_using_PySpark_ML/blob/main/Predicting_Admissions_using_PySpark_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies & Run a SparkSession


In [64]:
# install pyspark
! pip install pyspark



In [65]:
# start (or reuse) the SparkSession that every later cell runs against
from pyspark.sql import SparkSession  # entry point for Spark applications and functionalities

spark = (
    SparkSession.builder
    .appName("Python Spark")
    .getOrCreate()
)

# Clone & Explore the Dataset

In [66]:
# clone the dataset
! git clone https://github.com/education454/admission_dataset

fatal: destination path 'admission_dataset' already exists and is not an empty directory.


In [67]:
# check that the dataset CSV is present in the cloned directory
! ls admission_dataset/

Admission_Predict_Ver1.1.csv


In [68]:
# create a spark dataframe from the cloned CSV
# - the path is relative to the directory the clone cell ran in, rather than
#   the Colab-only absolute "/content/..." location
# - header=True takes column names from the first row
# - inferSchema=True lets Spark derive integer/double column types from the data
df = spark.read.csv("admission_dataset/Admission_Predict_Ver1.1.csv",
                    header=True, inferSchema=True)
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [69]:
# report the dataframe shape as a (rows, columns) tuple
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(500, 9)


In [70]:
# print the schema (column names, inferred types and nullability)
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [71]:
# get the summary statistics (count, mean, stddev, ...) for every column
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# Data Cleaning

In [72]:
# drop the unnecessary column: 'Serial No' appears to be just a running row
# number (1, 2, 3, ...), so it is removed before any modelling
df = df.drop('Serial No')

In [73]:
# display the dataframe to verify the remaining columns
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [74]:
# count the missing entries (NaN or null) in every column
from pyspark.sql.functions import isnan, when, count

# one aggregate per column: when() yields the column name only for rows that
# are NaN/null, and count() tallies those non-null markers
missing_counts = [
    count(when(isnan(name) | df[name].isNull(), name)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



# Correlation Analysis & Feature Selection

In [75]:
# correlation analysis: correlation of every feature column with the target.
# The loop variable is deliberately not named `col`: a module-level loop
# variable stays in the notebook's global namespace and would shadow
# pyspark.sql.functions.col, which a later cell imports and uses.
correlations = {}
for feature_name in df.columns:
  if feature_name == "Chance of Admit":
    continue  # skip the target itself
  corr_score = df.stat.corr('Chance of Admit', feature_name)
  print(f"Correlation to chance of admit for {feature_name} is {corr_score}")
  correlations[feature_name] = corr_score

Correlation to chance of admit for GRE Score is 0.8103506354632598
Correlation to chance of admit for TOEFL Score is 0.7922276143050823
Correlation to chance of admit for University Rating is 0.6901323687886892
Correlation to chance of admit for SOP is 0.6841365241316723
Correlation to chance of admit for LOR is 0.6453645135280112
Correlation to chance of admit for CGPA is 0.882412574904574
Correlation to chance of admit for Research is 0.5458710294711379


In [76]:
# build a VectorAssembler that packs every predictor column into one "features" vector
from pyspark.ml.feature import VectorAssembler, VectorSlicer

# every column except the target is used as a feature
feature_cols = [name for name in df.columns if name != 'Chance of Admit']
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

In [77]:
# append the assembled "features" vector column and display the result
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|            features|
+---------+-----------+-----------------+---+---+----+--------+---------------+--------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,4.0,...|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,4.0,...|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|[316.0,104.0,3.0,...|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,3.0,...|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,2.0,...|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,5.0,...|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|[321.0,109.0

# Build the Regression Model

In [78]:
# import GBTRegressor (the gradient-boosted trees regressor) and create the final
# data: only the assembled feature vector and the label column are kept
from pyspark.ml.regression import GBTRegressor
final_data = output_data.select('features', 'Chance of Admit')

In [79]:
# print schema of final data (expected: a vector column plus the double label)
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [80]:
# split the dataset into training and test sets (weights 0.7 / 0.3);
# the seed is fixed at 42 so the split can be repeated
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [81]:
# initialize the model; the seed is set explicitly so the estimator's random
# state does not depend on a per-session default and runs can be reproduced
gbt = GBTRegressor(labelCol="Chance of Admit", featuresCol="features", seed=42)

In [82]:
# define the hyper-parameter grid searched during model selection
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

grid_builder = ParamGridBuilder()
grid_builder.addGrid(gbt.maxDepth, [2, 4, 6])    # tree depth
grid_builder.addGrid(gbt.maxBins, [20, 30, 40])  # bins used when discretising features
grid_builder.addGrid(gbt.maxIter, [10, 20, 30])  # number of boosting iterations
paramGrid = grid_builder.build()                 # 3 x 3 x 3 = 27 combinations

In [83]:
# create a TrainValidationSplit around the GBTRegressor:
# - every combination in paramGrid is fit on 80% of the data passed to fit()
#   and scored by RMSE on the remaining 20% (trainRatio=0.8)
# - the seed pins that internal random split, so the selected best model is
#   repeatable from run to run
from pyspark.ml.evaluation import RegressionEvaluator
tvs = TrainValidationSplit(estimator=gbt,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(labelCol="Chance of Admit", metricName="rmse"),
                           trainRatio=0.8,
                           seed=42)

In [87]:
# fit the TrainValidationSplit model on the training data
# (one candidate per paramGrid entry is trained, so this is likely the slowest cell)
tvs_model = tvs.fit(train)

In [88]:
# get the best model found by the grid search (used for all evaluation below)
best_model = tvs_model.bestModel

# Evaluate & Save the Model

In [89]:
# make predictions on the training set (adds a "prediction" column)
train_predictions = best_model.transform(train)

In [90]:
# make predictions on the held-out test set (adds a "prediction" column)
test_predictions = best_model.transform(test)

In [91]:
# score the selected model on the training set with RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Chance of Admit",
)
train_rmse = evaluator.evaluate(train_predictions)
print(f"Training RMSE: {train_rmse}")

Training RMSE: 0.05869405654701518


In [92]:
# Evaluate the final model on the test set
# The functions module is imported under a namespace so that Spark's pow()
# does not shadow Python's built-in pow(), and so this cell does not rely on
# `when` having been imported by an earlier cell.
from pyspark.sql import functions as F

evaluator = RegressionEvaluator(labelCol="Chance of Admit", metricName="rmse")
test_rmse = evaluator.evaluate(test_predictions)
print(f"Test RMSE: {test_rmse}")

# calculate RMSPE (root mean squared percentage error) on the test set:
#   sqrt(mean(((label - prediction) / label) ** 2))
# Rows whose label is 0 have no defined percentage error. when() without an
# otherwise() yields null for them and avg() skips nulls, so they are left out
# of the mean instead of being counted as perfect (zero-error) predictions.
# The division itself sits inside the guarded when() branch.
label = F.col("Chance of Admit")
relative_error = (label - F.col("prediction")) / label
test_rmspe = test_predictions.select(
    F.sqrt(F.avg(F.when(label != 0, F.pow(relative_error, 2)))).alias("rmspe")
)

rmspe = test_rmspe.collect()[0][0]
if rmspe is None:
    # no row had a non-zero label (or the test set is empty): metric undefined
    print("Test RMSPE: undefined (no rows with a non-zero label)")
else:
    print(f"Test RMSPE: {rmspe * 100:.2f}%")

Test RMSE: 0.06211256497547917
Test RMSPE: 10.92%


In [93]:
# save the model to the 'best_model' path, replacing any previously saved copy
best_model.write().overwrite().save('best_model')

In [94]:
# load the model back from the 'best_model' path written by the save cell
from pyspark.ml.regression import GBTRegressionModel
loaded_model = GBTRegressionModel.load('best_model')

In [95]:
# sanity check: the reloaded model should reproduce the test-set RMSE reported earlier
reloaded_predictions = loaded_model.transform(test)
print("Test RMSE:", evaluator.evaluate(reloaded_predictions))

Test RMSE: 0.06211256497547917
