In [1]:
%pip install pyspark py4j -qq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [4]:
spark = SparkSession.builder.getOrCreate()

In [6]:
spark

In [7]:
df = spark.read.csv("/content/student_exam_data.csv", inferSchema=True, header=True)

In [8]:
df.show()

+---+--------------+--------------------+----------------+-----------+-------------------+---------+
|sex|Random answers|Knowledge on subject|Response pattern|Study Hours|Previous Exam Score|Pass/Fail|
+---+--------------+--------------------+----------------+-----------+-------------------+---------+
|  B|             B|                   5|               D| 4.37086107|        81.88970284|        0|
|  A|             C|                   2|               B|9.556428758|        72.16578198|        1|
|  B|             D|                   2|               B|7.587945476|        58.57165698|        0|
|  A|             D|                   9|               D|6.387926358|        88.82770118|        1|
|  A|             A|                   4|               A|2.404167764|        81.08387035|        0|
|  A|             C|                   3|               A|2.403950683|        49.75701636|        0|
|  B|             A|                   2|               B| 1.52275251|        94.65563107| 

In [9]:
df.describe().show()

+-------+----+--------------+--------------------+----------------+------------------+-------------------+------------------+
|summary| sex|Random answers|Knowledge on subject|Response pattern|       Study Hours|Previous Exam Score|         Pass/Fail|
+-------+----+--------------+--------------------+----------------+------------------+-------------------+------------------+
|  count| 500|           500|                 500|             500|               500|                500|               500|
|   mean|NULL|          NULL|               5.602|            NULL| 5.487055410087996|     68.91708366446|             0.368|
| stddev|NULL|          NULL|  2.9828494399756638|            NULL|2.6881956756793355|  17.12960741298545|0.4827443258789654|
|    min|   A|             A|                   1|               A|       1.045554255|        40.27792138|                 0|
|    max|   B|             D|                  10|               D|       9.936683165|         99.9830604|            

StringIndexer: It is used to convert a string column into numerical form. It allocates unique values to each of the categories present in the respective column.

In [10]:
from pyspark.ml.feature import StringIndexer

In [11]:
indexer = StringIndexer(inputCol="sex", outputCol="gender_cat")
indexed = indexer.fit(df).transform(df)

In [12]:
indexer = StringIndexer(inputCol="Random answers", outputCol="rand_ans")
indexed = indexer.fit(indexed).transform(indexed)

In [13]:
indexer = StringIndexer(inputCol="Response pattern", outputCol="res_patrn")
indexed = indexer.fit(indexed).transform(indexed)

In [14]:
indexer = StringIndexer(inputCol="Knowledge on subject", outputCol="sub_know")
indexed = indexer.fit(indexed).transform(indexed)

In [15]:
indexed.show()

+---+--------------+--------------------+----------------+-----------+-------------------+---------+----------+--------+---------+--------+
|sex|Random answers|Knowledge on subject|Response pattern|Study Hours|Previous Exam Score|Pass/Fail|gender_cat|rand_ans|res_patrn|sub_know|
+---+--------------+--------------------+----------------+-----------+-------------------+---------+----------+--------+---------+--------+
|  B|             B|                   5|               D| 4.37086107|        81.88970284|        0|       1.0|     0.0|      1.0|     5.0|
|  A|             C|                   2|               B|9.556428758|        72.16578198|        1|       0.0|     2.0|      2.0|     1.0|
|  B|             D|                   2|               B|7.587945476|        58.57165698|        0|       1.0|     1.0|      2.0|     1.0|
|  A|             D|                   9|               D|6.387926358|        88.82770118|        1|       0.0|     1.0|      1.0|     3.0|
|  A|             A|

Given the multiple columns, we need to merge them into a single column using VectorAssembler. It is a feature transformer that merges multiple columns into a vector column. One can select the number of columns used as input features and pass only those columns through the VectorAssembler. We will pass all seven input columns to create a single feature vector column in our case.

In [16]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [20]:
assembler = VectorAssembler(inputCols=['rand_ans','sub_know','res_patrn','Previous Exam Score'], outputCol='features')

In [21]:
assembler

VectorAssembler_affc5a57c184

In [22]:
output = assembler.transform(indexed)

In [23]:
output.show()

+---+--------------+--------------------+----------------+-----------+-------------------+---------+----------+--------+---------+--------+--------------------+
|sex|Random answers|Knowledge on subject|Response pattern|Study Hours|Previous Exam Score|Pass/Fail|gender_cat|rand_ans|res_patrn|sub_know|            features|
+---+--------------+--------------------+----------------+-----------+-------------------+---------+----------+--------+---------+--------+--------------------+
|  B|             B|                   5|               D| 4.37086107|        81.88970284|        0|       1.0|     0.0|      1.0|     5.0|[0.0,5.0,1.0,81.8...|
|  A|             C|                   2|               B|9.556428758|        72.16578198|        1|       0.0|     2.0|      2.0|     1.0|[2.0,1.0,2.0,72.1...|
|  B|             D|                   2|               B|7.587945476|        58.57165698|        0|       1.0|     1.0|      2.0|     1.0|[1.0,1.0,2.0,58.5...|
|  A|             D|              

In [24]:
output.select('features','Pass/Fail').show()

+--------------------+---------+
|            features|Pass/Fail|
+--------------------+---------+
|[0.0,5.0,1.0,81.8...|        0|
|[2.0,1.0,2.0,72.1...|        1|
|[1.0,1.0,2.0,58.5...|        0|
|[1.0,3.0,1.0,88.8...|        1|
|[3.0,2.0,3.0,81.0...|        0|
|[2.0,4.0,3.0,49.7...|        0|
|[3.0,1.0,2.0,94.6...|        0|
|[3.0,3.0,0.0,89.3...|        1|
|[2.0,2.0,2.0,96.9...|        1|
|[0.0,4.0,3.0,83.5...|        1|
|[3.0,7.0,1.0,76.8...|        0|
|[3.0,9.0,1.0,65.0...|        1|
|[0.0,1.0,0.0,95.9...|        1|
|[2.0,2.0,3.0,91.9...|        0|
|[1.0,6.0,0.0,42.7...|        0|
|[3.0,2.0,0.0,41.5...|        0|
|(4,[3],[62.587802...|        0|
|[1.0,9.0,3.0,88.6...|        1|
|[0.0,5.0,2.0,99.2...|        0|
|[2.0,0.0,0.0,49.0...|        0|
+--------------------+---------+
only showing top 20 rows



In [25]:
final_data = output.select('features', 'Pass/Fail')

In [26]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [27]:
train_data.describe().show()

+-------+------------------+
|summary|         Pass/Fail|
+-------+------------------+
|  count|               341|
|   mean|0.3812316715542522|
| stddev|0.4864030122111498|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [28]:
test_data.describe().show()

+-------+-------------------+
|summary|          Pass/Fail|
+-------+-------------------+
|  count|                159|
|   mean|0.33962264150943394|
| stddev| 0.4750774577650709|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [38]:
from pyspark.ml.classification import LogisticRegression

In [47]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [39]:
ship_lr=LogisticRegression(featuresCol="features", labelCol='Pass/Fail')

In [48]:
trained_ship_model = ship_lr.fit(train_data)

In [49]:
ship_results = trained_ship_model.transform(train_data)

In [58]:
# Show the predictions
ship_results.select('features', 'Pass/Fail', 'prediction', 'probability').show()

+--------------------+---------+----------+--------------------+
|            features|Pass/Fail|prediction|         probability|
+--------------------+---------+----------+--------------------+
|(4,[3],[50.886126...|        0|       0.0|[0.91104469342483...|
|(4,[3],[60.741759...|        0|       0.0|[0.84674830540373...|
|(4,[3],[60.780759...|        1|       0.0|[0.84643113778819...|
|(4,[3],[89.0466137])|        0|       1.0|[0.48424024238494...|
|(4,[3],[92.433404...|        0|       1.0|[0.43164748342747...|
|(4,[3],[95.534110...|        0|       1.0|[0.38478398359161...|
|[0.0,0.0,1.0,48.2...|        0|       0.0|[0.91438629688119...|
|[0.0,0.0,1.0,51.0...|        0|       0.0|[0.89927431180627...|
|[0.0,0.0,1.0,90.2...|        1|       1.0|[0.43359351948411...|
|[0.0,0.0,1.0,99.8...|        0|       1.0|[0.29674030030513...|
|[0.0,0.0,2.0,74.4...|        0|       0.0|[0.64529879321108...|
|[0.0,0.0,2.0,83.6...|        1|       0.0|[0.50522968300806...|
|[0.0,0.0,2.0,88.9...|   

In [50]:
evaluator = BinaryClassificationEvaluator(labelCol='Pass/Fail', rawPredictionCol="rawPrediction", metricName="areaUnderROC")

In [62]:
area_under_roc = evaluator.evaluate(ship_results)

In [63]:
print('Area Under ROC: ', area_under_roc)

Area Under ROC:  0.7693765949690118


In [59]:
# for multiclass classification problem, use MulticlassClassificationEvaluator
multi_evaluator = MulticlassClassificationEvaluator(labelCol='Pass/Fail', predictionCol="prediction", metricName="accuracy")


In [60]:
accuracy = multi_evaluator.evaluate(ship_results)
print('Accuracy: ', accuracy)

Accuracy:  0.6656891495601173
