# Academic Performance Practice Using XAPI Dataset
Dataset can be found here: https://www.kaggle.com/datasets/aljarah/xAPI-Edu-Data

## SQL Section

In [0]:
%sql
-- Checking table
select * from xapi_edu_data_csv

gender,Nationality,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M
F,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,42,30,13,70,Yes,Bad,Above-7,M
M,KW,KuwaIT,MiddleSchool,G-07,A,Math,F,Father,35,12,0,17,No,Bad,Above-7,L
M,KW,KuwaIT,MiddleSchool,G-07,A,Math,F,Father,50,10,15,22,Yes,Good,Under-7,M
F,KW,KuwaIT,MiddleSchool,G-07,A,Math,F,Father,12,21,16,50,Yes,Good,Under-7,M
F,KW,KuwaIT,MiddleSchool,G-07,B,IT,F,Father,70,80,25,70,Yes,Good,Under-7,M


### 1. Identify the topic that has the maximum number of students at each level - Lower, Middle and High

In [0]:
%sql
-- Checking how many students there are in all levels first
select count(StageID) as levels
from xapi_edu_data_csv

levels
480


In [0]:
%sql
-- Getting Lower Level
select count(StageID) as levels, StageID
from xapi_edu_data_csv
where StageID = 'lowerlevel'
group by StageID

union

-- Getting Middle School
select count(StageID) as levels, StageID
from xapi_edu_data_csv
where StageID = 'MiddleSchool'
group by StageID

union

-- Getting High School
select count(StageID) as levels, StageID
from xapi_edu_data_csv
where StageID = 'HighSchool'
group by StageID

levels,StageID
199,lowerlevel
248,MiddleSchool
33,HighSchool


### 2. Which group of student (Nationality, StageID, Gender) has the highest average grade. (if G-04 is the grade, take 4 as the grade)?

In [0]:
%sql
select count(distinct(nationality)) as nationality_count, count(distinct(StageId)) as stage_count, 
  count(distinct(gender)) as gender_count, count(gradeid) as grade_count, gradeid as grade
from xapi_edu_data_csv
group by grade
order by grade_count desc

nationality_count,stage_count,gender_count,grade_count,grade
13,1,2,147,G-02
11,1,2,116,G-08
11,2,2,101,G-07
5,1,2,48,G-04
5,1,2,32,G-06
3,1,2,13,G-11
2,1,2,11,G-12
2,1,2,5,G-09
2,1,1,4,G-10
2,1,2,3,G-05


In [0]:
%sql
select avg(grade_count) as count_of_grades, grade
from
(
select count(gradeid) as grade_count, gradeid as grade
from xapi_edu_data_csv
group by grade
order by grade_count desc
)
group by grade
order by count_of_grades desc

count_of_grades,grade
147.0,G-02
116.0,G-08
101.0,G-07
48.0,G-04
32.0,G-06
13.0,G-11
11.0,G-12
5.0,G-09
4.0,G-10
3.0,G-05


## Create dataframe from table

In [0]:
# Convert table to dataframe
df = spark.read.table("xapi_edu_data_csv")
display(df)

gender,Nationality,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M
F,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,42,30,13,70,Yes,Bad,Above-7,M
M,KW,KuwaIT,MiddleSchool,G-07,A,Math,F,Father,35,12,0,17,No,Bad,Above-7,L
M,KW,KuwaIT,MiddleSchool,G-07,A,Math,F,Father,50,10,15,22,Yes,Good,Under-7,M
F,KW,KuwaIT,MiddleSchool,G-07,A,Math,F,Father,12,21,16,50,Yes,Good,Under-7,M
F,KW,KuwaIT,MiddleSchool,G-07,B,IT,F,Father,70,80,25,70,Yes,Good,Under-7,M


### 3. What is the correlation coefficient between VisitedResources and RaisedHands?

In [0]:
# Getting the correlation coefficient

cor_coef = df.stat.corr('VisITedResources', 'raisedhands')

print(f'The correlation between Visited Resources and Raised Hands is {round(cor_coef,2)}')

The correlation between Visited Resources and Raised Hands is 0.69


### 4. Create a K-Mean (K=4) cluster with Raisedhands, VisitedResources, AnnouncementView and Discussion

#### Data Prep

In [0]:
# Ensure columns are integers
df.dtypes

Out[64]: [('gender', 'string'),
 ('Nationality', 'string'),
 ('PlaceofBirth', 'string'),
 ('StageID', 'string'),
 ('GradeID', 'string'),
 ('SectionID', 'string'),
 ('Topic', 'string'),
 ('Semester', 'string'),
 ('Relation', 'string'),
 ('raisedhands', 'int'),
 ('VisITedResources', 'int'),
 ('AnnouncementsView', 'int'),
 ('Discussion', 'int'),
 ('ParentAnsweringSurvey', 'string'),
 ('ParentschoolSatisfaction', 'string'),
 ('StudentAbsenceDays', 'string'),
 ('Class', 'string')]

In [0]:
# Import the required libraries

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline

In [0]:
# Selecting required variables

data=df.select(['raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion'])
display(data)

raisedhands,VisITedResources,AnnouncementsView,Discussion
15,16,2,20
20,20,3,25
10,7,0,30
30,25,5,35
40,50,12,50
42,30,13,70
35,12,0,17
50,10,15,22
12,21,16,50
70,80,25,70


In [0]:
# Check length of the data
print("Dataframe length before dropping NAs: ", data.count())

# Drop NAs and recheck length of the data
data = data.dropna()
print("Dataframe length after dropping NAs: ", data.count())

Dataframe length before dropping NAs:  480
Dataframe length after dropping NAs:  480


In [0]:
# Vector assembler is used to create a vector of input features

assembler = VectorAssembler(inputCols=['raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion'],
                            outputCol="features")

In [0]:
# Pipeline to pass aseembler

pipe = Pipeline(stages=[assembler])

In [0]:
# Fit the data

final_data=pipe.fit(data).transform(data)

In [0]:
# Create a K-Means Cluster of 4 and fit the model

kmeans_model = KMeans(k=4)

fit_model = kmeans_model.fit(final_data)

In [0]:
# Calculate the sum squared error rate
wssse = fit_model.summary.trainingCost # for spark 3.0
print("The within set sum of squared error of the mode is {}".format(wssse))

The within set sum of squared error of the mode is 533714.9661546274


In [0]:
# Get the centriods
centers = fit_model.clusterCenters()

In [0]:
# print the centroids
print("Cluster Centers")
index=1
for cluster in centers:
    print("Centroid {}: {}".format(index,cluster))
    index+=1

Cluster Centers
Centroid 1: [58.09933775 77.94039735 39.50993377 25.47019868]
Centroid 2: [15.12  13.96  14.6   21.224]
Centroid 3: [76.4609375 80.8984375 66.484375  73.625    ]
Centroid 4: [26.34210526 32.02631579 25.         63.85526316]


In [0]:
# Store the results in a dataframe

results = fit_model.transform(final_data)

In [0]:
# View results
results.select(['raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion', 'prediction']).show(30)

+-----------+----------------+-----------------+----------+----------+
|raisedhands|VisITedResources|AnnouncementsView|Discussion|prediction|
+-----------+----------------+-----------------+----------+----------+
|         15|              16|                2|        20|         1|
|         20|              20|                3|        25|         1|
|         10|               7|                0|        30|         1|
|         30|              25|                5|        35|         1|
|         40|              50|               12|        50|         3|
|         42|              30|               13|        70|         3|
|         35|              12|                0|        17|         1|
|         50|              10|               15|        22|         1|
|         12|              21|               16|        50|         3|
|         70|              80|               25|        70|         2|
|         50|              88|               30|        80|         2|
|     

In [0]:
# Group by centroids to see which centroid has the highest number of counts
results.groupby('prediction').count().sort('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0|  151|
|         1|  125|
|         2|  128|
|         3|   76|
+----------+-----+

