<a href="https://colab.research.google.com/github/PremKumar-V/ML_Projects/blob/main/Predict_Student_Performance_ML_Comp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Competition - Predict Student Performance from Game Play

## Reference
- [Dataset Link](https://www.kaggle.com/competitions/predict-student-performance-from-game-play/)

## ToDo
- [x] Prepare Notebook
- [ ] Data Exploration
- [ ] Data Preprocessing
- [ ] Model Training
- [ ] Hypertuning
- [ ] Evaluation
- [ ] Submission

### Prepare Notebook

In [5]:
# Download Libraries
!pip install kaggle xgboost opendatasets --quiet

In [6]:
# Import Libraries

import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import opendatasets as od
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Download Dataset

od.download('https://www.kaggle.com/competitions/predict-student-performance-from-game-play/data')

Downloading predict-student-performance-from-game-play.zip to ./predict-student-performance-from-game-play


100%|██████████| 484M/484M [00:03<00:00, 150MB/s]



Extracting archive ./predict-student-performance-from-game-play/predict-student-performance-from-game-play.zip to ./predict-student-performance-from-game-play


In [8]:
# Inspecting Dataset

dataDir = './predict-student-performance-from-game-play/'

os.listdir(dataDir)

['test.csv',
 'jo_wilder',
 'train.csv',
 'train_labels.csv',
 'sample_submission.csv']

In [9]:
# Load dataset

trainDf = pd.read_csv(dataDir + 'train.csv')
trainDf.head(2)

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [10]:
trainLabels = pd.read_csv(dataDir + 'train_labels.csv')

In [11]:
testDf = pd.read_csv(dataDir + 'test.csv')

### Data Exploration

In [12]:
trainDf.describe()

Unnamed: 0,session_id,index,elapsed_time,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,fullscreen,hq,music
count,13174210.0,13174210.0,13174210.0,13174210.0,284746.0,12137970.0,12137970.0,12137970.0,12137970.0,1000737.0,0.0,0.0,0.0
mean,2.113413e+16,652.6426,3846817.0,12.19194,3.15793,-54.93615,-116.2997,458.1144,385.2363,3186.237,,,
std,566522000000000.0,627.5818,27013870.0,6.499188,2.064042,520.1468,218.5912,247.2144,129.2879,369226.5,,,
min,2.009031e+16,0.0,0.0,0.0,0.0,-1992.355,-918.1587,0.0,0.0,0.0,,,
25%,2.101031e+16,289.0,439430.0,6.0,1.0,-352.9376,-212.8361,269.0,304.0,100.0,,,
50%,2.104022e+16,596.0,1013425.0,13.0,3.0,-11.16317,-97.78151,447.0,397.0,418.0,,,
75%,2.110051e+16,897.0,1740050.0,18.0,5.0,296.3618,22.68531,663.0,471.0,1266.0,,,
max,2.210022e+16,20473.0,1749293000.0,22.0,6.0,1261.774,543.6164,1916.0,1439.0,219907800.0,,,


In [13]:
trainDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13174211 entries, 0 to 13174210
Data columns (total 20 columns):
 #   Column          Dtype  
---  ------          -----  
 0   session_id      int64  
 1   index           int64  
 2   elapsed_time    int64  
 3   event_name      object 
 4   name            object 
 5   level           int64  
 6   page            float64
 7   room_coor_x     float64
 8   room_coor_y     float64
 9   screen_coor_x   float64
 10  screen_coor_y   float64
 11  hover_duration  float64
 12  text            object 
 13  fqid            object 
 14  room_fqid       object 
 15  text_fqid       object 
 16  fullscreen      float64
 17  hq              float64
 18  music           float64
 19  level_group     object 
dtypes: float64(9), int64(4), object(7)
memory usage: 2.0+ GB


In [35]:
nullDf = pd.DataFrame((trainDf.isna().sum() / len(trainDf)) * 100)
nullDf

Unnamed: 0,0
session_id,0.0
index,0.0
elapsed_time,0.0
event_name,0.0
name,0.0
level,0.0
room_coor_x,7.86567
room_coor_y,7.86567
screen_coor_x,7.86567
screen_coor_y,7.86567


In [38]:
validList = list(nullDf[nullDf[0] < 50].T.columns)
validList

['session_id',
 'index',
 'elapsed_time',
 'event_name',
 'name',
 'level',
 'room_coor_x',
 'room_coor_y',
 'screen_coor_x',
 'screen_coor_y',
 'fqid',
 'room_fqid',
 'level_group']

In [29]:
trainDf = trainDf[validList]
len(trainDf)

13174211

In [33]:
originalPercent = 13174210
cutOffPercent = len(trainDf)

print(f"Percentage Cut off: {(originalPercent / cutOffPercent) * 100}")

Percentage Cut off: 99.9999924094126
