In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from statistics import mean

# Tensorflow imports
import tensorflow as tf
import tensorflow_addons as tfa

# LightGBM imports
from lightgbm import LGBMClassifier


In [2]:
def getDataKaggle(compName,datasetName,folderName,dtype):
    """Download one file of a Kaggle competition and load it as a DataFrame.

    Parameters
    ----------
    compName : str
        Competition identifier handed to the Kaggle API.
    datasetName : str
        Name of the competition file to fetch, e.g. ``'train.csv'``.
    folderName : str
        Currently unused: the download folder is hard-coded below. The
        parameter is kept so existing calls keep working.
    dtype : dict
        Column -> dtype mapping forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the downloaded file.
    """
    import os

    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()

    # NOTE(review): machine-specific absolute path; consider deriving it from
    # `folderName` or a configurable data directory.
    pathWrite = 'C:\\Users\\olyaa\\Desktop\\datasetBuffer\\'
    # pathWrite already ends with a separator, so no extra one is inserted.
    pathRead = pathWrite + datasetName

    api.competition_download_file(compName,datasetName,pathWrite)

    # The API may store the file as "<name>.zip" (the download log for
    # train.csv reports "train.csv.zip"). Fall back to the archive when the
    # plain file is absent; read_csv infers zip compression from the suffix.
    pathZip = pathRead + '.zip'
    if not os.path.exists(pathRead) and os.path.exists(pathZip):
        pathRead = pathZip

    dfData = pd.read_csv(pathRead,dtype=dtype)

    return dfData

def feature_engineer(dataset_df):
    """Aggregate raw event rows into per-(session_id, level_group) features.

    For every categorical column the number of distinct values is computed
    (``<col>_nunique``); for every numerical column the mean (``<col>``) and
    the standard deviation (``<col>_std``) are computed. Aggregates that come
    out as NaN are replaced by -1.

    Returns a DataFrame indexed by ``session_id`` with ``level_group`` as a
    regular column followed by the feature columns.
    """
    categorical_cols = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
    numerical_cols = ['elapsed_time','level','page','room_coor_x', 'room_coor_y','screen_coor_x', 'screen_coor_y', 'hover_duration']
    group_keys = ['session_id','level_group']

    # (columns, aggregation, suffix appended to the column name), in the
    # order the feature columns must appear in the result.
    plan = (
        (categorical_cols, 'nunique', '_nunique'),
        (numerical_cols, 'mean', ''),
        (numerical_cols, 'std', '_std'),
    )

    pieces = []
    for columns, how, suffix in plan:
        for column in columns:
            aggregated = dataset_df.groupby(group_keys)[column].agg(how)
            pieces.append(aggregated.rename(aggregated.name + suffix))

    features = pd.concat(pieces, axis=1)
    return features.fillna(-1).reset_index().set_index('session_id')


def split_dataset(dataset, test_ratio=0.20):
    """Split `dataset` into two frames by unique index value.

    The unique index labels are taken in their order of appearance (no
    shuffling); the first ``1 - test_ratio`` share of them selects the first
    returned frame and the remaining labels select the second, so all rows
    sharing an index label land in the same frame.
    """
    unique_ids = dataset.index.unique()
    n_first = int(len(unique_ids) * (1 - test_ratio))
    first_ids = unique_ids[:n_first]
    second_ids = unique_ids[n_first:]
    return dataset.loc[first_ids], dataset.loc[second_ids]

In [3]:
# Column dtypes used when parsing the event CSVs: compact numeric types for
# the measurements, pandas categoricals for the string-valued columns.
dtypes = dict(
    elapsed_time=np.int32,
    event_name='category',
    name='category',
    level=np.uint8,
    room_coor_x=np.float32,
    room_coor_y=np.float32,
    screen_coor_x=np.float32,
    screen_coor_y=np.float32,
    hover_duration=np.float32,
    text='category',
    fqid='category',
    room_fqid='category',
    text_fqid='category',
    fullscreen='category',
    hq='category',
    music='category',
    level_group='category',
)





In [4]:
# Initial data preparation.

# Raw event log, parsed with the column dtypes declared in `dtypes`.
dfTrainUnprocessed = getDataKaggle(
    'predict-student-performance-from-game-play',
    'train.csv',
    'datasetBuffer',
    dtypes,
)
# Label table, read without explicit dtypes.
dfTrainLabels = getDataKaggle(
    'predict-student-performance-from-game-play',
    'train_labels.csv',
    'datasetBuffer',
    {},
)

# Each label id is split on '_': the first piece becomes the integer session,
# the last piece (minus its leading character) the integer question number.
dfTrainLabels['session'] = dfTrainLabels['session_id'].map(
    lambda label_id: int(label_id.split('_')[0]))
dfTrainLabels['q'] = dfTrainLabels['session_id'].map(
    lambda label_id: int(label_id.split('_')[-1][1:]))

# Aggregate the events into features, then release the raw frame.
dfTrain = feature_engineer(dfTrainUnprocessed)
del dfTrainUnprocessed

print(f'The shape of dfTrain:{dfTrain.shape}')
print(f'The size of dfTrain: {int(dfTrain.memory_usage(index=True).sum() / 1024**2)} MB')
print(f'The shape of dfTrainLabels:{dfTrainLabels.shape}')
print(f'The size of dfTrainLabels:{int(dfTrainLabels.memory_usage(index=True).sum() / 1024**2)} MB')





train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
The shape of dfTrain:(70686, 22)
The size of dfTrain: 10 MB
The shape of dfTrainLabels:(424116, 4)
The size of dfTrainLabels:12 MB


In [5]:
# Training validation splitting.

# Sessions are split by id (default 80/20), so all rows of one session stay
# on the same side of the split.
dfTrainTrain, dfTrainValid = split_dataset(dfTrain)

print(f"{len(dfTrainTrain)} examples in training, {len(dfTrainValid)} examples in testing.")




56547 examples in training, 14139 examples in testing.


In [None]:
# Train one LightGBM classifier per question (1..18) on the features of the
# level group belonging to that question and score it on the validation
# sessions.

thr = 0.5  # NOTE(review): not used anywhere in this cell.

modelsQuestions = []   # fitted models in question order: model for question q is at index q-1
evaluation_dict ={}    # question number -> validation accuracy

VALID_USER_LIST = dfTrainValid.index.unique()
# NOTE(review): dfPred is allocated (sessions x 18 questions) but never filled
# by this cell; validation predictions are only used to compute accuracy.
dfPred = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

for ixq in range(1,19):

    # Level group whose aggregated features feed the model for this question.
    if ixq<=3: grp = '0-4'
    elif ixq<=13: grp = '5-12'
    else: grp = '13-22'
    print("### q_no", ixq, "grp", grp)

    dfTrainQ = dfTrainTrain.loc[dfTrainTrain.level_group == grp]
    dfTrainQusers = dfTrainQ.index.values
    dfValidQ = dfTrainValid.loc[dfTrainValid.level_group == grp]
    dfValidQusers = dfValidQ.index.values

    # Labels of this question indexed by session; built once and then
    # aligned to the row order of the train / validation feature frames.
    labelsQ = dfTrainLabels.loc[dfTrainLabels.q==ixq].set_index('session')
    train_labels = labelsQ.loc[dfTrainQusers,['correct']]
    valid_labels = labelsQ.loc[dfValidQusers,['correct']]

    # level_group only selects the rows; it is not a model input.
    train_ds = dfTrainQ.loc[:, dfTrainQ.columns != 'level_group']
    valid_ds = dfValidQ.loc[:, dfValidQ.columns != 'level_group']

    gbtm = LGBMClassifier(nthread=-1, verbose=-1,)
    # Pass the target as a 1-D Series: a single-column DataFrame is a column
    # vector and triggers a data-conversion warning in the sklearn API.
    gbtm.fit(train_ds,train_labels['correct'])
    valid_predict = gbtm.predict(valid_ds)

    modelsQuestions.append(gbtm)

    # accuracy_score expects (y_true, y_pred).
    predAcc  =  accuracy_score(valid_labels['correct'],valid_predict)
    evaluation_dict[ixq] = predAcc

In [13]:
# Display the validation feature frame left in `valid_ds` by the per-question
# training loop (the 'level_group' column has been removed from it).
# NOTE(review): the contents depend on which loop iteration ran last in the
# current kernel state.
valid_ds

Unnamed: 0_level_0,event_name_nunique,name_nunique,fqid_nunique,room_fqid_nunique,text_fqid_nunique,elapsed_time,level,page,room_coor_x,room_coor_y,...,screen_coor_y,hover_duration,elapsed_time_std,level_std,page_std,room_coor_x_std,room_coor_y_std,screen_coor_x_std,screen_coor_y_std,hover_duration_std
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22000320020067784,10,3,50,12,33,1.209432e+06,17.408411,-1.000000,-212.484375,-128.103012,...,375.319885,9.156429e+02,1.973974e+05,2.492369,-1.000000,641.839442,245.512568,231.453027,115.347695,1.184599e+03
22000321083750010,11,4,49,12,32,9.159485e+05,17.584375,4.000000,-84.533287,-222.210892,...,420.439606,1.011381e+03,1.391398e+05,2.367307,0.000000,581.193437,235.411923,230.552957,112.231324,1.055037e+03
22000401381351532,11,4,53,12,31,8.199372e+05,17.689744,6.000000,-78.305176,-186.757095,...,402.537811,9.497188e+02,1.126127e+05,2.502163,0.000000,562.669468,232.214353,222.768695,106.095605,1.167072e+03
22000407142860316,11,4,51,12,39,1.463694e+06,17.512415,4.512195,-153.064941,-156.079147,...,369.152344,1.232429e+03,2.233004e+05,2.371914,0.745719,620.911836,256.320251,246.245288,132.967778,1.711467e+03
22000407572357990,10,3,53,13,33,3.926818e+06,17.096639,-1.000000,-210.087906,-172.272507,...,389.459961,9.825263e+02,1.513043e+05,2.504545,-1.000000,651.785709,229.219032,274.311601,98.763021,2.729180e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100215342220508,11,4,52,12,33,1.788386e+07,18.021226,4.750000,-85.291298,-203.115524,...,382.033691,1.465038e+06,2.481224e+07,2.322064,0.462910,634.642438,291.483837,258.707721,126.677303,8.892292e+06
22100215460321130,11,4,56,14,35,1.743533e+06,17.110132,4.666667,-72.409142,-214.776810,...,446.683411,7.803636e+02,2.125569e+05,2.605180,0.516398,611.316199,239.875616,261.171929,119.904896,1.321802e+03
22100217104993650,11,6,55,13,33,1.588070e+06,17.847541,4.673469,-90.367409,-221.660110,...,393.874115,1.072135e+03,2.710403e+05,2.301860,0.774267,624.429951,266.278051,262.005124,126.972362,1.824012e+03
22100219442786200,11,4,49,12,33,9.619192e+05,17.671395,5.230769,-158.599121,-257.988800,...,444.510040,1.110500e+03,1.516019e+05,2.359474,0.908083,589.562720,273.090325,248.584999,134.772721,1.675300e+03


In [7]:
# Validation accuracy per question number, as stored by the training loop.
evaluation_dict

{1: 0.7252280925100785,
 2: 0.9751750477402928,
 3: 0.9342244854657331,
 4: 0.792488860598345,
 5: 0.6238064926798218,
 6: 0.7910036070443455,
 7: 0.7432633142372163,
 8: 0.6286866115001061,
 9: 0.7672395501803522,
 10: 0.6015276893698281,
 11: 0.6509654148100997,
 12: 0.8684489709314661,
 13: 0.7161043921069382,
 14: 0.72989603225122,
 15: 0.611075747931254,
 16: 0.7458094631869298,
 17: 0.7006153193295142,
 18: 0.9516231699554424}

In [8]:
# Unweighted mean of the per-question validation accuracies.
average_accuracy = mean(evaluation_dict.values())
print(f'The average accuracy score is:{average_accuracy}')

The average accuracy score is:0.7531767923238324


In [9]:
# Getting the test set to predict.

dfTestUnprocessed = getDataKaggle(
    'predict-student-performance-from-game-play',
    'test.csv',
    'datasetBuffer',
    dtypes,
)
# Same aggregation as applied to the training events.
dfTest = feature_engineer(dfTestUnprocessed)

test.csv: Skipping, found more recently modified local copy (use --force to force download)


In [10]:
# Inspect the aggregated test features: indexed by session_id, with one row
# per (session_id, level_group) combination.
dfTest

Unnamed: 0_level_0,level_group,event_name_nunique,name_nunique,fqid_nunique,room_fqid_nunique,text_fqid_nunique,elapsed_time,level,page,room_coor_x,...,screen_coor_y,hover_duration,elapsed_time_std,level_std,page_std,room_coor_x_std,room_coor_y_std,screen_coor_x_std,screen_coor_y_std,hover_duration_std
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090109393214576,0-4,11,4,24,6,13,117119.8,2.05,0.5,39.449108,...,374.597015,2006.800049,66619.89,1.348194,0.57735,440.144542,145.126812,253.078561,109.49921,2341.772128
20090109393214576,13-22,11,4,52,12,35,6165666.0,17.918089,4.84375,-124.577141,...,393.667877,973.864868,227751.2,2.279627,0.723316,629.713341,274.69674,234.320165,122.118739,1462.983181
20090109393214576,5-12,11,4,43,12,23,2092368.0,7.686567,1.888889,48.260391,...,363.1297,2301.315674,2124842.0,1.898028,0.758395,378.10032,136.966952,241.109236,118.302246,4092.914793
20090312143683264,0-4,11,4,27,7,15,142418.3,1.803681,0.0,91.577133,...,413.104584,3066.555664,78257.63,1.221509,0.0,440.053604,177.281059,233.244085,139.175966,5097.845209
20090312143683264,13-22,11,6,55,15,38,2195108.0,17.335626,4.833333,-21.133173,...,415.654297,864.09613,307875.7,2.688642,1.147211,576.561515,256.058965,250.247781,137.83206,1755.180693
20090312143683264,5-12,11,4,56,13,28,791150.4,8.445172,1.565217,42.823078,...,391.78479,1379.49292,245688.4,2.303566,0.787752,414.653053,170.659545,244.272072,147.151081,2012.309284
20090312331414616,0-4,11,4,23,6,13,112832.8,1.861538,0.0,67.333466,...,374.118652,2176.818115,61004.48,1.15975,0.0,431.739626,205.922773,230.848314,149.588791,3449.709374
20090312331414616,13-22,11,4,62,13,34,1262481.0,18.332689,5.545455,-93.299934,...,407.912476,794.542358,193471.9,2.731803,0.670982,575.032013,236.920471,259.182135,132.65316,1057.100513
20090312331414616,5-12,11,4,46,11,21,558520.5,8.459119,2.25,8.512538,...,389.124542,1561.096191,136600.2,2.121669,0.753778,357.487624,145.548412,221.97351,129.969153,3148.359406


In [11]:
# Inspect the session_id column of the raw (un-aggregated) test events.
dfTestUnprocessed['session_id']

0       20090109393214576
1       20090109393214576
2       20090109393214576
3       20090109393214576
4       20090109393214576
              ...        
3723    20090312331414616
3724    20090312331414616
3725    20090312331414616
3726    20090312331414616
3727    20090312331414616
Name: session_id, Length: 3728, dtype: int64

In [31]:
# Predicting the test set.

# sessionUnique = np.unique(np.array(dfTest.index))

# dfSubmit = pd.DataFrame(columns=['session_id','correct'])

# for qix in range(1,19):

#     if qix<=3: grp = '0-4'
#     elif qix<=13: grp = '5-12'
#     elif qix<=22: grp = '13-22'

#     modelQuestion = modelsQuestions[qix-1]
#     suffix = '_q' + str(qix)

#     for session in sessionUnique:
        
#         rowSelector = np.logical_and((dfTest['level_group']==grp).values,dfTest.index == session)
#         sample = dfTest.loc[rowSelector, dfTrainQ .columns != 'level_group']
#         sessionQuestion = str(session) + suffix
#         correct = modelQuestion.predict(sample)

#         dfSubmit.loc[len(dfSubmit.index)] = [sessionQuestion,correct[0]]

        





In [None]:
# # Creating Submission file as CSV.

# from kaggle.api.kaggle_api_extended import KaggleApi

# api = KaggleApi()
# api.authenticate()

# submission =  dfSubmit

# submission.to_csv('submission.csv', index=False)
# api.competition_submit('submission.csv','API Submission','predict-student-performance-from-game-play')
