In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
path = Path('./data/')

In [9]:
list(path.iterdir())

[PosixPath('data/session_df.csv'),
 PosixPath('data/data-science-bowl-2019.zip'),
 PosixPath('data/specs.csv'),
 PosixPath('data/train_labels.csv'),
 PosixPath('data/train.csv'),
 PosixPath('data/sample_submission.csv'),
 PosixPath('data/test.csv')]

## Read in data

In [10]:
session_df = pd.read_csv(path/'session_df.csv',index_col=0)
train_labels = pd.read_csv(path/'train_labels.csv')
test_data = pd.read_csv(path/'test.csv')
specs = pd.read_csv(path/'specs.csv')

## Reduce the size of the data frames

In [11]:
## Function to reduce the DF size
## Reference https://www.kaggle.com/caesarlupum/ds-bowl-start-here-a-gentle-introduction
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [12]:
# train_data = reduce_mem_usage(train_data)
train_labels = reduce_mem_usage(train_labels)
test_data = reduce_mem_usage(test_data)
specs = reduce_mem_usage(specs)
session_df = reduce_mem_usage(session_df)

Mem. usage decreased to  0.49 Mb (48.2% reduction)
Mem. usage decreased to 79.40 Mb (18.2% reduction)
Mem. usage decreased to  0.01 Mb (0.0% reduction)
Mem. usage decreased to 3994.70 Mb (23.2% reduction)


In [13]:
# helper function to decompose dates into more relevant parts
def explode_date(col):
    dayofyear = col.dt.dayofyear
    weekofyear = col.dt.weekofyear
    weekday = col.dt.weekday
    month = col.dt.month
    year = col.dt.year
    hour = col.dt.hour
    quarter = col.dt.quarter
    return pd.DataFrame({'dayofyear':dayofyear,
                         'weekofyear':weekofyear,
                         'weekday':weekday,
                         'quarter':quarter,
                         'month':month,
                         'year':year,
                         'hour':hour})

Main funtion that aggregates the sequence data

In [14]:
def aggregate(df,group_by_col):
    
    first_col = df.groupby(group_by_col,as_index=False)\
                  .count()\
                  .loc[:,group_by_col]
    
    # Number of events
    event_counts = df.groupby(group_by_col)\
                             .agg({'game_session':'count'})\
                             .rename(columns={'game_session':'num_events'})\
                             .reset_index()
    # flag to last event
    last_event = df.groupby(group_by_col,as_index=False)\
                            .agg({'timestamp':'max'})
    last_event['last_event_flag'] = 1
    session_df_last_event_flagged = df.merge(last_event,on=[group_by_col,'timestamp'],how='left')
    # get information about the assessment being predicted (title and world)
    assessment_title = session_df_last_event_flagged\
                          .loc[session_df_last_event_flagged.last_event_flag == 1,'title']
    assessment_world = session_df_last_event_flagged\
                          .loc[session_df_last_event_flagged.last_event_flag==1,'world']
    
    # get number of types of events (ie clips, activities,...)
    type_count = df.groupby([group_by_col,'type'],as_index=False)\
                           .agg({'timestamp':'count'})\
                           .rename(columns={'timestamp':'num'})\
                           .pivot(index=group_by_col,columns='type', values='num',)\
                           .fillna(0)\
                           .reset_index()
    # Counts number of sessions associated with each type
    type_session_count = df.groupby([group_by_col,'type'],as_index=False)\
                                   .agg({'game_session':'nunique'})\
                                   .rename({'game_session':'ct'})\
                                   .pivot(index=group_by_col,columns='type',values='game_session')\
                                   .fillna(0)\
                                   .reset_index()

    # Get time information about when the assessment was started
    assessment_ts = session_df_last_event_flagged\
                          .loc[session_df_last_event_flagged.last_event_flag == 1,'timestamp']
    assessment_ts_explode = explode_date(pd.to_datetime(assessment_ts))
    
    # Explode event type
    event_id_counts = session_df.groupby(['session','event_id'],as_index=False)\
                            .agg({'timestamp':'count'})\
                            .pivot(index='session',columns='event_id',values='timestamp')\
                            .fillna(0)
    
    event_id_counts.columns = [col+'_count' for col in event_id_counts.columns]


    out = pd.DataFrame({'installation_id':first_col.to_numpy(),
                        'event_counts':event_counts.num_events.to_numpy(),
                        'assessment_title':assessment_title.to_numpy(),
                        'assessment_world':assessment_world.to_numpy(),
                        'activity_count_events':type_count['Activity'].to_numpy(),
                        'assessment_count_events':type_count['Assessment'].to_numpy(),
                        'clip_count_events':type_count['Clip'].to_numpy(),
                        'game_count_events':type_count['Game'].to_numpy(),
                        'activity_count_sessions':type_session_count['Activity'].to_numpy(),
                        'assessment_count_sessions':type_session_count['Assessment'].to_numpy(),
                        'clip_count_sessions':type_session_count['Clip'].to_numpy(),
                        'game_count_sessions':type_session_count['Game'].to_numpy(),
                        'dayofyear_assess':assessment_ts_explode.dayofyear.to_numpy(),
                        'weekofyear_assess':assessment_ts_explode.weekofyear.to_numpy(),
                        'weekday_assess':assessment_ts_explode.weekday.to_numpy(),
                        'quarter_assess':assessment_ts_explode.quarter.to_numpy(),
                        'month_assess':assessment_ts_explode.month.to_numpy(),
                        'hour_assess':assessment_ts_explode.hour.to_numpy()})
    
    return pd.concat([out,event_id_counts],axis=1)
    
seq_agg = aggregate(session_df,'session')

In [15]:
# concatenate labels to features
train_data_agg = pd.concat([seq_agg,train_labels.accuracy_group],axis=1)

In [19]:
# save aggregated data as csv
train_data_agg.to_csv(path/'data_agg.csv')

# Modeling

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import cohen_kappa_score,confusion_matrix, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [28]:
train_data_agg = pd.read_csv(path/'data_agg.csv',index_col=0)

In [43]:
train,test = train_test_split(train_data_agg,test_size=.2)
train_x = train.iloc[:,:-1]
train_y = train.iloc[:,-1]
test_x = test.iloc[:,:-1]
test_y = test.iloc[:,-1]

In [44]:
preprocessor = ColumnTransformer([('drop','drop',['installation_id','dayofyear_assess',
                                                  'weekofyear_assess','weekday_assess',
                                                  'quarter_assess','month_assess',
                                                  'hour_assess']),
                                  ('onehot',OneHotEncoder(),['assessment_title',
                                                             'assessment_world'])],
                                 remainder='passthrough')

In [45]:
pipe = Pipeline([('col_transform',preprocessor),
                 ('cls',RandomForestClassifier(n_estimators=100))])

In [47]:
params = {'cls__n_estimators':range(1,300,10),
          'cls__max_depth':range(1,15,2),
          'cls__min_samples_leaf':range(10,50,2)}

In [67]:
# Hyperparameter tuning and cross validation
cv = RandomizedSearchCV(pipe,params,n_jobs=12,n_iter=10)
cv.fit(train_x,train_y)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('col_transform',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='passthrough',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('drop',
                                                                               'drop',
                                                                               ['installation_id',
                                                                                'dayofyear_assess',
                                                                                'weekofyear_assess',
       

In [66]:
preds = cv.predict(test_x)
cohen_kappa_score(test_y,preds,weights='quadratic')

0.20545560745040026

## Prepare Kaggle Submission (work in progress)

In [42]:
# def prepare_submission():
#     test_agg = aggregate(test_data,'installation_id')
#     test_predictions = cv.predict(test_agg)
#     submission_df = pd.DataFrame({'installation_id':test_agg.installation_id.to_numpy(),
#                                   'accuracy_group':test_predictions})
#     return submission_df
# test = prepare_submission()

In [None]:
# test.to_csv('submission.csv',index=False)

In [161]:
# test.to_csv('submission/submission.csv',index=False)