In [None]:
%%HTML

<style type="text/css">
/* Banner layout shared by both heading styles. */
div.h1,
div.h2 {
    color: white;
    padding: 8px 300px 8px 8px;
    font-size: 35px;
    max-width: 1500px;
    margin: 50px auto auto;
}

/* Only the background colour differs between the two heading levels. */
div.h1 {
    background-color: #339933;
}

div.h2 {
    background-color: #83ccd2;
}
</style>

# <div class=h1>About this notebook</div>

In this notebook, I'll load the training data and make some sample plots to gain insight into the data.

In version 1, the data was too large to read and visualize easily, so my first goal was simply to load and visualize it. This notebook will be updated further.

In this competition, your challenge is to create algorithms for "Knowledge Tracing," the modeling of student knowledge over time. The goal is to accurately predict how students will perform on future interactions.

Our innovative algorithms will help tackle global challenges in education. If successful, it’s possible that any student with an Internet connection can enjoy the benefits of a personalized learning experience, regardless of where they live. 



<img src="https://storage.googleapis.com/kaggle-media/competitions/Riiid/Graphic%20or%20image%20within%20description%20(min%20size%20350x350).png" width="300">

# <div class=h2> visualization</div>

### load library.

In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns

In [None]:
!ls ../input/riiid-test-answer-prediction/

## Load data

I'll load the data. train.csv in particular is very big (5.45 GB !!!), so we have to get creative.

First, I specified dtypes for some of the columns. Second, I used the reduce_mem_usage function. This is knowledge from the "ASHRAE - Great Energy Predictor III" competition.

In [None]:
# Fix the dtypes of a few columns at parse time so pandas does not
# fall back to its default 64-bit types for them while reading the file.
train = pd.read_csv(
    "../input/riiid-test-answer-prediction/train.csv",
    dtype={
        "content_type_id": "int8",
        "task_container_id": "int32",
        "user_answer": "int8",
        "answered_correctly": "int8",
    },
)


In [None]:
#From https://www.kaggle.com/rohanrao/ashrae-half-and-half

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    The dataframe is modified in place and the same object is returned.

    Per-column behaviour:
      * object columns are converted to ``category``;
      * signed / unsigned integer columns are cast to the narrowest integer type
        of the same signedness whose range contains the column's min and max
        (bounds are inclusive, so e.g. a max of 127 still fits ``int8``);
      * float columns are cast to ``float32`` (or ``float16`` when
        ``use_float16`` is true) if their min and max lie within that type's
        finite range. Note that this can lose precision;
      * every other dtype (bool, datetime, timedelta, complex, categorical and
        other pandas extension dtypes) is left untouched.

    A column is never cast to a wider type than it already has, and a column
    whose min/max cannot be compared (empty, all-NaN, or containing inf) is
    left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    use_float16 : bool, default False
        Allow float columns to be stored as ``float16`` when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Categorical, tz-aware datetime, nullable and other extension dtypes
        # are not numpy dtypes; leave them exactly as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        # Dispatch on the numpy kind code rather than the dtype's string name,
        # so that bool ("b") and unsigned ("u") columns are not mistaken for floats.
        if col_type.kind == "i":
            candidates, limits = signed_types, np.iinfo
        elif col_type.kind == "u":
            candidates, limits = unsigned_types, np.iinfo
        elif col_type.kind == "f":
            candidates, limits = float_types, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered narrowest first; stop before anything that
            # would not actually shrink the column.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Comparisons against NaN are False, so empty / all-NaN columns
            # never match and keep their current dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
train = reduce_mem_usage(train, use_float16=True)

In [None]:
train.head()

In [None]:
questions = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")
lectures = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv")

In [None]:
train.dtypes

In [None]:
questions.head()

In [None]:
questions.dtypes

In [None]:
lectures.head()

In [None]:
lectures.dtypes

## Simple visualization

I'll check some features of the data with seaborn.

### train

There is too much data to plot at once, so I focus on user_id == 115.

In [None]:
# Distribution of interaction timestamps for the single user under study.
g_t1 = sns.distplot(train.loc[train["user_id"] == 115, "timestamp"])
g_t1.set_title("distplot of timestamp of user_id=115")

In [None]:
# Wide figure: this user touches many distinct content ids, one bar each.
plt.figure(figsize=(15, 5))
g_t2 = sns.countplot(data=train[train["user_id"]==115],x="content_id")
# The plot is a countplot, so the title says so.
g_t2.set_title("countplot of content_id of user_id=115")

In [None]:
# Number of rows per content_type_id value for this user.
g_t3 = sns.countplot(data=train[train["user_id"]==115],x="content_type_id")
# The plot is a countplot, so the title says so.
g_t3.set_title("countplot of content_type_id of user_id=115")

In [None]:
# Number of rows per user_answer value for this user.
g_t4 = sns.countplot(data=train[train["user_id"]==115],x="user_answer")
# The plot is a countplot, so the title says so.
g_t4.set_title("countplot of user_answer of user_id=115")

In [None]:
# Number of rows per answered_correctly value for this user.
g_t5 = sns.countplot(data=train[train["user_id"]==115],x="answered_correctly")
# The plot is a countplot, so the title says so.
g_t5.set_title("countplot of answered_correctly of user_id=115")

In [None]:
# Distribution of prior_question_elapsed_time for the single user under study.
g_t6 = sns.distplot(train.loc[train["user_id"] == 115, "prior_question_elapsed_time"])
g_t6.set_title("distplot of prior_question_elapsed_time of user_id=115")

In [None]:
# Number of rows per prior_question_had_explanation value for this user.
g_t7 = sns.countplot(data=train[train["user_id"]==115],x="prior_question_had_explanation")
# The plot is a countplot, so the title says so.
g_t7.set_title("countplot of prior_question_had_explanation of user_id=115")

In [None]:
#it takes so much time and memory...
#g2 = sns.countplot(data=train, x="user_id")
#g2.set_title("countplot of user_id")

### questions

In [None]:
# Wide canvas for the per-question_id bars.
plt.figure(figsize=(15, 5))
g_q1 = sns.countplot(x="question_id", data=questions)
g_q1.set_title("countplot of question_id")

In [None]:
# Number of questions per bundle_id.
g_q2 = sns.countplot(x="bundle_id", data=questions)
g_q2.set_title("countplot of bundle_id")

In [None]:
# Number of questions per correct_answer value.
g_q3 = sns.countplot(x="correct_answer", data=questions)
g_q3.set_title("countplot of correct_answer")

In [None]:
# Number of questions per part.
g_q4 = sns.countplot(x="part", data=questions)
g_q4.set_title("countplot of part")

In [None]:
# Wide canvas for the per-tags bars.
plt.figure(figsize=(15, 5))
g_q5 = sns.countplot(x="tags", data=questions)
g_q5.set_title("countplot of tags")

### lectures

In [None]:
# Wide canvas for the per-lecture_id bars.
plt.figure(figsize=(15, 5))
g_l1 = sns.countplot(x="lecture_id", data=lectures)
g_l1.set_title("countplot of lecture_id")

In [None]:
# Number of lectures per part.
g_l3 = sns.countplot(x="part", data=lectures)
g_l3.set_title("countplot of part")

In [None]:
# Distribution of the lecture tag values, in 50 bins.
g_l4 = sns.distplot(lectures["tag"],bins=50)
# The plot is a distplot, so the title says so.
g_l4.set_title("distplot of tag")

In [None]:
# Number of lectures per type_of value.
g_l5 = sns.countplot(x="type_of", data=lectures)
g_l5.set_title("countplot of type_of")