In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
train = pd.read_csv("train.csv")
questions = pd.read_csv("questions.csv")
lectures = pd.read_csv("lectures.csv")

# Print the shapes
train.shape, lectures.shape, questions.shape

((101230332, 10), (418, 4), (13523, 5))

In [20]:
# Copy the three dataframes

df_train = train.copy()
df_questions = questions.copy()
df_lectures = lectures.copy()

In [21]:
# Summarize the train dataset
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101230332 entries, 0 to 101230331
Data columns (total 10 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int64  
 3   content_id                      int64  
 4   content_type_id                 int64  
 5   task_container_id               int64  
 6   user_answer                     int64  
 7   answered_correctly              int64  
 8   prior_question_elapsed_time     float64
 9   prior_question_had_explanation  object 
dtypes: float64(1), int64(8), object(1)
memory usage: 7.5+ GB


In [22]:
# Summarize the lectures dataset
df_lectures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   lecture_id  418 non-null    int64 
 1   tag         418 non-null    int64 
 2   part        418 non-null    int64 
 3   type_of     418 non-null    object
dtypes: int64(3), object(1)
memory usage: 13.2+ KB


In [23]:
# Merge df_train and df_lectures

df_train = df_train.merge(df_lectures, how="left", 
                          left_on="content_id", right_on="lecture_id")

# Print the shape
df_train.shape

(101230332, 14)

In [24]:
# Quick summarize of df_questions
df_questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   question_id     13523 non-null  int64 
 1   bundle_id       13523 non-null  int64 
 2   correct_answer  13523 non-null  int64 
 3   part            13523 non-null  int64 
 4   tags            13522 non-null  object
dtypes: int64(4), object(1)
memory usage: 528.4+ KB


In [28]:
# Merge df_train and df_questions

df_train = df_train.merge(df_questions, how="left", 
                          left_on="content_id", right_on="question_id")

# Print the shape
df_train.shape

(101230332, 19)

In [30]:
df_train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,part_x,type_of,question_id,bundle_id,correct_answer,part_y,tags
0,0,0,115,5692,0,1,3,1,,,,,,,5692.0,5692.0,3.0,5.0,151
1,1,56943,115,5716,0,2,2,1,37000.0,False,,,,,5716.0,5716.0,2.0,5.0,168
2,2,118363,115,128,0,0,0,1,55000.0,False,,,,,128.0,128.0,0.0,1.0,131 149 92
3,3,131167,115,7860,0,3,0,1,19000.0,False,,,,,7860.0,7860.0,0.0,1.0,131 104 81
4,4,137965,115,7922,0,4,1,1,11000.0,False,,,,,7922.0,7922.0,1.0,1.0,131 149 92


In [29]:
# How many user ids?

df_train.user_id.nunique()

393656

In [31]:
# Compute how many interactions each user id has 
df_train.groupby("user_id").timestamp.count()

user_id
115            46
124            30
2746           20
5382          128
8623          112
             ... 
2147470770    228
2147470777    758
2147481750     50
2147482216    280
2147482888     27
Name: timestamp, Length: 393656, dtype: int64

In [32]:
# Create a mask for user_di 115
mask115 = (df_train.user_id == 115)

# Substract user 115's data
user115 = df_train[mask115]

# Print the shape
user115.shape

(46, 19)

In [33]:
# Take a look at the train data from user 115
user115

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,part_x,type_of,question_id,bundle_id,correct_answer,part_y,tags
0,0,0,115,5692,0,1,3,1,,,,,,,5692.0,5692.0,3.0,5.0,151
1,1,56943,115,5716,0,2,2,1,37000.0,False,,,,,5716.0,5716.0,2.0,5.0,168
2,2,118363,115,128,0,0,0,1,55000.0,False,,,,,128.0,128.0,0.0,1.0,131 149 92
3,3,131167,115,7860,0,3,0,1,19000.0,False,,,,,7860.0,7860.0,0.0,1.0,131 104 81
4,4,137965,115,7922,0,4,1,1,11000.0,False,,,,,7922.0,7922.0,1.0,1.0,131 149 92
5,5,157063,115,156,0,5,2,1,5000.0,False,,,,,156.0,156.0,2.0,1.0,131 101 162 38
6,6,176092,115,51,0,6,0,1,17000.0,False,,,,,51.0,51.0,0.0,1.0,131 187 81
7,7,194190,115,50,0,7,3,1,17000.0,False,,,,,50.0,50.0,3.0,1.0,131 101 38
8,8,212463,115,7896,0,8,2,1,16000.0,False,,,,,7896.0,7896.0,2.0,1.0,131 104 162 81
9,9,230983,115,7863,0,9,0,1,16000.0,False,,,,,7863.0,7863.0,0.0,1.0,131 68 92


In [34]:
# Create a mask for user_di 115
mask124 = (df_train.user_id == 124)

# Substract user 115's data
user124 = df_train[mask124]

# Print the shape
user124.shape

(30, 19)

In [35]:
user124

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,part_x,type_of,question_id,bundle_id,correct_answer,part_y,tags
46,46,0,124,7900,0,0,0,1,,,,,,,7900.0,7900.0,0.0,1.0,131 93 81
47,47,32683,124,7876,0,1,0,0,26000.0,False,,,,,7876.0,7876.0,3.0,1.0,10 94 92
48,48,62000,124,175,0,2,2,1,29000.0,False,,,,,175.0,175.0,2.0,1.0,9 10 92
49,49,83632,124,1278,0,3,1,0,26000.0,False,,,,,1278.0,1278.0,3.0,2.0,143 140 81 29
50,50,189483,124,2064,0,4,2,0,18000.0,False,,,,,2064.0,2063.0,1.0,3.0,157 92 29
51,51,189483,124,2063,0,4,3,0,18000.0,False,,,,,2063.0,2063.0,0.0,3.0,136 92 29
52,52,189483,124,2065,0,4,2,1,18000.0,False,,,,,2065.0,2063.0,2.0,3.0,136 162 92 29
53,53,258793,124,3364,0,5,2,0,33333.0,False,,,,,3364.0,3363.0,1.0,4.0,136 103 29
54,54,258793,124,3365,0,5,3,0,33333.0,False,,,,,3365.0,3363.0,2.0,4.0,136 103 29
55,55,258793,124,3363,0,5,0,0,33333.0,False,,,,,3363.0,3363.0,3.0,4.0,74 103 29
