In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

### Acquire the train.csv

In [2]:
# Loade the csv files
train = pd.read_csv("train.csv")

# Inspect the shapes
train.shape

(101230332, 10)

In [3]:
# Take a quck peek of the dataframe
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


In [4]:
# Summarize the train dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101230332 entries, 0 to 101230331
Data columns (total 10 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int64  
 3   content_id                      int64  
 4   content_type_id                 int64  
 5   task_container_id               int64  
 6   user_answer                     int64  
 7   answered_correctly              int64  
 8   prior_question_elapsed_time     float64
 9   prior_question_had_explanation  object 
dtypes: float64(1), int64(8), object(1)
memory usage: 7.5+ GB


**Takeaways**
- The train dataset has more than 100 millions records. I only need to small portion to calculate the statistics of the past.<br>
    -**I will take the first 1000 user ids.**

In [8]:
# Extract the first 1000 user ids

user_ids1000 = train.user_id.value_counts().sort_index().iloc[0: 1000]
user_ids1000 = user_ids1000.index.to_list()

# Sanity check
print(len(user_ids1000))

# Print the first 5 user ids
user_ids1000[:5]

1000


[115, 124, 2746, 5382, 8623]

In [10]:
# Set the user_id to the index

train = train.set_index("user_id")
train.head()

Unnamed: 0_level_0,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
115,0,0,5692,0,1,3,1,,
115,1,56943,5716,0,2,2,1,37000.0,False
115,2,118363,128,0,0,0,1,55000.0,False
115,3,131167,7860,0,3,0,1,19000.0,False
115,4,137965,7922,0,4,1,1,11000.0,False


In [11]:
# Subtracts the records for the first 1000 users
train1000 = train.loc[user_ids1000]

# Take a peek at the new dataset
train1000.head()

Unnamed: 0_level_0,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
115,0,0,5692,0,1,3,1,,
115,1,56943,5716,0,2,2,1,37000.0,False
115,2,118363,128,0,0,0,1,55000.0,False
115,3,131167,7860,0,3,0,1,19000.0,False
115,4,137965,7922,0,4,1,1,11000.0,False


In [12]:
# Sanity check
train1000.tail() # should match the tail of user_ids1000

Unnamed: 0_level_0,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5577726,296610,625311764,5328,0,7,0,0,20000.0,False
5577726,296611,625327721,4303,0,8,3,0,10000.0,True
5577726,296612,625344560,4445,0,9,0,1,6000.0,True
5577726,296613,625367161,5206,0,10,1,0,6000.0,True
5577726,296614,625378759,3928,0,11,0,0,13000.0,True


In [14]:
# Print the tail of the user_ids1000
user_ids1000[999]

5577726

In [15]:
# # Save the train1000 as the csv
# train1000.to_csv("train1000.csv")

In [16]:
# Quick summarize the train1000
train1000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 296615 entries, 115 to 5577726
Data columns (total 9 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   row_id                          296615 non-null  int64  
 1   timestamp                       296615 non-null  int64  
 2   content_id                      296615 non-null  int64  
 3   content_type_id                 296615 non-null  int64  
 4   task_container_id               296615 non-null  int64  
 5   user_answer                     296615 non-null  int64  
 6   answered_correctly              296615 non-null  int64  
 7   prior_question_elapsed_time     289945 non-null  float64
 8   prior_question_had_explanation  295617 non-null  object 
dtypes: float64(1), int64(7), object(1)
memory usage: 22.6+ MB


**Takeaways**
- The memoery usage of the first 1000 users are only 23 MB
- Missing values are found in columns:
    - prior_question_elapsed_time
    - prior_question_had_explanantion

### Handling the null values

In [20]:
# The values of the two prior columns will always be 0 when 
# Take a first user 115 for example

train.loc[115]

Unnamed: 0_level_0,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
115,0,0,5692,0,1,3,1,,
115,1,56943,5716,0,2,2,1,37000.0,False
115,2,118363,128,0,0,0,1,55000.0,False
115,3,131167,7860,0,3,0,1,19000.0,False
115,4,137965,7922,0,4,1,1,11000.0,False
115,5,157063,156,0,5,2,1,5000.0,False
115,6,176092,51,0,6,0,1,17000.0,False
115,7,194190,50,0,7,3,1,17000.0,False
115,8,212463,7896,0,8,2,1,16000.0,False
115,9,230983,7863,0,9,0,1,16000.0,False


In [21]:
train.index.nuniuqe()

AttributeError: 'Int64Index' object has no attribute 'nuniuqe'