# Data Acquisition

In [1]:
# Import pandas
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pyspark.sql
from pyspark.sql.functions import *

import acquire

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [None]:
df_train = spark.read.csv('train.csv', header=True, inferSchema=True)
df_lectures = spark.read.csv('lectures.csv', header=True, inferSchema=True)
df_questions = spark.read.csv('questions.csv', header=True, inferSchema=True)

In [2]:
train_data_types_dict = {
    'timestamp': np.int64,
    'user_id': np.int32,
    'content_id': np.int16,
    'content_type_id': np.int16,
    'task_container_id' : np.int16,
    'user_answer' : np.int8,
    'answered_correctly': np.int8,
    'prior_question_elapsed_time': np.float16,
}

lectures_data_types_dict = {
    'lecture_id' : np.int16,
    'tag' : np.int8,
    'part' : np.int8
}

questions_data_types_dict = {
'question_id' : np.int16,
'bundle_id' : np.int16,
'part' : np.int8
}

In [3]:
df_train = pd.read_csv('train.csv', dtype=train_data_types_dict)
df_lectures = pd.read_csv('lectures.csv', dtype=lectures_data_types_dict)
df_questions = pd.read_csv('questions.csv', dtype=questions_data_types_dict)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101230332 entries, 0 to 101230331
Data columns (total 10 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int32  
 3   content_id                      int16  
 4   content_type_id                 int16  
 5   task_container_id               int16  
 6   user_answer                     int8   
 7   answered_correctly              int8   
 8   prior_question_elapsed_time     float16
 9   prior_question_had_explanation  object 
dtypes: float16(1), int16(3), int32(1), int64(2), int8(2), object(1)
memory usage: 3.6+ GB


In [24]:
df_merged = df_train.merge(df_lectures, left_on='content_id', right_on='lecture_id', how='left')
df = df_merged.merge(df_questions, left_on='content_id', right_on='question_id', how='left')

df = df.fillna(0)
df.lecture_id = df.lecture_id.astype(np.int16)
df.tag = df.tag.astype(np.int8)
df.part_x = df.part_x.astype(np.int8)
df.part_y = df.part_y.astype(np.int8)
df.question_id = df.question_id.astype(np.int16)
df.bundle = df.bundle_id.astype(np.int16)
df.lecture_id = df.lecture_id.astype(np.int32)

  # Remove the CWD from sys.path while we load stuff.


# Handling the memory issue

In [10]:
df.shape

(101230332, 19)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101230332 entries, 0 to 101230331
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int32  
 3   content_id                      int16  
 4   content_type_id                 int16  
 5   task_container_id               int16  
 6   user_answer                     int8   
 7   answered_correctly              int8   
 8   prior_question_elapsed_time     float16
 9   prior_question_had_explanation  object 
 10  lecture_id                      float64
 11  tag                             int8   
 12  part_x                          int8   
 13  type_of                         object 
 14  question_id                     int16  
 15  bundle_id                       float64
 16  correct_answer                  float64
 17  part_y                 

In [21]:
user_ids = df.user_id.value_counts().sort_index().index.to_list()

In [25]:
len(user_ids)

393656

In [26]:
int(len(user_ids) * .5)

196828

In [27]:
train_users = user_ids[:196828]

In [33]:
user_ids[99998:99999]

[547588483]

In [34]:
df_train.loc[df_train.user_id <= 547588483]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,36992.0,False
2,2,118363,115,128,0,0,0,1,55008.0,False
3,3,131167,115,7860,0,3,0,1,19008.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False
...,...,...,...,...,...,...,...,...,...,...
10123028,10123028,157429652,219543270,2121,0,88,3,0,36672.0,True
10123029,10123029,157429652,219543270,2120,0,88,1,0,36672.0,True
10123030,10123030,157429652,219543270,2122,0,88,3,1,36672.0,True
10123031,10123031,157745849,219543270,1795,0,89,2,1,36672.0,True


In [13]:
df_train.user_id.sample()

In [14]:
train.shape

(10123033, 19)

0.0        97967581
89.0          24468
100.0         18284
185.0         47470
192.0         11073
             ...   
32535.0        3666
32570.0        2910
32604.0           4
32625.0        8031
32736.0        8013
Name: lecture_id, Length: 418, dtype: int64