# Data Acquisition

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

import random
random.seed(123)

# Acquire the data

In [None]:
# Read the primary dataset, keeping only the columns at positions 1 through 9.
df = pd.read_csv('train.csv', usecols=list(range(1, 10)))

# Training dataset
To reduce the memory size of our training data, we selected a random sample of 100,000 students from `train.csv`.
1. Select 100,000 user ids from `train.csv` to use as our train dataset.
2. Cast numeric column data types using an appropriate precision to reduce memory size.

## Sample training data
- Select 100,000 users at random from those with more than 10 rows.

In [None]:
# Count rows per user once; the counts are reused for both the filter and
# the index lookup instead of scanning the full frame twice.
rows_per_user = df['user_id'].value_counts()

# Find all users with more than 10 rows.
user_ids = rows_per_user[rows_per_user > 10].index.to_list()

# Select a random sample of 100_000 user_ids (without replacement).
# random.sample raises ValueError if fewer than 100_000 users qualify.
sampled_ids = random.sample(user_ids, 100_000)

In [None]:
# Confirm how many user ids were sampled (random.sample was asked for 100_000).
len(sampled_ids)

In [None]:
def sampled_users(df, n_users=100_000, min_rows=10):
    '''
    Return a random sample of user ids drawn from `df`.

    Only users with strictly more than `min_rows` rows in `df` are eligible.
    The sample is drawn without replacement using the `random` module, so
    it is reproducible when `random.seed` has been set beforehand.


    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data from `train.csv`; must contain a `user_id` column.
    n_users : int, default 100_000
        Number of user ids to sample.
    min_rows : int, default 10
        A user is eligible only if it has more than this many rows.


    Returns
    -------
    sampled_ids : list
        `n_users` distinct user ids.


    Raises
    ------
    ValueError
        Raised by `random.sample` when fewer than `n_users` users qualify.
    '''
    # Count rows per user once and reuse the result for the filter.
    rows_per_user = df['user_id'].value_counts()
    user_ids = rows_per_user[rows_per_user > min_rows].index.to_list()

    sampled_ids = random.sample(user_ids, n_users)
    return sampled_ids

In [None]:
# Using the selected user_ids, keep every row that belongs to a sampled user.
# NOTE(review): `filtered_data` is not used anywhere in the visible notebook —
# confirm it can be dropped.
filtered_data = pd.DataFrame()

data = df.loc[df['user_id'].isin(sampled_ids)]

In [None]:
# Count the distinct users in the filtered data (expected: 100_000).
data.user_id.nunique()

In [None]:
# Cache local file for next steps.
# data.to_csv('sampled_train.csv', index=False)

# Dataset loads correctly.
# df = pd.read_csv('sampled_train.csv')

# Casting data types

In [None]:
def datatype_converter():
    '''
    Return the column-to-dtype mappings used when reading the csv files.

    The narrow integer types keep the memory footprint of the loaded
    dataframes small. Each dictionary can be passed directly as the
    `dtype` argument of `pd.read_csv`.


    Parameters
    ----------
    None


    Returns
    -------
    train_data_types_dict : dict
        Column dtypes for the training data.
    lectures_data_types_dict : dict
        Column dtypes for `lectures.csv`.
    questions_data_types_dict : dict
        Column dtypes for `questions.csv`.
    '''

    train_data_types_dict = {
        'timestamp': np.int64,
        'user_id': np.int32,
        'content_id': np.int16,
        'content_type_id': np.int16,
        'task_container_id': np.int16,
        'user_answer': np.int8,
        'answered_correctly': np.int8,
        # float16 only has an 11-bit significand (23000 is stored as 23008)
        # and overflows to inf above 65504, so use float32 for elapsed times.
        'prior_question_elapsed_time': np.float32,
    }

    lectures_data_types_dict = {
        'lecture_id': np.int16,
        # NOTE(review): int8 tops out at 127 — confirm every lecture tag fits.
        'tag': np.int8,
        'part': np.int8,
    }

    questions_data_types_dict = {
        'question_id': np.int16,
        'bundle_id': np.int16,
        'part': np.int8,
    }

    return train_data_types_dict, lectures_data_types_dict, questions_data_types_dict

In [None]:
def sampled_train():
    '''
    Load the sampled training set, building and caching it on first use.

    If `sampled_train.csv` already exists it is read and returned as is.
    Otherwise `train.csv` is read, a random sample of 100_000 users with
    more than 10 rows is drawn with `sampled_users`, the rows of those
    users are written to `sampled_train.csv`, and that subset is returned.


    Parameters
    ----------
    None


    Returns
    -------
    data : pandas.core.frame.DataFrame
        A pandas dataframe holding the rows of the 100,000 randomly
        selected users.
    '''
    cache_path = 'sampled_train.csv'
    column_types, _, _ = datatype_converter()

    # Fast path: reuse the cached sample when it is already on disk.
    if os.path.isfile(cache_path):
        return pd.read_csv(cache_path, index_col=False, dtype=column_types)

    # Slow path: read the full training data and draw a fresh sample.
    full_train = pd.read_csv('train.csv',
                             dtype=column_types,
                             usecols=[1,2,3,4,5,6,7,8,9])

    chosen_ids = sampled_users(full_train)
    belongs_to_sample = full_train['user_id'].isin(chosen_ids)
    sampled_data = full_train.loc[belongs_to_sample]

    # Cache the sample so later calls can skip the full read.
    sampled_data.to_csv(cache_path, index=False)

    return sampled_data

Create a function to reproduce the modified training set.

In [None]:
# Test the function: build the sampled training set, or load it from the cached csv.
df = sampled_train()

In [None]:
# It works: show the number of rows for each sampled user.
df.user_id.value_counts()

In [None]:
# Check our data types and memory usage after casting with the train dtypes.
df.info()

# Merge datasets together

In [None]:
# Column dtypes for the two lookup tables (the train dtypes are not needed here).
_, lecture_dtypes, question_dtypes = datatype_converter()

df_lectures = pd.read_csv('lectures.csv', dtype=lecture_dtypes)
df_questions = pd.read_csv('questions.csv', dtype=question_dtypes)

# Left join the training rows to df_lectures, matching `content_id` to `lecture_id`.
# Every training row is kept; rows with no matching lecture get missing values.
df_merged = df.merge(df_lectures, left_on='content_id', right_on='lecture_id', how='left')

# Left join df_merged to df_questions, matching `content_id` to `question_id`.
# NOTE(review): both dtype dicts list a `part` column — presumably pandas
# suffixes the two as `part_x` / `part_y`; confirm and rename if needed.
# NOTE(review): both joins key on `content_id` alone — confirm that lecture ids
# and question ids cannot collide.
df_data = df_merged.merge(df_questions, left_on='content_id', right_on='question_id', how='left')

In [None]:
# Inspect columns, dtypes and memory usage of the merged dataframe.
df_data.info()

In [3]:
import acquire

In [4]:
# Load the dataset through the project's `acquire` module (rebinds `df`).
df = acquire.get_riiid_data()

In [5]:
# Preview the first rows of the loaded dataframe.
df.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,0,24600,7900,0,0,0,1,0.0,0,0,0,0,0,7900,7900,0.0,1,131 93 81
1,25379,24600,7876,0,1,2,0,24000.0,False,0,0,0,0,7876,7876,3.0,1,10 94 92
2,50137,24600,175,0,2,2,1,23008.0,False,0,0,0,0,175,175,2.0,1,9 10 92
3,70181,24600,1278,0,3,3,1,22000.0,False,0,0,0,0,1278,1278,3.0,2,143 140 81 29
4,148601,24600,2064,0,4,2,0,18000.0,False,0,0,0,0,2064,2063,1.0,3,157 92 29


In [6]:
# Containers for the three splits.
train = pd.DataFrame()
validate = pd.DataFrame()
test = pd.DataFrame()

# Set up the train size
# Fractions of each user's rows assigned to train and validate;
# whatever remains after those two goes to test.
train_size = 0.8
validate_size = 0.1

# Every distinct user in the loaded data, in order of first appearance.
sampled_ids = df.user_id.unique()

In [None]:
# Split each user's rows, in their existing order, into train / validate / test:
# the leading `train_size` share goes to train, the next `validate_size` share
# to validate, and the remainder to test.
#
# The rows are grouped once and each split is concatenated once. Filtering the
# whole frame per user and growing three frames with pd.concat on every
# iteration repeats work proportional to the data size for every user.
rows_by_user = df.groupby('user_id', sort=False)

train_parts = []
validate_parts = []
test_parts = []

for user in sampled_ids:
    # Raises KeyError if `user` is not present in `df`; cannot happen while
    # `sampled_ids` is derived from `df.user_id`.
    data = rows_by_user.get_group(user)
    n = data.shape[0]

    # int() truncates, so any rounding remainder ends up in the test split.
    train_end_index = int(train_size * n)
    validate_end_index = train_end_index + int(validate_size * n)

    train_parts.append(data.iloc[:train_end_index])
    validate_parts.append(data.iloc[train_end_index:validate_end_index])
    test_parts.append(data.iloc[validate_end_index:])

# pd.concat rejects an empty list, so fall back to an empty frame when there are no users.
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
validate = pd.concat(validate_parts) if validate_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [None]:
# Show the shapes of the train, validate and test splits.
train.shape, validate.shape, test.shape

In [None]:
# Inspect columns, dtypes and memory usage of the train split.
train.info()

In [None]:
# Load the dataset again through the `acquire` module, rebinding `df`.
df = acquire.get_riiid_data()