# Data Acquisition

In [1]:
# Import functions
import pandas as pd
import numpy as np
import math
import os
import acquire

import random
random.seed(123)

# Acquire the data

In [None]:
# Load the primary dataset, keeping only the columns at positions 1 through 9.
df = pd.read_csv('train.csv', usecols=list(range(1, 10)))

# Training dataset
To reduce the memory size of our training data, we selected a random sample of 100,000 students from `train.csv`.
1. Select 100,000 user ids from `train.csv` to use as our train dataset.
2. Cast numeric column data types using an appropriate precision to reduce memory size.

## Sample training data
- Selecting 100_000 users.

In [None]:
# Count the rows per user once, then keep the users with more than 10 rows.
rows_per_user = df['user_id'].value_counts()
user_ids = rows_per_user[rows_per_user > 10].index.to_list()

# Select a random sample of 100_000 user_ids (without replacement).
sampled_ids = random.sample(user_ids, 100_000)

In [None]:
# Display how many user ids were sampled.
len(sampled_ids)

In [None]:
def sampled_users(df, n=100_000):
    '''
    Randomly choose user ids that have more than 10 rows in `df`.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data with a `user_id` column, e.g. loaded from `train.csv`.
    n : int, default 100_000
        Number of user ids to draw.

    Returns
    -------
    sampled_ids : list
        `n` distinct user ids, drawn without replacement.

    Raises
    ------
    ValueError
        Raised by `random.sample` when fewer than `n` users qualify.
    '''
    # Count rows per user once; the counts are reused for the filter below.
    rows_per_user = df['user_id'].value_counts()
    user_ids = rows_per_user[rows_per_user > 10].index.to_list()

    # Uses the module-level RNG, so seed `random` beforehand to reproduce.
    return random.sample(user_ids, n)

In [None]:
# Keep every row that belongs to one of the sampled user_ids.
# NOTE(review): `filtered_data` is not used again in the visible cells.
filtered_data = pd.DataFrame()

data = df[df['user_id'].isin(sampled_ids)]

In [None]:
# Count the distinct users in the filtered data (100_000 were sampled).
data.user_id.nunique()

In [None]:
# Cache local file for next steps.
# data.to_csv('sampled_train.csv', index=False)

# Dataset loads correctly.
# df = pd.read_csv('sampled_train.csv')

# Casting data types

In [None]:
def datatype_converter():
    '''
    Build the column-name -> dtype mappings used when reading the csv files.

    Returns
    -------
    tuple of dict
        `(train, lectures, questions)` mappings, in that order. Each one
        maps a column name to the numpy dtype it should be read as.
    '''
    # NOTE(review): these are narrow dtypes -- int8 tops out at 127 and
    # float16 at 65504. Confirm every column's value range actually fits.
    train_dtypes = dict(
        timestamp=np.int64,
        user_id=np.int32,
        content_id=np.int16,
        content_type_id=np.int16,
        task_container_id=np.int16,
        user_answer=np.int8,
        answered_correctly=np.int8,
        prior_question_elapsed_time=np.float16,
    )

    lectures_dtypes = dict(
        lecture_id=np.int16,
        tag=np.int8,
        part=np.int8,
    )

    questions_dtypes = dict(
        question_id=np.int16,
        bundle_id=np.int16,
        part=np.int8,
    )

    return train_dtypes, lectures_dtypes, questions_dtypes

In [None]:
def sampled_train():
    '''
    Return the sampled training data, building and caching it if needed.

    When `sampled_train.csv` exists it is read and returned. Otherwise
    `train.csv` is read, users are drawn with `sampled_users` (users with
    more than 10 rows), their rows are written to `sampled_train.csv`,
    and that dataframe is returned.


    Parameters
    ----------
    None


    Returns
    -------
    data : pandas.core.frame.DataFrame
        The rows of the randomly selected users, with a fresh 0..n-1 index
        on both the cached and the freshly built path.
    '''
    train_dtypes, _, _ = datatype_converter()

    # Reuse the cached sample when it is available.
    if os.path.isfile('sampled_train.csv'):
        return pd.read_csv('sampled_train.csv',
                           index_col=False,
                           dtype=train_dtypes)

    # Read in `train.csv` data (columns at positions 1 through 9).
    df = pd.read_csv('train.csv', dtype=train_dtypes, usecols=[1,2,3,4,5,6,7,8,9])

    sampled_ids = sampled_users(df)

    # Drop the original row labels so the first call returns the same index
    # as later calls, which read the cache written with index=False.
    sampled_data = df.loc[df['user_id'].isin(sampled_ids)].reset_index(drop=True)

    # Cache local file of sampled data.
    sampled_data.to_csv('sampled_train.csv', index=False)

    # Return the dataframe
    return sampled_data

Create a function to reproduce the modified training set.

In [None]:
# Test the function: load the cached sample, or build it from `train.csv`.
df = sampled_train()

In [None]:
# It works: display the number of rows per user in the sampled data.
df.user_id.value_counts()

In [None]:
# Check the column data types and memory usage of the sampled data.
df.info()

# Merge datasets together

In [None]:
# Only the lectures and questions dtype mappings are needed here.
lecture_dtypes, question_dtypes = datatype_converter()[1:]

df_lectures = pd.read_csv('lectures.csv', dtype=lecture_dtypes)
df_questions = pd.read_csv('questions.csv', dtype=question_dtypes)

# Left join the lectures onto the training rows, matching `content_id`
# to `lecture_id`; training rows without a match are kept.
# NOTE(review): the same `content_id` column is matched against both
# `lecture_id` and `question_id` -- confirm the two id spaces cannot
# collide, or also condition the joins on `content_type_id`.
df_merged = pd.merge(df, df_lectures, how='left',
                     left_on='content_id', right_on='lecture_id')

# Left join the questions onto the result, matching `content_id`
# to `question_id`.
df_data = pd.merge(df_merged, df_questions, how='left',
                   left_on='content_id', right_on='question_id')

In [None]:
# Inspect the columns, dtypes and memory usage of the merged data.
df_data.info()

In [None]:
import acquire

In [None]:
# Load the data through the project's `acquire` module (rebinds `df`).
df = acquire.get_riiid_data()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Empty frames that the per-user split appends to.
train, validate, test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

# Fraction of each user's rows for train and validate; the rest goes to test.
train_size, validate_size = 0.8, 0.1

# Every distinct user id in the data.
sampled_ids = df.user_id.unique()

In [None]:
# Collect each user's slices in lists and concatenate once at the end.
# Filtering the whole frame and re-concatenating the growing result for
# every user repeats work in proportion to (users x rows).
train_parts, validate_parts, test_parts = [], [], []

# `sampled_ids` is `df.user_id.unique()`, so grouping `df` by `user_id`
# visits the same users; sort=False keeps them in order of first appearance.
for user, data in df.groupby('user_id', sort=False):
    n = data.shape[0]

    # Row positions where this user's train and validate slices end.
    train_end_index = int(train_size * n)
    validate_end_index = train_end_index + int(validate_size * n)

    train_parts.append(data.iloc[:train_end_index])
    validate_parts.append(data.iloc[train_end_index:validate_end_index])
    test_parts.append(data.iloc[validate_end_index:])

# Append to the existing frames, as the per-user concatenation did.
train = pd.concat([train, *train_parts])
validate = pd.concat([validate, *validate_parts])
test = pd.concat([test, *test_parts])

In [None]:
# Display the shapes of the train, validate and test splits.
train.shape, validate.shape, test.shape

In [None]:
# Cache the three splits as csv files, without the row index.
train.to_csv('sampled_trainset.csv', index=False)
validate.to_csv('validate.csv', index=False)
test.to_csv('test.csv', index=False)

In [None]:
# Display the type of `train`.
type(train)

# Create a function to split data using Quasi-GroupKFold method
The way our data is currently split for MVP:
- 100,000 randomly selected users that have more than 10 interactions with Riiid's Knowledge Tracing application.
- Each user has _sequential_ data, indicated by the variable `timestamp`.
- Data is split using a percentage-based method.
- 0% - 80% of a user's data is in the training set.
- 80% - 90% of a user's data is in the validation set.
- 90% - 100% of a user's data is in the test set.

The way we split the data is important. As we currently have our splits, several issues arise that impact our data exploration and modeling performance.
- Splitting the data using a percentage-based method removes questions and lectures from our training data. If a model encounters a question it has never seen before, how can it accurately model reality? We need to have all questions appear at least once in our dataset.
    - If this was _purely_ a time series problem, that would be fine.
- The training set uses only 80% of a user's data, which impacts our statistical analysis. If we have _all_ of a user's data, we can correctly calculate population statistics from a _sample_ of users. We can then compare statistics on a user/grouped level with the population.

What is the solution?

> <strong>Splitting by users!</strong>

## Splitting by users
How does this solve our exploration and modeling issues?
> <strong>By creating users the model has never seen before!</strong>

This simulates _new_ users interacting with Riiid's Knowledge Tracing Application.

In [None]:
# Load the merged data through the project's `acquire` module.
df_data = acquire.get_riiid_data()

In [None]:
# Cast the lecture and question columns to pandas integer extension dtypes.
# `lecture_id` goes straight to Int32, its final dtype; it is not cast to
# Int16 first and then cast again.
integer_dtypes = {
    'lecture_id': 'Int32',
    'tag': 'Int8',
    'part_x': 'Int8',
    'part_y': 'Int8',
    'question_id': 'Int16',
    'bundle_id': 'Int16',
}

# Convert one column at a time, in place.
for column, dtype in integer_dtypes.items():
    df_data[column] = df_data[column].astype(dtype)

# Prefix part names with the originating dataframe name.
df_data.rename(columns={'part_x': 'lecture_part',
                        'part_y': 'question_part'},
                   inplace=True)

# Cache the data.
df_data.to_csv('riiid_data.csv', index=False)

### How many users are in the dataset?

In [None]:
# Collect the distinct user ids and report how many there are.
users = df_data.user_id.unique()
print(f'There are {len(users):,} users.')

### Create new data splits using percentage of users

In [None]:
# Fractions of the original dataset assigned to each split.
train_size = .8
validate_size = .1
test_size = .1

# Calculate the number of users in each dataset.
# Train is rounded down while validate and test are rounded up, so the
# three counts are not guaranteed to add up to len(users).
train_users = int(len(users)*train_size)
validate_users = math.ceil(len(users)*validate_size)
test_users = math.ceil(len(users)*test_size)

# Display the results.
print(f'Train set would contain {train_users:,} users')
print(f'Validate set would contain {validate_users:,} users')
print(f'Test set would contain {test_users:,} users')

In [None]:
# Check that the three split counts add up to the total number of users.
print(len(users) == sum([train_users, validate_users, test_users]))

# Show both totals for comparison.
print(len(users))
print(sum([train_users, validate_users, test_users]))

### Test a method to split users into separate datasets

In [None]:
# Set a random seed to reproduce splits
random.seed(123)

# Toy example: five user ids.
all_users = [1, 2, 3, 4, 5]

# Randomly select three of them as the train set users.
t_ids = random.sample(all_users, 3)

# Remove users assigned to the training set; display the users left over.
list(set(all_users) - set(t_ids))

### Create a function to reproduce data splits

In [None]:
def split_users(df, train_size=.8, validate_size=.1, test_size=.1):
    '''
    Randomly assign the users in `df` to train, validate and test groups.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data with a `user_id` column, e.g. the merged dataframe from
        acquire.get_riiid_data().
    train_size : float, default .8
        Fraction of users assigned to train (count is rounded down).
    validate_size : float, default .1
        Fraction of users assigned to validate (count is rounded up).
    test_size : float, default .1
        Not used in the calculation: test receives every user left over
        after train and validate. Kept so existing calls keep working.

    Returns
    -------
    train_ids, validate_ids, test_ids : list
        Three disjoint lists of user ids (not dataframes).

    Raises
    ------
    ValueError
        Raised by `random.sample` when the train or validate count is
        larger than the number of users available to draw from.
    '''
    # Re-seed the module-level RNG so the same data gives the same split.
    random.seed(123)

    # Gather all user ids
    user_ids = list(df['user_id'].unique())
    total_num = len(user_ids)

    # Calculate the number of users in train and validate.
    train_num = int(total_num*train_size)
    validate_num = math.ceil(total_num*validate_size)

    # Randomly select the train users.
    train_ids = random.sample(user_ids, train_num)

    # Remove user_ids assigned to the training set.
    remaining_val_test_users = list(set(user_ids) - set(train_ids))

    # Draw validate from the remainder; whatever is left becomes test.
    validate_ids = random.sample(remaining_val_test_users, validate_num)
    test_ids = list(set(remaining_val_test_users) - set(validate_ids))

    # Return the users assigned to train, validate, and test
    return train_ids, validate_ids, test_ids

In [None]:
# Assign the users in `df_data` to the train, validate and test groups.
train_ids, validate_ids, test_ids = split_users(df_data)

In [None]:
# Select each split's rows by matching `user_id` against its id list.
train = df_data.loc[df_data['user_id'].isin(train_ids)]
validate = df_data.loc[df_data['user_id'].isin(validate_ids)]
test = df_data.loc[df_data['user_id'].isin(test_ids)]

In [None]:
# Cache the user-level splits as csv files, without the row index.
train.to_csv('train_k.csv', index=False)
validate.to_csv('validate_k.csv', index=False)
test.to_csv('test_k.csv', index=False)

In [None]:
# Inspect the train split's columns, dtypes and memory usage.
train.info()

### Modify the function to use a subset of the data

In [2]:
def split_users(df, train_size=.8, validate_size=.1, test_size=.1, sample=True,
                sample_size=100_000):
    '''
    Randomly assign users in `df` to train, validate and test groups,
    optionally working from a random subset of the users.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data with a `user_id` column, e.g. the merged dataframe from
        acquire.get_riiid_data().
    train_size : float, default .8
        Fraction of the selected users assigned to train (rounded down).
    validate_size : float, default .1
        Fraction of the selected users assigned to validate (rounded up).
    test_size : float, default .1
        Not used in the calculation: test receives every selected user
        left over after train and validate.
    sample : bool, default True
        When true, first draw `sample_size` users at random and split
        only those; otherwise split every user in `df`.
    sample_size : int, default 100_000
        Number of users to draw when `sample` is true.

    Returns
    -------
    train_ids, validate_ids, test_ids : list
        Three disjoint lists of user ids (not dataframes).

    Raises
    ------
    ValueError
        Raised by `random.sample` when `sample_size`, or the train or
        validate count, exceeds the number of users available.
    '''
    # Re-seed the module-level RNG so the same data gives the same split.
    random.seed(123)

    # Gather all user ids, then optionally narrow them to a random sample.
    user_ids = list(df['user_id'].unique())
    if sample:
        user_ids = random.sample(user_ids, sample_size)

    # Calculate the number of users
    total_num = len(user_ids)

    # Calculate the number of users in train, validate. Remaining users go in test
    train_num = int(total_num*train_size)
    validate_num = math.ceil(total_num*validate_size)

    # Randomly select the train users.
    train_ids = random.sample(user_ids, train_num)

    # Remove user_ids assigned to the training set.
    remaining_val_test_users = list(set(user_ids) - set(train_ids))

    # Draw validate from the remainder; whatever is left becomes test.
    validate_ids = random.sample(remaining_val_test_users, validate_num)
    test_ids = list(set(remaining_val_test_users) - set(validate_ids))

    # Return the users assigned to train, validate, and test
    return train_ids, validate_ids, test_ids


def train_validate_test(df, sampled=True):
    '''
    Split `df` into train, validate and test dataframes by user.

    The user ids for each group come from `split_users`; each returned
    frame holds the rows of `df` whose `user_id` is in that group.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data with a `user_id` column.
    sampled : bool, default True
        Passed to `split_users` as its `sample` argument.

    Returns
    -------
    train, validate, test : pandas.core.frame.DataFrame
    '''
    id_groups = split_users(df, sample=sampled)

    # One row selection per id group, in train / validate / test order.
    train, validate, test = (df.loc[df['user_id'].isin(ids)] for ids in id_groups)

    return train, validate, test

In [3]:
# Build the splits from the acquired data (using the default `sampled=True`),
# then cache each split as a csv file without the row index.
train, validate, test = train_validate_test(acquire.get_riiid_data())
train.to_csv('mvp_train.csv', index=False)
validate.to_csv('mvp_validate.csv', index=False)
test.to_csv('mvp_test.csv', index=False)

In [4]:
# Inspect the train split's columns, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20478221 entries, 96 to 101228712
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int64  
 1   timestamp                       int64  
 2   user_id                         int32  
 3   content_id                      int16  
 4   content_type_id                 int16  
 5   task_container_id               int16  
 6   user_answer                     int8   
 7   answered_correctly              int8   
 8   prior_question_elapsed_time     float16
 9   prior_question_had_explanation  object 
 10  lecture_id                      Int32  
 11  tag                             Int8   
 12  lecture_part                    Int8   
 13  type_of                         object 
 14  question_id                     Int16  
 15  bundle_id                       Int16  
 16  correct_answer                  float64
 17  question_part          

### Modify the function to use a subset of the data: Sample of 50K users

In [3]:
def split_users(df, train_size=.8, validate_size=.1, sample=True,
                sample_size=50_000):
    '''
    Randomly assign users in `df` to train, validate and test groups,
    optionally working from a random subset of the users.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data with a `user_id` column, e.g. the merged dataframe from
        acquire.get_riiid_data().
    train_size : float, default .8
        Fraction of the selected users assigned to train (rounded down).
    validate_size : float, default .1
        Fraction of the selected users assigned to validate (rounded up).
        Every selected user not in train or validate goes to test.
    sample : bool, default True
        When true, first draw `sample_size` users at random and split
        only those; otherwise split every user in `df`.
    sample_size : int, default 50_000
        Number of users to draw when `sample` is true.

    Returns
    -------
    train_ids, validate_ids, test_ids : list
        Three disjoint lists of user ids (not dataframes).

    Raises
    ------
    ValueError
        Raised by `random.sample` when `sample_size`, or the train or
        validate count, exceeds the number of users available.
    '''
    # Re-seed the module-level RNG so the same data gives the same split.
    random.seed(123)

    # Gather all user ids, then optionally narrow them to a random sample.
    user_ids = list(df['user_id'].unique())
    if sample:
        user_ids = random.sample(user_ids, sample_size)

    # Calculate the number of users
    total_num = len(user_ids)

    # Calculate the number of users in train, validate. Remaining users go in test
    train_num = int(total_num*train_size)
    validate_num = math.ceil(total_num*validate_size)

    # Randomly select the train users.
    train_ids = random.sample(user_ids, train_num)

    # Remove user_ids assigned to the training set.
    remaining_val_test_users = list(set(user_ids) - set(train_ids))

    # Draw validate from the remainder; whatever is left becomes test.
    validate_ids = random.sample(remaining_val_test_users, validate_num)
    test_ids = list(set(remaining_val_test_users) - set(validate_ids))

    # Return the users assigned to train, validate, and test
    return train_ids, validate_ids, test_ids


def train_validate_test(df, sampled=True):
    '''
    Split `df` into train, validate and test dataframes by user.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Data with a `user_id` column.
    sampled : bool, default True
        Passed to `split_users` as its `sample` argument.

    Returns
    -------
    train, validate, test : pandas.core.frame.DataFrame
        The rows of `df` whose `user_id` is in the matching id list
        returned by `split_users`.
    '''
    def rows_for(ids):
        # Rows of `df` that belong to any of the given user ids.
        return df.loc[df['user_id'].isin(ids)]

    train_ids, validate_ids, test_ids = split_users(df, sample=sampled)

    return rows_for(train_ids), rows_for(validate_ids), rows_for(test_ids)

In [4]:
# Build the splits from the acquired data, using the default `sampled=True`.
train, validate, test = train_validate_test(acquire.get_riiid_data())

In [5]:
# Cache each split as a csv file, without the row index.
train.to_csv('mvp_50_train.csv', index=False)
validate.to_csv('mvp_50_validate.csv', index=False)
test.to_csv('mvp_50_test.csv', index=False)