# Data Acquisition

In [5]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Acquire the data

In [None]:
# Read the full training file into the working dataframe.
df = pd.read_csv(filepath_or_buffer='train.csv')

# Training dataset
To reduce the memory size of our training data, we select a random sample of 100,000 users (students) from `train.csv`, keeping only users with more than 10 rows of data.
1. Select 100,000 user ids from `train.csv` to use as our train dataset.
2. Cast numeric column data types using an appropriate precision to reduce memory size.

## Sample training data

In [None]:
# Draw up to 150_000 rows at random and take their user ids. Sampling rows
# can pick the same user more than once, so repeats are dropped (first-seen
# order is kept) to avoid concatenating a user's rows several times.
user_ids = (
    df.user_id
    .sample(min(150_000, len(df)), random_state=123)
    .drop_duplicates()
)

# Count every user's rows once instead of re-scanning `df` for each user.
rows_per_user = df['user_id'].value_counts()

# Keep the first 100_000 sampled users that have more than 10 rows.
selected_users = user_ids[user_ids.map(rows_per_user) > 10].iloc[:100_000]

# Number of users that made it into the sample.
counter = len(selected_users)

# Pull all rows of the selected users in a single pass. Rows keep the order
# they have in `df` (not the order in which the users were drawn).
filtered_data = df.loc[df['user_id'].isin(selected_users)]

In [None]:
# Write the sampled rows to a local csv (without the index) for the next steps.
filtered_data.to_csv(path_or_buf='filtered_train.csv', index=False)

In [None]:
# Read the cached sample back in to confirm the file loads.
df = pd.read_csv(filepath_or_buffer='filtered_train.csv')

Create a function to reproduce the modified training set.

In [6]:
def sampled_train():
    '''
    Build, cache and return a random sample of users from `train.csv`.

    Up to 150_000 rows are drawn at random (`random_state=123`) and their
    user ids are de-duplicated in draw order. The first 100_000 of those
    users that have more than 10 rows in `train.csv` are selected, and every
    row belonging to them is returned. The result is also written to
    `filtered_train.csv` in the current working directory.


    Parameters
    ----------
    None


    Returns
    -------
    filtered_data : pandas.core.frame.DataFrame
        All rows of the selected users, in the order they appear in
        `train.csv`, with a fresh 0..n-1 index. Holds at most 100,000
        distinct users, each with more than 10 rows; fewer users are
        returned when the draw does not yield enough eligible ones.
    '''

    # Narrow dtypes keep the in-memory size of the full file down.
    # NOTE(review): float16 tops out at 65504 and keeps ~3 significant
    # digits - confirm `prior_question_elapsed_time` fits, otherwise larger
    # values are read as inf.
    train_data_types_dict = {
        'timestamp': np.int64,
        'user_id': np.int32,
        'content_id': np.int16,
        'content_type_id': np.int16,
        'task_container_id': np.int16,
        'user_answer': np.int8,
        'answered_correctly': np.int8,
        'prior_question_elapsed_time': np.float16,
    }

    # Read in `train.csv` data
    df = pd.read_csv('train.csv', dtype=train_data_types_dict)

    # Draw up to 150_000 rows and take their user ids. Sampling rows can hit
    # the same user several times, so repeats are dropped (keeping draw
    # order); otherwise that user's rows would be included more than once.
    # `min` keeps the draw valid for files shorter than 150_000 rows.
    sampled_ids = (
        df['user_id']
        .sample(min(150_000, len(df)), random_state=123)
        .drop_duplicates()
    )

    # Count every user's rows once instead of re-scanning `df` per user.
    rows_per_user = df['user_id'].value_counts()

    # Keep the first 100_000 sampled users that have more than 10 rows.
    has_enough_rows = sampled_ids.map(rows_per_user) > 10
    selected_users = sampled_ids[has_enough_rows].iloc[:100_000]

    # Select all rows of the chosen users in one pass and drop the old index.
    filtered_data = df.loc[df['user_id'].isin(selected_users)]
    filtered_data = filtered_data.reset_index(drop=True)

    # Cache local file of sampled data.
    filtered_data.to_csv('filtered_train.csv', index=False)

    # Return the dataframe
    return filtered_data

In [7]:
# Test the function: run the sampler and keep its result as the working dataframe.
df = sampled_train()

KeyboardInterrupt: 

In [None]:
# Show how many rows each user has in the working dataframe.
df['user_id'].value_counts()

In [None]:
# Check the column data types and memory usage of the working dataframe.
df.info()

In [None]:
def datatype_converter(file_name='train'):
    '''
    Return the column-name to NumPy dtype mapping used for `train.csv`.


    Parameters
    ----------
    file_name : str, optional, default='train'
        Name of the csv file the mapping is requested for. The names
        'lectures', 'train' and 'questions' are the intended options, but
        the argument is not consulted: the `train.csv` mapping is
        returned whatever value is passed.


    Returns
    -------
    dict
        Column names of `train.csv` mapped to the NumPy dtype each
        column should be read with.
    '''

    # Column names and their dtypes, paired positionally.
    column_names = (
        'timestamp',
        'user_id',
        'content_id',
        'content_type_id',
        'task_container_id',
        'user_answer',
        'answered_correctly',
        'prior_question_elapsed_time',
    )
    column_dtypes = (
        np.int64,
        np.int32,
        np.int16,
        np.int16,
        np.int16,
        np.int8,
        np.int8,
        np.float16,
    )

    return dict(zip(column_names, column_dtypes))

In [None]:
# Split each user's rows into a leading train part and a trailing test part.
# `df`, `n`, `df_train` and `df_test` are left holding the values of the last
# user that was split once the loop finishes.
for user_id in user_ids1000:
    # Look the user's rows up once; the list indexer always yields a DataFrame.
    user_rows = train1000.loc[[user_id]]

    # Users with 10 rows or fewer are reported and left out of both sets.
    if user_rows.shape[0] <= 10:
        print(user_id)
        continue

    df = user_rows
    n = df.shape[0]

    # The first `train_size` share of the rows goes to train, the rest to test.
    test_start_index = round(train_size * n)
    df_train = df.iloc[:test_start_index]
    df_test = df.iloc[test_start_index:]

    train = pd.concat([train, df_train])
    test = pd.concat([test, df_test])

In [None]:
# NOTE(review): `user_ids` is collected but not used below - the split is
# applied to `df` as a whole rather than per user. Confirm that is intended.
user_ids = df.user_id.unique()

train = pd.DataFrame()
validate = pd.DataFrame()
test = pd.DataFrame()

# Share of the rows assigned to train and to validate; the rest goes to test.
train_size = 0.8
validate_size = 0.1

# Take the row count from the dataframe being split instead of relying on an
# `n` left behind by an earlier cell.
n = df.shape[0]

train_end_index = int(train_size * n)
validate_end_index = train_end_index + int(validate_size * n)

# Positional slices: leading rows -> train, middle -> validate, tail -> test.
df_train = df.iloc[:train_end_index]
df_validate = df.iloc[train_end_index:validate_end_index]
df_test = df.iloc[validate_end_index:]

train = pd.concat([train, df_train])
validate = pd.concat([validate, df_validate])
test = pd.concat([test, df_test])

# Show the shape of the original, train, validate and test sets
df.shape, train.shape, validate.shape, test.shape

In [None]:
# Check the column data types and memory usage of the train split.
train.info()

In [None]:
# df = acquire.get_riiid_data()