In [300]:
from __future__ import division
import pandas as pd
from numpy import random
import numpy as np
import matplotlib.pyplot as plt
# import dask.dataframe as dd

# Loading train df

Загрузка `train_df` без признака `row_id` - дублирует индекс

In [301]:
chunksize = 10 ** 5
dtypes = {
            'timestamp': 'int64',
            'user_id': 'int32',
            'content_id': 'int16',
            'content_type_id': 'int8',
            'task_container_id': 'int16',
            'user_answer': 'int8',
            'answered_correctly':'int8',
            'prior_question_elapsed_time': 'float32',
            'prior_question_had_explanation': 'object'
            }
train_path = '/home/ksu/Desktop/magistr/homework/HW_2/data/train.csv'
if chunksize:
    chunks = pd.read_csv(train_path, usecols=[1,2,3,4,5,6,7,8,9],
                         dtype=dtypes, chunksize=chunksize, low_memory=False)
    train_df = pd.DataFrame(chunks.get_chunk(chunksize))
    # df=pd.concat(chunk for chunk in chunks)
else:
    train_df = pd.read_csv(train_path, usecols=[1,2,3,4,5,6,7,8,9], dtype=dtypes)

# Loading questions and lectures data

In [302]:
questions_df = pd.read_csv('/home/ksu/Desktop/magistr/homework/HW_2/data/questions.csv')
lectures_df = pd.read_csv('/home/ksu/Desktop/magistr/homework/HW_2/data/lectures.csv')

## Merging data

In [303]:
train = pd.merge(train_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
train = pd.merge(train, lectures_df, left_on = 'content_id', right_on = 'lecture_id', how = 'left', suffixes=('_question', '_lecture'))
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 18 columns):
timestamp                         100000 non-null int64
user_id                           100000 non-null int32
content_id                        100000 non-null int16
content_type_id                   100000 non-null int8
task_container_id                 100000 non-null int16
user_answer                       100000 non-null int8
answered_correctly                100000 non-null int8
prior_question_elapsed_time       97834 non-null float32
prior_question_had_explanation    99652 non-null object
question_id                       98872 non-null float64
bundle_id                         98872 non-null float64
correct_answer                    98872 non-null float64
part_question                     98872 non-null float64
tags                              98872 non-null object
lecture_id                        3065 non-null float64
tag                               3065 non-null 

## Data cleaning

In [304]:
train.dropna(inplace=True)
train['prior_question_had_explanation'] = (
    train['prior_question_had_explanation'].
    apply(lambda has_explanation: 1 if has_explanation == 'True' else 0)
)
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('int8')
# now prior_question_elapsed_time will be in seconds
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'] / 1e3
print('prior_question_elapsed_time min value: {} max value {}\n'.format(
                train['prior_question_elapsed_time'].min(),
                train['prior_question_elapsed_time'].max()))
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].astype('int16')
train.info(memory_usage=200, null_counts=True)

prior_question_elapsed_time min value: 0.666000008583 max value 272.0

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1244 entries, 18 to 99888
Data columns (total 18 columns):
timestamp                         1244 non-null int64
user_id                           1244 non-null int32
content_id                        1244 non-null int16
content_type_id                   1244 non-null int8
task_container_id                 1244 non-null int16
user_answer                       1244 non-null int8
answered_correctly                1244 non-null int8
prior_question_elapsed_time       1244 non-null int16
prior_question_had_explanation    1244 non-null int8
question_id                       1244 non-null float64
bundle_id                         1244 non-null float64
correct_answer                    1244 non-null float64
part_question                     1244 non-null float64
tags                              1244 non-null object
lecture_id                        1244 non-null float64
tag

In [305]:
def calc_percents(dlen, flen):
    return 100 * flen / dlen

dlen = train_df.shape[0]

## timestamp

the time in milliseconds between this user interaction and the first event completion from that user

In [369]:
timestamp_secs = train['timestamp']/1e3
print('Timestamp max {:.1f}days min {:.1f}s mode {:.1f}s'.format(timestamp_secs.max()/(3600 * 24),
                                                                    timestamp_secs.min(),
                                                                    timestamp_secs.mode()[0]))
zero_ts_len = len(timestamp_secs[timestamp_secs == 0])
hour_ts_len = len(timestamp_secs[timestamp_secs > 3600])
day_ts_len = len(timestamp_secs[timestamp_secs > 3600 * 24])
print('Zero timestamp (or interaction = event, or garbage) amount {}, in percents: {:.2f}%'.format(zero_ts_len, calc_percents(dlen, zero_ts_len)))
print('Timestamps which are more than 1 hour amount {}, in percents: {:.1f}%'.format(hour_ts_len, calc_percents(dlen, hour_ts_len)))
print('Timestamps which are more than 1 day amount {}, in percents: {:.1f}%'.format(day_ts_len, calc_percents(dlen, day_ts_len)))
correct_ansewers = timestamp_secs[train['answered_correctly'] == 1]
wrong_ansewers = timestamp_secs[train['answered_correctly'] == 0]
print('Timestamps with correct answers: max {:.0f} days, min {:.0f} days, median {:.0f} days'.format(correct_ansewers.max()/(24*3600),
                                                                                 correct_ansewers.min()/(24*3600),
                                                                                 int(correct_ansewers.median()/(24*3600))))
print('Timestamps with wrong answers: max {:.0f} days, min {:.0f} days, median {:.0f} days'.format(wrong_ansewers.max()/(24*3600),
                                                                                 wrong_ansewers.min()/(24*3600),
                                                                                 int(wrong_ansewers.median()/(24*3600))))

Timestamp max 687.1days min 15.3s mode 27714.0s
Zero timestamp (or interaction = event, or garbage) amount 0, in percents: 0.00%
Timestamps which are more than 1 hour amount 1137, in percents: 1.1%
Timestamps which are more than 1 day amount 1065, in percents: 1.1%
Timestamps with correct answers: max 687 days, min 0 days, median 46 days
Timestamps with wrong answers: max 685 days, min 0 days, median 35 days


## user_id

ID code for the user

In [307]:
user_id_unique = len(train['user_id'].unique())
print('Amount of unquie users {}, percents of df length {:.1f}%'.format(user_id_unique, calc_percents(dlen, user_id_unique)))

Amount of unquie users 178, percents of df length 0.2%


## content_id

ID code for the user interaction

In [308]:
content_id_len = len(train['content_id'].unique())
print('Amount of unquie content ids {}, percents of df length {:.2f}%'.format(content_id_len, calc_percents(dlen, content_id_len)))

Amount of unquie content ids 135, percents of df length 0.14%


## task_container_id

Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

Questions topics (themes)?

In [309]:
task_container_id_len = len(train['task_container_id'].unique())
print('Amount of unquie task containers ids {}, percents of df length {:.2f}%'.format(
    task_container_id_len,
    calc_percents(dlen, task_container_id_len)))

Amount of unquie task containers ids 822, percents of df length 0.82%


## content_type_id

0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture

In [310]:
train['content_type_id'].value_counts(normalize=True)

0    1.0
Name: content_type_id, dtype: float64

all content is questions, so this data is not usefull

In [311]:
train.drop(['content_type_id'], axis=1, inplace=True)
train.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part_question,tags,lecture_id,tag,part_lecture,type_of
18,437272,115,7926,18,1,1,18,0,7926.0,7926.0,1.0,1.0,9 10 92,7926.0,57.0,5.0,concept
23,557677,115,185,23,3,0,21,0,185.0,185.0,0.0,1.0,131 111 81,185.0,45.0,6.0,concept
30,710402,115,100,30,0,1,20,0,100.0,100.0,0.0,1.0,131 5 81,100.0,70.0,1.0,concept
95,835457,2746,484,19,0,1,20,1,484.0,484.0,0.0,2.0,62 155 163 81 29,484.0,179.0,5.0,concept
112,5355233,5382,185,16,0,1,34,1,185.0,185.0,0.0,1.0,131 111 81,185.0,45.0,6.0,concept


## user_answer

the user's answer to the question, if any. Read -1 as null, for lectures.

In [312]:
print(train['user_answer'].value_counts())
train['user_answer'] = train['user_answer'].astype(np.uint8)

0    429
1    347
3    284
2    184
Name: user_answer, dtype: int64


## answered_correctly

if the user responded correctly. Read -1 as null, for lectures

In [313]:
print(train['answered_correctly'].value_counts())
train['answered_correctly'] = train['answered_correctly'].astype(np.uint8)

1    870
0    374
Name: answered_correctly, dtype: int64


## prior_question_had_explanation

Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback

In [355]:
print('Value counts for prior_question_had_explanation\n{}\n'.format(train['prior_question_had_explanation'].value_counts()))
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype(np.uint8)
correct_ansewers = train['prior_question_had_explanation'][train['answered_correctly'] == 1]
wrong_ansewers = train['prior_question_had_explanation'][train['answered_correctly'] == 0]
print('Correct answers distrubition for question had explanation:\n{}'.format(correct_ansewers.value_counts()))
print('Wrong answers distrubition for question had explanation:\n{}'.format(wrong_ansewers.value_counts()))

Value counts for prior_question_had_explanation
1    1182
0      62
Name: prior_question_had_explanation, dtype: int64

Correct answers distrubition for question had explanation:
1    834
0     36
Name: prior_question_had_explanation, dtype: int64
Wrong answers distrubition for question had explanation:
1    348
0     26
Name: prior_question_had_explanation, dtype: int64


## prior_question_elapsed_time

 The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle

In [348]:
print('Elapsed time min: {} max: {} mode: {}'.format(train['prior_question_elapsed_time'].min(),
                                                  train['prior_question_elapsed_time'].max(),
                                                  train['prior_question_elapsed_time'].mode()[0]))
# train['prior_question_elapsed_time'].value_counts()
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].astype(np.uint16)
ac_and_et = train['prior_question_elapsed_time'][train['answered_correctly'] == 1]
print('Median elapsed time {}s max elapsed time {}s min elapsed time {}s for correct answers'.format(
    ac_and_et.median(), ac_and_et.max(), ac_and_et.min()))

aw_and_et = train['prior_question_elapsed_time'][train['answered_correctly'] == 0]
print('Median elapsed time {}s max elapsed time {}s min elapsed time {}s for wrong answers'.format(
    aw_and_et.median(), aw_and_et.max(), aw_and_et.min()))

Elapsed time min: 0 max: 272 mode: 17
Median elapsed time 20.0s max elapsed time 272s min elapsed time 2s for correct answers
Median elapsed time 22.0s max elapsed time 252s min elapsed time 0s for wrong answers


## question_id

foreign key for the train/test content_id column, when the content type is question (0).

In [316]:
print(train['question_id'].head())
print('Question id min: {} max: {}'.format(train['question_id'].min(),
                                                     train['question_id'].max()))
train['question_id'] = train['question_id'].astype(np.uint32)

18     7926.0
23      185.0
30      100.0
95      484.0
112     185.0
Name: question_id, dtype: float64
Question id min: 89.0 max: 13487.0


## bundle_id

code for which questions are served together

In [343]:
print(train['bundle_id'].value_counts().head())
train['bundle_id'] = train['bundle_id'].astype(np.uint32)
print('\nCorrect answers value counts for differnt bundle id:\n{}'.format(
    train['bundle_id'][train['answered_correctly'] == 1].value_counts().nlargest(10)))

10688    125
335       47
3852      46
185       38
5266      30
Name: bundle_id, dtype: int64

Correct answers value counts for differnt bundle id:
10688    94
335      35
3852     29
4118     20
185      20
6797     18
5266     17
761      16
7926     16
7920     16
Name: bundle_id, dtype: int64


## correct_answer

the answer to the question. Can be compared with the train user_answer column to check if the user was right.

In [318]:
print(train['correct_answer'].value_counts())
train['correct_answer'] =  train['correct_answer'].astype(np.uint8)

0.0    461
1.0    339
3.0    297
2.0    147
Name: correct_answer, dtype: int64


## part_question

the relevant section of the TOEIC test.

https://www.iibc-global.org/english/toeic/test/lr/about/format.html

In [340]:
print('Value counts of question part:\n{}\n'.format(train['part_question'].value_counts()))
train['part_question'] = train['part_question'].astype(np.uint8)
print('Value counts of correct answer for different question part:\n{}'.format(
    train['part_question'][train['answered_correctly'] == 1].value_counts()))

Value counts of question part:
5    436
2    296
6    189
1    142
3     93
4     54
7     34
Name: part_question, dtype: int64

Value counts of correct answer for different question part:
5    286
2    228
6    122
1    100
3     64
4     43
7     27
Name: part_question, dtype: int64


## tags

one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [320]:
train['tags'].head()

18              9 10 92
23           131 111 81
30             131 5 81
95     62 155 163 81 29
112          131 111 81
Name: tags, dtype: object

In [321]:
from collections import defaultdict
def transform_tags(tags_series):
    # tags_dict = defaultdict(list)
    possible_keys = []
    for i, data in train['tags'].items():
        data = data.split()
        possible_keys.extend(data)
    keys_set = sorted(set(possible_keys), key = int)
    tags = []
    # print(keys_set)
    for i, data in train['tags'].items():
        data = data.split()
        data_set = set(data)
        tags_dict = {key:0  for key in keys_set}
        for key in data_set:
            tags_dict[key] = 1
        tags.append(tags_dict)
        # break
    tags = pd.DataFrame(data=tags, index=tags_series.index, dtype=np.int8)
    renaming_cols = {key: 'tag_' + key for key in keys_set}
    tags = tags.rename(columns=renaming_cols)
    return tags
tags_ = transform_tags(train['tags'])
tags_.head()

Unnamed: 0,tag_1,tag_10,tag_100,tag_101,tag_102,tag_104,tag_106,tag_107,tag_109,tag_111,...,tag_84,tag_88,tag_89,tag_9,tag_90,tag_91,tag_92,tag_96,tag_97,tag_98
18,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
23,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [322]:
train = pd.merge(train, tags_, left_index=True, right_index=True, how = 'left')

## lecture_id
foreign key for the train/test content_id column, when the content type is lecture (1).

In [336]:
print('Lecture id boundaries max: {} min {}'.format(train['lecture_id'].max(),
                train['lecture_id'].min()))
train['lecture_id'] = train['lecture_id'].astype(np.uint32)
train['lecture_id'][train['answered_correctly'] == 1].value_counts().nlargest(10)

Lecture id boundaries max: 13487 min 89


10688    94
335      35
3852     29
4118     20
185      20
6797     18
5266     17
7920     16
7926     16
761      16
Name: lecture_id, dtype: int64

## type_of

brief description of the core purpose of the lecture

In [324]:
print(train['type_of'].head())

18     concept
23     concept
30     concept
95     concept
112    concept
Name: type_of, dtype: object


In [334]:
train['type_of'] = pd.Categorical(train['type_of'])
train['type_of'].head()
train['type_of'][train['answered_correctly'] == 1].value_counts()

concept             528
solving question    307
intention            34
starter               1
Name: type_of, dtype: int64

## part_lecture

top level category code for the lecture.

https://www.iibc-global.org/english/toeic/test/lr/about/format.html

In [326]:
train['part_lecture'] = train['part_lecture'].astype(np.uint8)

## tag

one tag codes for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together

In [332]:
print('Tag max value {} '.format(train['tag'].max()))
train['tag'] = train['tag'].astype(np.uint8)
train['tag'][train['answered_correctly'] == 1].value_counts().nlargest(10)

Tag max value 182 


39     94
114    35
161    29
93     27
101    26
8      20
45     20
1      19
70     18
123    18
Name: tag, dtype: int64

## Features analysis result

In [328]:
train.info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1244 entries, 18 to 99888
Data columns (total 106 columns):
timestamp                         1244 non-null int64
user_id                           1244 non-null int32
content_id                        1244 non-null int16
task_container_id                 1244 non-null int16
user_answer                       1244 non-null uint8
answered_correctly                1244 non-null uint8
prior_question_elapsed_time       1244 non-null uint16
prior_question_had_explanation    1244 non-null uint8
question_id                       1244 non-null uint32
bundle_id                         1244 non-null uint32
correct_answer                    1244 non-null uint8
part_question                     1244 non-null uint8
tags                              1244 non-null object
lecture_id                        1244 non-null uint32
tag                               1244 non-null uint8
part_lecture                      1244 non-null uint8
type_of              