In [26]:
from __future__ import division
import pandas as pd
from numpy import random
# import dask.dataframe as dd

# Loading train df

Загрузка `train_df` без признака `row_id` - дублирует индекс

In [27]:
chunksize = 10 ** 5
dtypes = {
            'timestamp': 'int64',
            'user_id': 'int32',
            'content_id': 'int16',
            'content_type_id': 'int8',
            'task_container_id': 'int16',
            'user_answer': 'int8',
            'answered_correctly':'int8',
            'prior_question_elapsed_time': 'float32',
            'prior_question_had_explanation': 'object'
            }
train_path = '/home/ksu/Desktop/magistr/homework/HW_2/data/train.csv'
if chunksize:
    chunks = pd.read_csv(train_path, usecols=[1,2,3,4,5,6,7,8,9],
                         dtype=dtypes, chunksize=chunksize, low_memory=False)
    train_df = pd.DataFrame(chunks.get_chunk(chunksize))
    # df=pd.concat(chunk for chunk in chunks)
else:
    train_df = pd.read_csv(train_path, usecols=[1,2,3,4,5,6,7,8,9], dtype=dtypes)

# Loading questions and lectures data

In [28]:
questions_df = pd.read_csv('/home/ksu/Desktop/magistr/homework/HW_2/data/questions.csv')
lectures_df = pd.read_csv('/home/ksu/Desktop/magistr/homework/HW_2/data/lectures.csv')

## Merging data

In [29]:
train = pd.merge(train_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
train = pd.merge(train, lectures_df, left_on = 'content_id', right_on = 'lecture_id', how = 'left')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 18 columns):
timestamp                         100000 non-null int64
user_id                           100000 non-null int32
content_id                        100000 non-null int16
content_type_id                   100000 non-null int8
task_container_id                 100000 non-null int16
user_answer                       100000 non-null int8
answered_correctly                100000 non-null int8
prior_question_elapsed_time       97834 non-null float32
prior_question_had_explanation    99652 non-null object
question_id                       98872 non-null float64
bundle_id                         98872 non-null float64
correct_answer                    98872 non-null float64
part_x                            98872 non-null float64
tags                              98872 non-null object
lecture_id                        3065 non-null float64
tag                               3065 non-null 

## Data cleaning
- убираются пустые ячейки (строка 1)
- признак `prior_question_had_explanation` преобразуется в `int8` (строки 2 - 6)
- признак `prior_question_elapsed_time` преобразуется в секунды. Так как после этого данные укладываются в 2 байта
признак преобразуется в `int16`

In [37]:
train.dropna(inplace=True)
train['prior_question_had_explanation'] = (
    train['prior_question_had_explanation'].
    apply(lambda has_explanation: 1 if has_explanation == 'True' else 0)
)
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('int8')
# now prior_question_elapsed_time will be in seconds
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'] / 1e3
print('prior_question_elapsed_time min value: {} max value {}\n'.format(
                train['prior_question_elapsed_time'].min(),
                train['prior_question_elapsed_time'].max()))
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].astype('int16')
train.info(memory_usage=200, null_counts=True)

prior_question_elapsed_time min value: 0.000666000007186 max value 0.272000014782

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1244 entries, 18 to 99888
Data columns (total 18 columns):
timestamp                         1244 non-null int64
user_id                           1244 non-null int32
content_id                        1244 non-null int16
content_type_id                   1244 non-null int8
task_container_id                 1244 non-null int16
user_answer                       1244 non-null int8
answered_correctly                1244 non-null int8
prior_question_elapsed_time       1244 non-null int16
prior_question_had_explanation    1244 non-null int8
question_id                       1244 non-null float64
bundle_id                         1244 non-null float64
correct_answer                    1244 non-null float64
part_x                            1244 non-null float64
tags                              1244 non-null object
lecture_id                        1244 non-null

In [38]:
def calc_percents(dlen, flen):
    return 100 * flen / dlen

dlen = train_df.shape[0]

## timestamp

the time in milliseconds between this user interaction and the first event completion from that user

In [39]:
timestamp_secs = train['timestamp']/1e3
print('Timestamp max {:.1f}days min {:.1f}s mode {:.1f}s'.format(timestamp_secs.max()/(3600 * 24),
                                                                    timestamp_secs.min(),
                                                                    timestamp_secs.mode()[0]))
zero_ts_len = len(timestamp_secs[timestamp_secs == 0])
hour_ts_len = len(timestamp_secs[timestamp_secs > 3600])
day_ts_len = len(timestamp_secs[timestamp_secs > 3600 * 24])
print('Zero timestamp (or interaction = event, or garbage) amount {}, in percents: {:.2f}%'.format(zero_ts_len, calc_percents(dlen, zero_ts_len)))
print('Timestamps which are more than 1 hour amount {}, in percents: {:.1f}%'.format(hour_ts_len, calc_percents(dlen, hour_ts_len)))
print('Timestamps which are more than 1 day amount {}, in percents: {:.1f}%'.format(day_ts_len, calc_percents(dlen, day_ts_len)))

Timestamp max 687.1days min 15.3s mode 27714.0s
Zero timestamp (or interaction = event, or garbage) amount 0, in percents: 0.00%
Timestamps which are more than 1 hour amount 1137, in percents: 1.1%
Timestamps which are more than 1 day amount 1065, in percents: 1.1%


## user_id

ID code for the user

In [40]:
user_id_unique = len(train['user_id'].unique())
print('Amount of unquie users {}, percents of df length {:.1f}%'.format(user_id_unique, calc_percents(dlen, user_id_unique)))

Amount of unquie users 178, percents of df length 0.2%


## content_id

ID code for the user interaction

In [41]:
content_id_len = len(train['content_id'].unique())
print('Amount of unquie content ids {}, percents of df length {:.2f}%'.format(content_id_len, calc_percents(dlen, content_id_len)))

Amount of unquie content ids 135, percents of df length 0.14%


## task_container_id

Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

Questions topics (themes)?

In [42]:
task_container_id_len = len(train['task_container_id'].unique())
print('Amount of unquie task containers ids {}, percents of df length {:.2f}%'.format(
    task_container_id_len,
    calc_percents(dlen, task_container_id_len)))

Amount of unquie task containers ids 822, percents of df length 0.82%


## content_type_id

0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture

In [43]:
train['content_type_id'].value_counts(normalize=True)

0    1.0
Name: content_type_id, dtype: float64

all content is questions, so this data is not usefull

In [44]:
train.drop(['content_type_id'], axis=1, inplace=True)
train.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part_x,tags,lecture_id,tag,part_y,type_of
18,437272,115,7926,18,1,1,0,0,7926.0,7926.0,1.0,1.0,9 10 92,7926.0,57.0,5.0,concept
23,557677,115,185,23,3,0,0,0,185.0,185.0,0.0,1.0,131 111 81,185.0,45.0,6.0,concept
30,710402,115,100,30,0,1,0,0,100.0,100.0,0.0,1.0,131 5 81,100.0,70.0,1.0,concept
95,835457,2746,484,19,0,1,0,0,484.0,484.0,0.0,2.0,62 155 163 81 29,484.0,179.0,5.0,concept
112,5355233,5382,185,16,0,1,0,0,185.0,185.0,0.0,1.0,131 111 81,185.0,45.0,6.0,concept


## user_answer

the user's answer to the question, if any. Read -1 as null, for lectures.

In [45]:
train['user_answer'].value_counts()

0    429
1    347
3    284
2    184
Name: user_answer, dtype: int64

## answered_correctly

if the user responded correctly. Read -1 as null, for lectures

In [46]:
train['answered_correctly'].value_counts()

1    870
0    374
Name: answered_correctly, dtype: int64

## prior_question_had_explanation

Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback

In [47]:
train['prior_question_had_explanation'].value_counts()

0    1244
Name: prior_question_had_explanation, dtype: int64

## prior_question_elapsed_time

 The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle

In [49]:
print('Elapsed time min: {} max: {} mode: {}'.format(train['prior_question_elapsed_time'].min(),
                                                  train['prior_question_elapsed_time'].max(),
                                                  train['prior_question_elapsed_time'].mode()[0]))
train['prior_question_elapsed_time'].value_counts()

Elapsed time min: 0 max: 0 mode: 0


0    1244
Name: prior_question_elapsed_time, dtype: int64

## question_id

foreign key for the train/test content_id column, when the content type is question (0).

In [17]:
train['question_id'].head()

0    0
1    1
2    2
3    3
4    4
Name: question_id, dtype: int64

In [50]:
train.drop(['question_id'], inplace=True, axis=1)
train.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,correct_answer,part_x,tags,lecture_id,tag,part_y,type_of
18,437272,115,7926,18,1,1,0,0,7926.0,1.0,1.0,9 10 92,7926.0,57.0,5.0,concept
23,557677,115,185,23,3,0,0,0,185.0,0.0,1.0,131 111 81,185.0,45.0,6.0,concept
30,710402,115,100,30,0,1,0,0,100.0,0.0,1.0,131 5 81,100.0,70.0,1.0,concept
95,835457,2746,484,19,0,1,0,0,484.0,0.0,2.0,62 155 163 81 29,484.0,179.0,5.0,concept
112,5355233,5382,185,16,0,1,0,0,185.0,0.0,1.0,131 111 81,185.0,45.0,6.0,concept


### Correct this feature!!!

## bundle_id

code for which questions are served together

In [51]:
train['bundle_id'].value_counts().head()

10688.0    125
335.0       47
3852.0      46
185.0       38
5266.0      30
Name: bundle_id, dtype: int64

## correct_answer

the answer to the question. Can be compared with the train user_answer column to check if the user was right.

In [52]:
train['correct_answer'].value_counts()

0.0    461
1.0    339
3.0    297
2.0    147
Name: correct_answer, dtype: int64

## part

the relevant section of the TOEIC test.

https://www.iibc-global.org/english/toeic/test/lr/about/format.html

In [54]:
train['part'].value_counts()

KeyError: 'part'

## tag

one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [53]:
train['tags'].head()

18              9 10 92
23           131 111 81
30             131 5 81
95     62 155 163 81 29
112          131 111 81
Name: tags, dtype: object

## type_of

brief description of the core purpose of the lecture

In [55]:
train['type_of'].value_counts()

concept             746
solving question    450
intention            46
starter               2
Name: type_of, dtype: int64

## part

top level category code for the lecture.

https://www.iibc-global.org/english/toeic/test/lr/about/format.html

In [56]:
train['part'].value_counts()

KeyError: 'part'

## tag

one tag codes for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together

In [57]:
train['tag']

18        57.0
23        45.0
30        70.0
95       179.0
112       45.0
137      123.0
158       39.0
254      114.0
264       39.0
298      172.0
346       39.0
402        9.0
405       48.0
493       89.0
674      156.0
807      114.0
1105       8.0
1167       1.0
1236     152.0
1298     182.0
1327       1.0
1463       7.0
1538       4.0
1665      61.0
1730     161.0
1803      49.0
1823      70.0
1936      39.0
2022     101.0
2025      69.0
         ...  
97597     28.0
97660     33.0
97839     54.0
97903     93.0
98006    155.0
98202    172.0
98336    132.0
98394    179.0
98427    109.0
98654      1.0
98794     45.0
98816    114.0
98823     39.0
98972    172.0
99043     43.0
99114     54.0
99197    101.0
99202    116.0
99401     47.0
99501      6.0
99519    157.0
99530    141.0
99559    161.0
99586     24.0
99629    136.0
99673     93.0
99703     39.0
99756      8.0
99763    161.0
99888     67.0
Name: tag, Length: 1244, dtype: float64