In [3]:
from __future__ import division
import pandas as pd
# import dask.dataframe as dd

# Loading train df

Загрузка `train_df` без признака `row_id`, так как он дублирует индекс.

In [4]:
# Compact per-column dtypes for the training table (about 100M rows).
# `row_id` is deliberately not listed, so it is never loaded.
dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    # read as raw objects because the column contains missing values
    'prior_question_had_explanation': 'object',
}
train_path = '/home/ksu/Desktop/magistr/homework/HW_2/data/train.csv'
# Select the columns by name (the keys of `dtypes`) instead of by position, so the
# column selection and the dtype mapping always describe the same set of columns.
train_df = pd.read_csv(train_path, usecols=list(dtypes), dtype=dtypes)

## Чистка данных
- удаляются строки, содержащие пропуски (NaN) хотя бы в одном признаке (строка 1)
- признак `prior_question_had_explanation` преобразуется в `int8` (строки 2 - 6)
- признак `prior_question_elapsed_time` переводится из миллисекунд в секунды. Так как после этого значения (от 0 до 300) укладываются в 2 байта,
признак преобразуется в `int16` (дробная часть при этом отбрасывается)

In [5]:
train_df.dropna(inplace=True)  # removes every row that has a NaN in any loaded column
train_df['prior_question_had_explanation'] = (
    # vectorized comparison: 'True' -> 1, anything else -> 0, without a per-row Python lambda
    (train_df['prior_question_had_explanation'] == 'True')
    .astype('int8')
)
# NOTE: this cell mutates train_df in place; re-running it without reloading the data
# would convert the already converted columns a second time.
# now prior_question_elapsed_time will be in seconds
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'] / 1e3
print('prior_question_elapsed_time min value: {} max value {}\n'.format(
                train_df['prior_question_elapsed_time'].min(),
                train_df['prior_question_elapsed_time'].max()))
# The printed range fits into 2 bytes; astype('int16') discards the fractional seconds.
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].astype('int16')
# memory_usage=True is the documented way to ask for the memory summary.
# NOTE: newer pandas releases rename `null_counts` to `show_counts`.
train_df.info(memory_usage=True, null_counts=True)

prior_question_elapsed_time min value: 0.0 max value 300.0

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98878794 entries, 1 to 101230331
Data columns (total 9 columns):
timestamp                         98878794 non-null int64
user_id                           98878794 non-null int32
content_id                        98878794 non-null int16
content_type_id                   98878794 non-null int8
task_container_id                 98878794 non-null int16
user_answer                       98878794 non-null int8
answered_correctly                98878794 non-null int8
prior_question_elapsed_time       98878794 non-null int16
prior_question_had_explanation    98878794 non-null int8
dtypes: int16(3), int32(1), int64(1), int8(4)
memory usage: 2.8 GB


None

In [6]:
def calc_percents(dlen, flen):
    """Return ``flen`` expressed as a percentage of ``dlen``.

    dlen -- the total (denominator), e.g. the number of rows in the frame.
    flen -- the part (numerator), e.g. the number of rows matching a filter.

    The notebook enables true division via ``from __future__ import division``,
    so the result is not floored for integer arguments.
    """
    scaled_part = 100 * flen
    return scaled_part / dlen

# Number of rows in the cleaned training frame; used as the denominator for percentages.
dlen = len(train_df)

## timestamp

the time in milliseconds between this user interaction and the first event completion from that user

In [7]:
# `timestamp` is stored in milliseconds; convert to seconds for the summary.
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 3600 * 24
timestamp_secs = train_df['timestamp'] / 1e3
print('Timestamp max {:.1f}days min {:.1f}s mode {:.1f}s'.format(timestamp_secs.max() / SECONDS_PER_DAY,
                                                                    timestamp_secs.min(),
                                                                    timestamp_secs.mode()[0]))
# Count matching rows by summing boolean masks rather than building a filtered
# copy of the series only to take its length.
zero_ts_len = int((timestamp_secs == 0).sum())
hour_ts_len = int((timestamp_secs > SECONDS_PER_HOUR).sum())
day_ts_len = int((timestamp_secs > SECONDS_PER_DAY).sum())
print('Zero timestamp (or interaction = event, or garbage) amount {}, in percents: {:.2f}%'.format(zero_ts_len, calc_percents(dlen, zero_ts_len)))
print('Timestamps which are more than 1 hour amount {}, in percents: {:.1f}%'.format(hour_ts_len, calc_percents(dlen, hour_ts_len)))
print('Timestamps which are more than 1 day amount {}, in percents: {:.1f}%'.format(day_ts_len, calc_percents(dlen, day_ts_len)))

Timestamp max 1011.9days min 0.0s mode 0.0s
Zero timestamp (or interaction = event, or garbage) amount 3896, in percents: 0.00%
Timestamps which are more than 1 hour amount 90069811, in percents: 91.1%
Timestamps which are more than 1 day amount 86039183, in percents: 87.0%


## user_id

ID code for the user

In [8]:
# Number of distinct users and their share relative to the number of rows.
user_id_unique = train_df['user_id'].nunique()
user_id_share = calc_percents(dlen, user_id_unique)
print('Amount of unquie users {}, percents of df length {:.1f}%'.format(user_id_unique, user_id_share))

Amount of unquie users 393569, percents of df length 0.4%


## content_id

ID code for the user interaction

In [9]:
# Number of distinct content ids and their share relative to the number of rows.
content_id_len = train_df['content_id'].nunique()
content_id_share = calc_percents(dlen, content_id_len)
print('Amount of unquie content ids {}, percents of df length {:.2f}%'.format(content_id_len, content_id_share))

Amount of unquie content ids 13523, percents of df length 0.01%


## task_container_id

Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

Open question: do task containers correspond to question topics (themes)?

In [10]:
# Number of distinct task containers and their share relative to the number of rows.
task_container_id_len = train_df['task_container_id'].nunique()
task_container_share = calc_percents(dlen, task_container_id_len)
print('Amount of unquie task containers ids {}, percents of df length {:.2f}%'.format(
    task_container_id_len, task_container_share))

Amount of unquie task containers ids 10000, percents of df length 0.01%


## content_type_id

0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture

In [11]:
# Relative frequency of each content type in the cleaned frame.
(train_df['content_type_id']
 .value_counts(normalize=True))

0    1.0
Name: content_type_id, dtype: float64

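After `dropna`, every remaining row is a question (`content_type_id == 0`): lecture rows have a null `prior_question_elapsed_time`, so they were removed during cleaning. The column is therefore constant and no longer useful, so it is dropped.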

In [13]:
# Remove the constant `content_type_id` column.
# errors='ignore' makes the cell safe to re-run: a second run is a no-op
# instead of raising KeyError because the column is already gone.
train_df.drop(['content_type_id'], axis=1, inplace=True, errors='ignore')
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
1,56943,115,5716,2,2,1,37,0
2,118363,115,128,0,0,1,55,0
3,131167,115,7860,3,0,1,19,0
4,137965,115,7922,4,1,1,11,0
5,157063,115,156,5,2,1,5,0


## user_answer

the user's answer to the question, if any. Read -1 as null, for lectures.

In [14]:
# How often each answer option was chosen.
user_answer_col = train_df['user_answer']
user_answer_col.value_counts()

0    27989383
1    26912164
3    26025115
2    17952132
Name: user_answer, dtype: int64

## answered_correctly

if the user responded correctly. Read -1 as null, for lectures

In [15]:
# Number of rows per value of the `answered_correctly` flag.
answered_correctly_col = train_df['answered_correctly']
answered_correctly_col.value_counts()

1    64977687
0    33901107
Name: answered_correctly, dtype: int64

## prior_question_had_explanation

Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback

In [16]:
# Number of rows per value of the int8-encoded explanation flag.
had_explanation_col = train_df['prior_question_had_explanation']
had_explanation_col.value_counts()

1    89685560
0     9193234
Name: prior_question_had_explanation, dtype: int64

## prior_question_elapsed_time

 The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle

In [23]:
# Summary of the elapsed time (already converted to whole seconds during cleaning).
elapsed_secs = train_df['prior_question_elapsed_time']
elapsed_min = elapsed_secs.min()
elapsed_max = elapsed_secs.max()
elapsed_mode = elapsed_secs.mode()[0]
print('Elapsed time min: {} max: {} mode: {}'.format(elapsed_min, elapsed_max, elapsed_mode))
elapsed_secs.value_counts()

Elapsed time min: 0 max: 300 mode: 17


17     5485415
16     5061745
18     5022869
19     4412104
20     4063010
15     3924757
21     3898864
22     3763852
23     3520538
24     3231931
25     2931446
14     2885152
26     2635191
27     2349886
13     2275213
28     2088975
12     1991117
29     1878755
11     1780508
30     1705451
10     1673832
9      1634798
31     1544286
8      1478082
32     1407656
33     1293700
7      1264454
34     1187995
35     1089025
6      1087464
        ...   
268        585
287        575
271        574
274        571
277        570
273        569
282        561
270        549
280        528
272        525
281        522
276        520
278        513
283        501
289        495
290        492
284        491
296        489
288        487
295        483
279        481
294        469
291        467
292        464
297        448
299        447
293        445
286        443
298        439
285        420
Name: prior_question_elapsed_time, Length: 301, dtype: int64

# Loading questions data

In [25]:
# Load the questions metadata table and print its schema summary.
questions_path = '/home/ksu/Desktop/magistr/homework/HW_2/data/questions.csv'
questions_df = pd.read_csv(questions_path)
questions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 5 columns):
question_id       13523 non-null int64
bundle_id         13523 non-null int64
correct_answer    13523 non-null int64
part              13523 non-null int64
tags              13522 non-null object
dtypes: int64(4), object(1)
memory usage: 528.3+ KB


## question_id

foreign key for the train/test content_id column, when the content type is question (0).

In [28]:
# Peek at the first ids to compare them with the frame's index.
question_id_col = questions_df['question_id']
question_id_col.head(n=5)

0    0
1    1
2    2
3    3
4    4
Name: question_id, dtype: int64

In [30]:
# `question_id` is the key that links questions to train `content_id`.
# Only act while the column is still present, so the cell can be re-run safely.
if 'question_id' in questions_df.columns:
    if (questions_df['question_id'] == questions_df.index).all():
        # Every id equals its index label, so the column is redundant: drop it.
        questions_df.drop(['question_id'], axis=1, inplace=True)
    else:
        # The ids differ from the index somewhere: keep the key by making it the index.
        questions_df.set_index('question_id', inplace=True)
questions_df.head()

Unnamed: 0,bundle_id,correct_answer,part,tags
0,0,0,1,51 131 162 38
1,1,1,1,131 36 81
2,2,0,1,131 101 162 92
3,3,0,1,131 149 162 29
4,4,3,1,131 5 162 38


## bundle_id

code for which questions are served together

In [32]:
# Largest bundles: number of questions that share each bundle_id (top 5).
bundle_sizes = questions_df['bundle_id'].value_counts()
bundle_sizes.head()

7795    5
6971    5
7421    5
7770    5
8144    5
Name: bundle_id, dtype: int64

## correct_answer

the answer to the question. Can be compared with the train user_answer column to check if the user was right.

In [33]:
# Number of questions per correct answer option.
correct_answer_col = questions_df['correct_answer']
correct_answer_col.value_counts()

0    3716
3    3544
1    3478
2    2785
Name: correct_answer, dtype: int64

## part

the relevant section of the TOEIC test.

https://www.iibc-global.org/english/toeic/test/lr/about/format.html

In [None]:
# Number of questions per `part` value.
question_part_col = questions_df['part']
question_part_col.value_counts()

5    5511
2    1647
3    1562
4    1439
6    1212
7    1160
1     992
Name: part, dtype: int64

## tags

one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [34]:
# First few `tags` values (stored as strings).
question_tags_col = questions_df['tags']
question_tags_col.head(n=5)

0     51 131 162 38
1         131 36 81
2    131 101 162 92
3    131 149 162 29
4      131 5 162 38
Name: tags, dtype: object

# Loading lectures data

In [35]:
# Load the lectures metadata table and print its schema summary.
lectures_path = '/home/ksu/Desktop/magistr/homework/HW_2/data/lectures.csv'
lectures_df = pd.read_csv(lectures_path)
lectures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
lecture_id    418 non-null int64
tag           418 non-null int64
part          418 non-null int64
type_of       418 non-null object
dtypes: int64(3), object(1)
memory usage: 13.1+ KB


## type_of

brief description of the core purpose of the lecture

In [None]:
# Number of lectures per `type_of` category.
lecture_type_col = lectures_df['type_of']
lecture_type_col.value_counts()

concept             222
solving question    186
intention             7
starter               3
Name: type_of, dtype: int64

## part

top level category code for the lecture.

https://www.iibc-global.org/english/toeic/test/lr/about/format.html

In [None]:
# Number of lectures per `part` value.
lecture_part_col = lectures_df['part']
lecture_part_col.value_counts()

5    143
6     83
2     56
1     54
7     32
4     31
3     19
Name: part, dtype: int64

## tag

one tag code for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.

In [36]:
lectures_df['tag'].head()

0    159
1     70
2     45
3     79
4    156
Name: tag, dtype: int64