In [31]:
from __future__ import division
import pandas as pd
import dask.dataframe as dd

# Loading

Загрузка `train_df` без признака `row_id` - дублирует индекс

In [3]:
dtypes = {
            'timestamp': 'int64',
            'user_id': 'int32',
            'content_id': 'int16',
            'content_type_id': 'int8',
            'task_container_id': 'int16',
            'user_answer': 'int8',
            'answered_correctly':'int8',
            'prior_question_elapsed_time': 'float32',
            'prior_question_had_explanation': 'object'
            }
train_path = '/home/ksu/Desktop/magistr/homework/HW_2/data/train.csv'
train_df = pd.read_csv(train_path, usecols=[1,2,3,4,5,6,7,8,9], dtype=dtypes)

## Чистка данных
- убираются пустые ячейки (строка 1)
- признак `prior_question_had_explanation` преобразуется в `int8` (строки 2 - 6)
- признак `prior_question_elapsed_time` преобразуется в секунды. Так как после этого данные укладываются в 2 байта
признак преобразуется в `int16`

In [4]:
train_df.dropna(inplace=True)
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].
    apply(lambda has_explanation: 1 if has_explanation == 'True' else 0)
)
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('int8')
# now prior_question_elapsed_time will be in seconds
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'] / 1e3
print('prior_question_elapsed_time min value: {} max value {}\n'.format(
                train_df['prior_question_elapsed_time'].min(),
                train_df['prior_question_elapsed_time'].max()))
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].astype('int16')
train_df.info(memory_usage=200, null_counts=True)

prior_question_elapsed_time min value: 0.0 max value 300.0

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98878794 entries, 1 to 101230331
Data columns (total 9 columns):
timestamp                         98878794 non-null int64
user_id                           98878794 non-null int32
content_id                        98878794 non-null int16
content_type_id                   98878794 non-null int8
task_container_id                 98878794 non-null int16
user_answer                       98878794 non-null int8
answered_correctly                98878794 non-null int8
prior_question_elapsed_time       98878794 non-null int16
prior_question_had_explanation    98878794 non-null int8
dtypes: int16(3), int32(1), int64(1), int8(4)
memory usage: 2.8 GB


## Timestamp

the time in milliseconds between this user interaction and the first event completion from that user

In [24]:
timestamp_secs = train_df['timestamp']/1e3
print('Timestamp max {:.1f}days min {:.1f}s mean {:.1f}days'.format(timestamp_secs.max()/(3600 * 24),
                                                           timestamp_secs.min(),
                                                           timestamp_secs.mean()/((3600 * 24))))
print('Zero timestamp (interaction = event?) amount {}'.format(len(timestamp_secs[timestamp_secs == 0])))
print('Timestamps which are more than 1 hour amount {}'.format(len(timestamp_secs[timestamp_secs > 3600])))
print('Timestamps which are  more than 1 day amount {}'.format(len(timestamp_secs[timestamp_secs > 3600 * 24])))

Timestamp max 1011.9days min 0.0s mean 89.5days
Zero timestamp (interaction = event?) amount 3896
Timestamps which are more than 1 hour amount 90069811
Timestamps which are  more than 1 day amount 86039183


## user_id

ID code for the user

In [32]:
user_id_unique = len(train_df['user_id'].unique())
user_id_len = len(train_df['user_id'])
print('Amount of unquie users {}, percents of df length {:.1f}%'.format(user_id_unique, 100 * user_id_unique/user_id_len))

Amount of unquie users 393569, percents of df length 0.4%


In [55]:
lectures_df = pd.read_csv('/home/ksu/Desktop/magistr/homework/HW_2/data/lectures.csv')
lectures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
lecture_id    418 non-null int64
tag           418 non-null int64
part          418 non-null int64
type_of       418 non-null object
dtypes: int64(3), object(1)
memory usage: 13.1+ KB


In [56]:
questions_df = pd.read_csv('/home/ksu/Desktop/magistr/homework/HW_2/data/questions.csv')
questions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 5 columns):
question_id       13523 non-null int64
bundle_id         13523 non-null int64
correct_answer    13523 non-null int64
part              13523 non-null int64
tags              13522 non-null object
dtypes: int64(4), object(1)
memory usage: 528.3+ KB


## Questions

In [59]:
questions_df['tags'].value_counts()

8                        738
73                       617
53                       523
1                        413
96                       373
55                       308
79                       293
27                       261
14                       232
173                      222
109                      204
89                       202
136 81 92                140
91                       125
136 38 29                110
60                       108
116                       99
7                         76
147                       75
72                        72
82 81 92                  69
136 92 29                 66
74 81 92                  65
134                       65
133                       62
166                       60
136 38 81                 59
37 153 21                 59
125                       57
179                       56
                        ... 
98 118 160 22 122 162      1
106 81 29                  1
2 107 62 92 102            1
106 169 162 81

Эталонные ответы

In [61]:
questions_df['correct_answer'].value_counts()

0    3716
3    3544
1    3478
2    2785
Name: correct_answer, dtype: int64

In [63]:
questions_df['part'].value_counts()

5    5511
2    1647
3    1562
4    1439
6    1212
7    1160
1     992
Name: part, dtype: int64

usefull columns is ['correct_answer', 'tag', 'part']

## Lectures

maybe 'starter' and 'intention' are garbage?

In [65]:
lectures_df['type_of'].value_counts()

concept             222
solving question    186
intention             7
starter               3
Name: type_of, dtype: int64

find out what is 1..7

In [67]:
lectures_df['part'].value_counts()

5    143
6     83
2     56
1     54
7     32
4     31
3     19
Name: part, dtype: int64

In [68]:
lectures_df['tag'].value_counts()

136    7
116    6
134    6
27     6
161    6
62     6
74     6
113    6
114    5
70     5
94     5
82     5
119    5
173    5
1      4
26     4
36     4
156    4
53     4
109    4
101    4
149    4
48     4
151    4
152    4
4      4
157    4
73     4
14     4
159    4
      ..
175    2
172    2
12     1
169    1
2      1
158    1
15     1
183    1
155    1
17     1
150    1
171    1
67     1
138    1
148    1
144    1
143    1
139    1
31     1
137    1
39     1
126    1
120    1
112    1
103    1
99     1
57     1
90     1
83     1
187    1
Name: tag, Length: 151, dtype: int64

## Answers

In [69]:
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


### find out what year is it and how to make transform correctly

In [71]:
pd.to_datetime(train_df['timestamp']).head()

0   1970-01-01 00:00:00.000000000
1   1970-01-01 00:00:00.000056943
2   1970-01-01 00:00:00.000118363
3   1970-01-01 00:00:00.000131167
4   1970-01-01 00:00:00.000137965
Name: timestamp, dtype: datetime64[ns]

In [72]:
train_df['content_id'].value_counts()

6116     1064
6173     1031
4120     1011
175      1001
7876      984
7900      931
2063      892
2064      892
2065      892
3363      881
3365      881
3364      881
4492      877
1278      874
4696      861
6370      860
2948      847
2947      847
2946      847
2595      820
2594      820
2593      820
7218      817
7216      817
7219      817
7217      817
10688     767
10686     747
294       745
10685     729
         ... 
9713        1
12980       1
1708        1
1707        1
310         1
1706        1
11195       1
12279       1
13240       1
12592       1
12837       1
12728       1
11643       1
13149       1
674         1
12964       1
12836       1
12962       1
1332        1
12963       1
12960       1
12278       1
13177       1
12961       1
5480        1
11574       1
11642       1
12838       1
13164       1
12750       1
Name: content_id, Length: 13129, dtype: int64

In [73]:
train_df['prior_question_had_explanation'].value_counts()

True     491583
False     56373
Name: prior_question_had_explanation, dtype: int64

In [76]:
(train_df['prior_question_elapsed_time']/1e3).max()

300.0

In [77]:
(train_df['prior_question_elapsed_time']/1e3).min()

0.0

In [78]:
(train_df['prior_question_elapsed_time']/1e3).mode()

0    17.0
dtype: float64