In [879]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier

get models

In [880]:
full_event_data_train = pd.read_csv('./models/event_data_train.csv')
full_submission_data_train = pd.read_csv('./models/submissions_data_train.csv')

In [881]:
event_data_train = pd.read_csv('./models/event_data_train.csv')
submission_data_train = pd.read_csv('./models/submissions_data_train.csv')

In [882]:
event_data_train.head()

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632
3,32811,1434340895,discovered,17632
4,32811,1434340895,viewed,17632


In [883]:
submission_data_train.user_id.nunique()

9940

prepare models

add columns 'first_action_timestamp' for user and his unique_steps number

In [884]:
fiirst_user_action = event_data_train.groupby('user_id', as_index=False) \
    .agg({'timestamp': 'min', 'step_id': 'nunique'}) \
    .rename(columns={'timestamp': 'first_action_timestamp', 'step_id': 'unique_steps'})

In [885]:
event_data_train = event_data_train.merge(fiirst_user_action, how='outer', on='user_id')

timestamp for 2 days

In [886]:
two_days_timestamp = 2 * 24 * 60 * 60
two_days_timestamp

172800

query all data suitable our timestamp border

In [887]:
event_data_train = event_data_train.query(f'(timestamp - first_action_timestamp) <= {two_days_timestamp}')

check that we dont have values that do not suite for our timestamp

In [888]:
test_time = event_data_train.groupby('user_id', as_index=False) \
    .agg({'timestamp': ['min', 'max']}) 
test_time['res'] = test_time.timestamp['max'] - test_time.timestamp['min']
test_time.res.max()

172800

In [889]:
test_time_final = event_data_train.groupby('user_id', as_index=False) \
    .agg({'timestamp': 'max', 'first_action_timestamp': 'max'}) 

test_time_final['res'] = test_time_final.timestamp - test_time_final.first_action_timestamp
test_time_final.res.max()

172800

create pivot table for users actions

In [890]:
user_action_data = event_data_train.pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count', fill_value=0).reset_index()
user_action_data

action,user_id,discovered,passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,9
2,3,15,15,4,20
3,5,1,1,0,1
4,7,1,1,0,1
...,...,...,...,...,...
19229,26790,2,2,0,2
19230,26793,1,0,1,1
19231,26794,50,50,24,90
19232,26797,10,10,2,10


In [891]:
event_data_train = event_data_train.merge(user_action_data, how='outer', on='user_id')

first action timestamp for submission data for each user

In [892]:
fiirst_user_action_sub = submission_data_train.groupby('user_id', as_index=False) \
    .agg({'timestamp': 'min'}) \
    .rename(columns={'timestamp': 'first_action_timestamp'})

In [893]:
submission_data_train = submission_data_train.merge(fiirst_user_action_sub, how='outer', on='user_id')

In [894]:
submission_data_train = submission_data_train.query(f'(timestamp - first_action_timestamp) <= {two_days_timestamp}')

pivot table for user submission_status

In [895]:
user_submission_status = submission_data_train.pivot_table(index='user_id', columns='submission_status', values='step_id', aggfunc='count', fill_value=0).reset_index()


In [896]:
submission_data_train = submission_data_train.merge(user_submission_status, how='outer', on='user_id')
submission_data_train


Unnamed: 0,step_id,timestamp,submission_status,user_id,first_action_timestamp,correct,wrong
0,31971,1434349275,correct,15853,1434346056,23,3
1,31972,1434348300,correct,15853,1434346056,23,3
2,31976,1434348123,wrong,15853,1434346056,23,3
3,31976,1434348188,correct,15853,1434346056,23,3
4,31977,1434347371,correct,15853,1434346056,23,3
...,...,...,...,...,...,...,...
134607,120745,1501946959,wrong,2615,1501946959,0,1
134608,120745,1503059927,correct,13177,1503059927,1,0
134609,120745,1506348153,wrong,15253,1506348153,0,1
134610,120745,1523035316,wrong,1424,1523035316,0,1


In [897]:
submission_data_train = submission_data_train.groupby('user_id', as_index=False) \
    .agg({'correct': 'max', 'wrong': 'max'})

In [898]:
submission_data_train

Unnamed: 0,user_id,correct,wrong
0,2,2,0
1,3,4,4
2,5,2,2
3,8,9,21
4,14,0,1
...,...,...,...
9935,26787,3,0
9936,26790,1,0
9937,26794,24,7
9938,26797,2,0


In [899]:
event_data_train = event_data_train.merge(submission_data_train, how='outer', on='user_id')

In [900]:
event_data_train = event_data_train.drop_duplicates(subset='user_id', keep='last').fillna(0)

In [901]:
event_data_train['correct_rating'] = event_data_train.correct / (event_data_train.correct + event_data_train.wrong)

actual data about user's passed steps

In [902]:
user_post_factum_data = full_event_data_train.pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count', fill_value=0).reset_index()
user_post_factum_data = user_post_factum_data.rename(columns={'passed': 'fact_passed'})
user_post_factum_data

action,user_id,discovered,fact_passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,10
2,3,91,87,30,192
3,5,11,11,4,12
4,7,1,1,0,1
...,...,...,...,...,...
19229,26790,8,8,1,9
19230,26793,1,0,1,1
19231,26794,69,69,34,180
19232,26797,10,10,2,13


In [903]:
event_data_train = event_data_train.merge(user_post_factum_data[['user_id', 'fact_passed']], how='outer', on='user_id')

if user has done 40 tasks or more, he finished the course

In [904]:
event_data_train['finished'] = event_data_train.fact_passed >= 40

In [905]:
event_data_train = event_data_train.set_index(event_data_train.user_id)
event_data_train = event_data_train.drop(columns=['step_id', 'timestamp', 'first_action_timestamp', 'action', 'user_id'])


In [906]:
event_data_train = event_data_train.fillna(0).astype('float')

In [907]:
event_data_train.value_counts('finished')

finished
0.0    14482
1.0     4752
dtype: int64

In [908]:
4752 / (4752 + 14482)

0.24706249350109183

In [909]:
event_data_train.finished = event_data_train.finished.astype('int64')

In [910]:
event_data_train

Unnamed: 0_level_0,unique_steps,discovered,passed,started_attempt,viewed,correct,wrong,correct_rating,fact_passed,finished
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17632,119.0,17.0,14.0,2.0,29.0,2.0,0.0,1.000000,86.0,1
12494,79.0,17.0,14.0,1.0,24.0,4.0,2.0,0.666667,67.0,1
442,91.0,19.0,18.0,7.0,31.0,5.0,16.0,0.238095,81.0,1
22254,60.0,27.0,27.0,10.0,36.0,8.0,15.0,0.347826,59.0,1
6646,186.0,2.0,2.0,0.0,3.0,2.0,0.0,1.000000,181.0,1
...,...,...,...,...,...,...,...,...,...,...
795,14.0,14.0,14.0,3.0,20.0,2.0,1.0,0.666667,14.0,0
13475,5.0,5.0,5.0,11.0,5.0,2.0,9.0,0.181818,5.0,0
22851,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.000000,1.0,0
943,4.0,4.0,4.0,1.0,4.0,1.0,0.0,1.000000,4.0,0


In [911]:
finished_predict = event_data_train.finished
train_dt = event_data_train.drop(columns=["finished", "fact_passed"])

In [912]:
train_dt.head()

Unnamed: 0_level_0,unique_steps,discovered,passed,started_attempt,viewed,correct,wrong,correct_rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17632,119.0,17.0,14.0,2.0,29.0,2.0,0.0,1.0
12494,79.0,17.0,14.0,1.0,24.0,4.0,2.0,0.666667
442,91.0,19.0,18.0,7.0,31.0,5.0,16.0,0.238095
22254,60.0,27.0,27.0,10.0,36.0,8.0,15.0,0.347826
6646,186.0,2.0,2.0,0.0,3.0,2.0,0.0,1.0


In [913]:
rf_clf = RandomForestClassifier()

In [914]:
params = {"n_estimators": range(10, 50, 10), "max_depth": range(1, 12, 2), "min_samples_leaf": range(1, 7), "min_samples_split": range(2, 10, 2)}

In [915]:
best_clf = GridSearchCV(rf_clf, params, cv=5, n_jobs=-1)

In [916]:
X_train, X_test, y_train, y_test = train_test_split(train_dt, finished_predict, train_size = 0.75)


In [917]:
best_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': range(1, 12, 2),
                         'min_samples_leaf': range(1, 7),
                         'min_samples_split': range(2, 10, 2),
                         'n_estimators': range(10, 50, 10)})

In [918]:
best_clf.best_params_

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 10}

In [919]:
best_clf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [920]:
best_clf.predict_proba(X_test)[:, 1]

array([0.        , 0.03339533, 0.        , ..., 0.        , 0.03339533,
       0.02401216])

In [921]:
pd.Series(best_clf.predict_proba(X_test)[:, 1])

0       0.000000
1       0.033395
2       0.000000
3       0.000000
4       1.000000
          ...   
4804    0.000000
4805    1.000000
4806    0.000000
4807    0.033395
4808    0.024012
Length: 4809, dtype: float64

In [922]:
test_event_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
test_submission_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')

In [923]:
test_user_action_data = test_event_data.pivot_table(index='user_id', columns='action', values='step_id', aggfunc='count', fill_value=0).reset_index()
test_user_action_data

action,user_id,discovered,passed,started_attempt,viewed
0,4,1,1,0,1
1,6,1,1,0,1
2,10,2,2,0,6
3,12,11,9,4,14
4,13,70,70,35,105
...,...,...,...,...,...
6179,26791,1,1,0,1
6180,26795,1,1,0,1
6181,26796,6,4,2,12
6182,26799,6,6,2,6


In [924]:
test_event_data = test_event_data.merge(test_user_action_data, how='outer', on='user_id')

In [925]:
additional_fields = test_event_data.groupby('user_id', as_index=False) \
    .agg({'step_id': 'nunique'}) \
    .rename(columns={'step_id': 'unique_steps'})

In [926]:
test_event_data = test_event_data.merge(additional_fields, how='outer', on='user_id')

In [927]:
test_submission_status = test_submission_data.pivot_table(index='user_id', columns='submission_status', values='step_id', aggfunc='count', fill_value=0).reset_index()
test_submission_status

submission_status,user_id,correct,wrong
0,12,1,0
1,13,29,36
2,15,10,30
3,21,24,103
4,35,7,35
...,...,...,...
2798,26775,46,160
2799,26780,16,7
2800,26785,3,1
2801,26796,2,3


In [928]:
test_event_data = test_event_data.merge(test_submission_status, how='outer', on='user_id')

In [929]:
test_event_data = test_event_data.drop_duplicates(subset='user_id', keep='first')

In [930]:
test_event_data = test_event_data.set_index(test_event_data.user_id)
test_event_data = test_event_data.drop(columns=['step_id', 'timestamp', 'action', 'user_id'])

In [931]:
test_event_data['correct_rating'] = test_event_data.correct / (test_event_data.correct + test_event_data.wrong)

In [932]:
test_event_data = test_event_data.fillna(0).astype('float')

In [933]:
test_event_data = test_event_data[['unique_steps', 'discovered', 'passed', 'started_attempt', 'viewed', 'correct', 'wrong', 'correct_rating']]

In [934]:
predictions_proba = best_clf.predict_proba(test_event_data)

In [935]:
predictions = best_clf.predict(test_event_data)

In [936]:
pd.Series(predictions).value_counts()

0    5769
1     415
dtype: int64

In [937]:
test_event_data.reset_index(inplace=True)


In [949]:
df = pd.DataFrame({'user_id': test_event_data.user_id, 'is_gone': predictions_proba[:, 1]})

In [950]:
df = df.sort_values('user_id')

In [951]:
df.is_gone = df.is_gone.round(2)

In [952]:
df

Unnamed: 0,user_id,is_gone
4116,4,0.00
5191,6,0.00
4443,10,0.00
1558,12,0.02
208,13,1.00
...,...,...
5044,26791,0.00
5671,26795,0.00
2269,26796,0.02
1570,26799,0.02


In [941]:
df.to_csv('./answer/predictions.csv', index=False)

In [942]:
test_event_data

Unnamed: 0,user_id,unique_steps,discovered,passed,started_attempt,viewed,correct,wrong,correct_rating
0,24417,33.0,33.0,27.0,6.0,65.0,0.0,0.0,0.0
1,999,6.0,6.0,6.0,0.0,8.0,0.0,0.0,0.0
2,26244,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,14192,13.0,13.0,13.0,0.0,22.0,0.0,0.0,0.0
4,12684,103.0,103.0,75.0,25.0,336.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
6179,7315,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
6180,24300,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
6181,820,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0
6182,3649,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
