In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import sklearn.externals
import joblib

import sys
import os

from preprocessing.prepare_data import get_x_y

In [2]:
events = pd.read_csv('data/event_data_train.zip')
submissions = pd.read_csv('data/submissions_data_train.zip')

### Raw events data:

In [3]:
events.head(3)

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632


### Raw submissions data:

In [4]:
submissions.head(3)

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1434349275,correct,15853
1,31972,1434348300,correct,15853
2,31972,1478852149,wrong,15853


In [5]:

X, y = get_x_y(events,submissions)


In [6]:
print("Shape X:", X.shape)
X.head(3)

Shape X: (19234, 7)


Unnamed: 0_level_0,correct,wrong,discovered,passed,started_attempt,viewed,day
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,1,0,0,1,1
2,2.0,0.0,9,9,2,9,1
3,4.0,4.0,15,15,4,20,1


In [7]:
print("Shape y:", y.shape)
y.value_counts(dropna=False)

Shape y: (19234,)


False    17310
True      1924
Name: is_gone, dtype: int64

### Splitting

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Обучение и проверка модели

## RandomForest

In [9]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2,
                            min_samples_leaf=10, min_samples_split=10,
                            class_weight='balanced')

rf.fit(X_train, y_train)
pred_prob = rf.predict_proba(X_test)

### Evaluating on roc auc:

In [10]:
print('ROC AUC test score', roc_auc_score(y_test, pred_prob[:, 1]))

ROC AUC test score 0.8836108905889145


### Saving model

In [11]:
joblib.dump(rf,'data/random_forest.bin')


['data/random_forest.bin']