## Псотроение модели предсказания оттока пользователей

### import библиотек

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

### Подгружаем train датасеты, полученные на предыдущем шаге

In [3]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [4]:
X_train

Unnamed: 0,user_id,n_viewed,n_discovered,n_passed,n_started_attempt,n_steps,n_correct,n_wrong
0,1,1.0,1.0,0.0,0.0,1,0.0,0.0
1,2,9.0,9.0,9.0,2.0,9,2.0,0.0
2,3,20.0,15.0,15.0,4.0,15,4.0,4.0
3,5,1.0,1.0,1.0,0.0,1,0.0,0.0
4,7,1.0,1.0,1.0,0.0,1,0.0,0.0
...,...,...,...,...,...,...,...,...
19229,26790,6.0,6.0,6.0,1.0,6,1.0,0.0
19230,26793,1.0,1.0,0.0,1.0,1,0.0,0.0
19231,26794,134.0,61.0,61.0,32.0,61,31.0,9.0
19232,26797,10.0,10.0,10.0,2.0,10,2.0,0.0


In [5]:
y_train

Unnamed: 0,user_id,is_completed_course
0,1,0
1,2,0
2,3,1
3,5,0
4,7,0
...,...,...
19229,26790,0
19230,26793,0
19231,26794,1
19232,26797,0


In [6]:
X_train.corr()

Unnamed: 0,user_id,n_viewed,n_discovered,n_passed,n_started_attempt,n_steps,n_correct,n_wrong
user_id,1.0,-0.006687,-0.003158,-0.004559,-4.3e-05,-0.003473,-0.002,-0.003679
n_viewed,-0.006687,1.0,0.893119,0.882922,0.8324,0.896572,0.804014,0.63916
n_discovered,-0.003158,0.893119,1.0,0.989159,0.895011,0.999512,0.879153,0.578789
n_passed,-0.004559,0.882922,0.989159,1.0,0.912259,0.9886,0.926303,0.592832
n_started_attempt,-4.3e-05,0.8324,0.895011,0.912259,1.0,0.895319,0.930222,0.687295
n_steps,-0.003473,0.896572,0.999512,0.9886,0.895319,1.0,0.878566,0.578417
n_correct,-0.002,0.804014,0.879153,0.926303,0.930222,0.878566,1.0,0.621011
n_wrong,-0.003679,0.63916,0.578789,0.592832,0.687295,0.578417,0.621011,1.0


### Набллюдаем высокую корреляцию признаков, что означает что большая их часть будет только мешать обучению 

### Поэтому уменьшаем количество признаков, оставляя самые важные

#### Устанавливаем user_id в качестве индекса у обоих датасетов

In [7]:
y_train = y_train.set_index('user_id')
X_train = X_train.set_index('user_id')

#### Создаем фичу correct_ratio, как отношение верных ответов ко всем ответам

In [8]:
X_train['correct_ratio'] = X_train.n_correct / (X_train.n_correct + X_train.n_wrong)

#### Удаляем малозначимые колонки

In [9]:
#drop_columns = ['n_started_attempt', 'n_correct', 'n_wrong', 'n_viewed', 'n_discovered', 'n_passed']

In [10]:
#X_train = X_train.drop(drop_columns, axis = 1)

#### На всякий случай избавляемся от NaN значение 

In [11]:
X_train = X_train.fillna(0)
X_def = X_train.copy(deep=True)
X_train

Unnamed: 0_level_0,n_viewed,n_discovered,n_passed,n_started_attempt,n_steps,n_correct,n_wrong,correct_ratio
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1.0,1.0,0.0,0.0,1,0.0,0.0,0.000000
2,9.0,9.0,9.0,2.0,9,2.0,0.0,1.000000
3,20.0,15.0,15.0,4.0,15,4.0,4.0,0.500000
5,1.0,1.0,1.0,0.0,1,0.0,0.0,0.000000
7,1.0,1.0,1.0,0.0,1,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...
26790,6.0,6.0,6.0,1.0,6,1.0,0.0,1.000000
26793,1.0,1.0,0.0,1.0,1,0.0,0.0,0.000000
26794,134.0,61.0,61.0,32.0,61,31.0,9.0,0.775000
26797,10.0,10.0,10.0,2.0,10,2.0,0.0,1.000000


### Делим данные на train и test выборки для уменьшения вероятности переобучения модели

In [12]:
# разделение данных для обучения на train и test
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.2)

### Ищем оптимальные значения параметров для RandomForest с помощью перебора последних

In [13]:
%%time
rf = RandomForestClassifier()

# Валидируемые параметры: 
params = {'n_estimators' : range(20, 100, 10), 'max_depth' : range(1, 5), 'min_samples_leaf' : range(1, 5)}

# Осуществляем поиск оптимальной комбинации параметров по метрике roc_auc
clf = GridSearchCV(rf, params, n_jobs = -1, cv = 3, scoring = 'roc_auc').fit(X_tr, y_tr.is_completed_course)

Wall time: 34.9 s


#### Лучшие параметры:

In [14]:
clf.best_params_

{'max_depth': 4, 'min_samples_leaf': 4, 'n_estimators': 60}

### roc auc score на test выборке и на всем наборе данных 

In [15]:
roc_auc_score(y_tst, clf.best_estimator_.predict_proba(X_tst)[:,1])

0.888102302897212

In [16]:
roc_auc_score(y_train, clf.best_estimator_.predict_proba(X_train)[:,1])

0.8861894089019527

### Значимость фичей:

In [17]:
clf.best_estimator_.feature_importances_

array([0.09289185, 0.28723567, 0.19702066, 0.03552701, 0.15299939,
       0.19585964, 0.01912366, 0.01934212])

### Попробуем удалить наименее значимые колонки ( Значимость которых меньше 0.1)

#### Удаляем малозначимые колонки

In [18]:
drop_columns = ['n_viewed', 'n_started_attempt', 'n_wrong', 'correct_ratio']

In [19]:
X_train = X_train.drop(drop_columns, axis = 1)

In [20]:
X_train

Unnamed: 0_level_0,n_discovered,n_passed,n_steps,n_correct
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,0.0,1,0.0
2,9.0,9.0,9,2.0
3,15.0,15.0,15,4.0
5,1.0,1.0,1,0.0
7,1.0,1.0,1,0.0
...,...,...,...,...
26790,6.0,6.0,6,1.0
26793,1.0,0.0,1,0.0
26794,61.0,61.0,61,31.0
26797,10.0,10.0,10,2.0


In [21]:
# разделение данных для обучения на train и test
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.2)

In [22]:
%%time
rf = RandomForestClassifier()

# Валидируемые параметры: 
params = {'n_estimators' : range(20, 100, 10), 'max_depth' : range(1, 5), 'min_samples_leaf' : range(1, 5)}

# Осуществляем поиск оптимальной комбинации параметров по метрике roc_auc
clf = GridSearchCV(rf, params, n_jobs = -1, cv = 3, scoring = 'roc_auc').fit(X_tr, y_tr.is_completed_course)

Wall time: 33.8 s


In [23]:
clf.best_params_

{'max_depth': 4, 'min_samples_leaf': 4, 'n_estimators': 70}

In [24]:
roc_auc_score(y_tst, clf.best_estimator_.predict_proba(X_tst)[:,1])

0.8883531963432676

In [25]:
roc_auc_score(y_train, clf.best_estimator_.predict_proba(X_train)[:,1])

0.8846775082719502

### Значимость фичей:

In [26]:
clf.best_estimator_.feature_importances_

array([0.14090147, 0.43634039, 0.30087733, 0.1218808 ])

### Попробуем оставить 2 фичи: n_passed и correct_ratio

In [28]:
X_train = X_def[['n_passed', 'correct_ratio']]

In [29]:
# разделение данных для обучения на train и test
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, test_size=0.2)

In [30]:
%%time
rf = RandomForestClassifier()

# Валидируемые параметры: 
params = {'n_estimators' : range(20, 100, 10), 'max_depth' : range(1, 5), 'min_samples_leaf' : range(1, 5)}

# Осуществляем поиск оптимальной комбинации параметров по метрике roc_auc
clf = GridSearchCV(rf, params, n_jobs = -1, cv = 3, scoring = 'roc_auc').fit(X_tr, y_tr.is_completed_course)

Wall time: 32.6 s


In [31]:
clf.best_params_

{'max_depth': 4, 'min_samples_leaf': 3, 'n_estimators': 90}

In [32]:
roc_auc_score(y_tst, clf.best_estimator_.predict_proba(X_tst)[:,1])

0.8879901761639943

In [33]:
roc_auc_score(y_train, clf.best_estimator_.predict_proba(X_train)[:,1])

0.8853160739425321

### Значимость фичей:

In [34]:
clf.best_estimator_.feature_importances_

array([0.6991558, 0.3008442])