# Рекомендация тарифов

В нашем распоряжении данные о поведении клиентов, которые уже перешли на эти тарифы. Нужно построить модель для задачи классификации, которая выберет подходящий тариф. Предобработка данных не понадобится — мы её уже сделали.

Построим модель с максимально большим значением *accuracy*.

## Откроем и изучим файл

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score)
from sklearn.model_selection import (StratifiedShuffleSplit, GridSearchCV) #подбор гипер параметров
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/datasets/users_behavior.csv')

df.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [3]:
df.isna().sum()

calls       0
minutes     0
messages    0
mb_used     0
is_ultra    0
dtype: int64

###### Пропусков не иммеется

In [4]:
df.describe()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
count,3214.0,3214.0,3214.0,3214.0,3214.0
mean,63.038892,438.208787,38.281269,17207.673836,0.306472
std,33.236368,234.569872,36.148326,7570.968246,0.4611
min,0.0,0.0,0.0,0.0,0.0
25%,40.0,274.575,9.0,12491.9025,0.0
50%,62.0,430.6,30.0,16943.235,0.0
75%,82.0,571.9275,57.0,21424.7,1.0
max,244.0,1632.06,224.0,49745.73,1.0


###### Данные корректны

## Разобьём данные на выборки

In [5]:
x = df.drop('is_ultra', axis='columns')
y = df.is_ultra


x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=12345, test_size=0.25)

## Исследуем модели

###### Не люблю подбирать руками гиперпараметры, так что пусть за меня это сделает машина. Если бы df был бы пошире (в плане колонок) - я бы ещё и их урезал и оставил только, например top-5 из 10, но их тут немного, оставим.

In [6]:
cv=StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=0.25,
            train_size=None)

In [7]:
%%time

dtc = DecisionTreeClassifier(max_leaf_nodes=4, random_state=15)

parameters_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_leaf_nodes': range(2,11,1),
    'min_samples_split': range(1, 11,1)
}

dtc_grid_cv = GridSearchCV(dtc, parameters_grid, scoring = 'accuracy', cv = cv)

dtc_grid_cv.fit(x_train, y_train)

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 898, in fit
    super().fit(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 237, in fit
    raise ValueError("min_samples_split must be an integer "
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 898, in fit
    super().fit(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 237, in fit
    raise ValueError("min_samp

CPU times: user 14.5 s, sys: 160 ms, total: 14.6 s
Wall time: 14.8 s


Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 898, in fit
    super().fit(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 347, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'log_loss'

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 898, in fit
    super().fit(
  File "/opt/conda/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 347, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'log_loss'

Traceback (most recent call last

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=0.25,
            train_size=None),
             estimator=DecisionTreeClassifier(max_leaf_nodes=4,
                                              random_state=15),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_leaf_nodes': range(2, 11),
                         'min_samples_split': range(1, 11)},
             scoring='accuracy')

In [8]:
print(f'Наилучшая точность: {round(dtc_grid_cv.best_score_, 3)}')
print(f'Наилучшая модель: {dtc_grid_cv.best_estimator_}') 
print(f'Наилучшие параметры: {dtc_grid_cv.best_params_}') 

Наилучшая точность: 0.804
Наилучшая модель: DecisionTreeClassifier(max_leaf_nodes=9, random_state=15)
Наилучшие параметры: {'criterion': 'gini', 'max_leaf_nodes': 9, 'min_samples_split': 2}


In [9]:
%%time

rfc = RandomForestClassifier(random_state=12345)

parameters_grid = {
    'n_estimators': range(50, 160, 50),
    'max_leaf_nodes': range(1,13,3),
    'min_samples_split': range(1, 13,3)
}

rfc_grid_cv = GridSearchCV(rfc, parameters_grid, scoring = 'accuracy', cv = cv)

rfc_grid_cv.fit(x_train, y_train)

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/opt/conda/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/conda/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/conda/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/conda/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "/opt/conda/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 572, in __init_

CPU times: user 1min 6s, sys: 295 ms, total: 1min 7s
Wall time: 1min 7s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=0.25,
            train_size=None),
             estimator=RandomForestClassifier(random_state=12345),
             param_grid={'max_leaf_nodes': range(1, 13, 3),
                         'min_samples_split': range(1, 13, 3),
                         'n_estimators': range(50, 160, 50)},
             scoring='accuracy')

In [10]:
print(f'Наилучшая точность: {round(rfc_grid_cv.best_score_, 3)}')
print(f'Наилучшая модель: {rfc_grid_cv.best_estimator_}') 
print(f'Наилучшие параметры: {rfc_grid_cv.best_params_}') 

Наилучшая точность: 0.804
Наилучшая модель: RandomForestClassifier(max_leaf_nodes=10, min_samples_split=4,
                       random_state=12345)
Наилучшие параметры: {'max_leaf_nodes': 10, 'min_samples_split': 4, 'n_estimators': 100}


In [11]:
%%time

knn = KNeighborsClassifier(n_neighbors=11)

#сетка гипер параметров
parameters_grid = {
    'n_neighbors': range(3, 19, 1),
    'p': range(1,2+1,1)
}

#с помощью grid... подберем гипер параметры для knn

knn_grid_cv = GridSearchCV(knn, parameters_grid, scoring = 'accuracy', cv = cv)

knn_grid_cv.fit(x_train, y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

CPU times: user 6.17 s, sys: 141 ms, total: 6.31 s
Wall time: 6.53 s


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=0.25,
            train_size=None),
             estimator=KNeighborsClassifier(n_neighbors=11),
             param_grid={'n_neighbors': range(3, 19), 'p': range(1, 3)},
             scoring='accuracy')

In [12]:
print(f'Наилучшая точность: {round(knn_grid_cv.best_score_, 3)}')
print(f'Наилучшая модель: {knn_grid_cv.best_estimator_}') 
print(f'Наилучшие параметры: {knn_grid_cv.best_params_}') 

Наилучшая точность: 0.769
Наилучшая модель: KNeighborsClassifier(n_neighbors=11, p=1)
Наилучшие параметры: {'n_neighbors': 11, 'p': 1}


In [13]:
knn_best = knn_grid_cv.best_estimator_
dtc_best = dtc_grid_cv.best_estimator_
rfc_best = rfc_grid_cv.best_estimator_

models_best = [rfc_best]

## Проверим модель на тестовой выборке

In [14]:
for model in models_best:
    print(f'Показатели модели {model}: \n')
    print(f'Метрика roc_auc_score: {roc_auc_score(y_test, model.predict(x_test))}')
    print(f'Метрика accuracy_score: {accuracy_score(y_test, model.predict(x_test))}')
    print(f'Метрика precision_score: {precision_score(y_test, model.predict(x_test))}')
    print(f'Метрика recall_score: {recall_score(y_test, model.predict(x_test))}\n')

Показатели модели RandomForestClassifier(max_leaf_nodes=10, min_samples_split=4,
                       random_state=12345): 

Метрика roc_auc_score: 0.7137297966583876
Метрика accuracy_score: 0.8034825870646766
Метрика precision_score: 0.7712418300653595
Метрика recall_score: 0.4896265560165975

