# Рекомендация тарифов

В вашем распоряжении данные о поведении клиентов, которые уже перешли на эти тарифы (из проекта курса «Статистический анализ данных»). Нужно построить модель для задачи классификации, которая выберет подходящий тариф. Предобработка данных не понадобится — вы её уже сделали.

Постройте модель с максимально большим значением *accuracy*. Чтобы сдать проект успешно, нужно довести долю правильных ответов по крайней мере до 0.75. Проверьте *accuracy* на тестовой выборке самостоятельно.

## 1. Откройте и изучите файл

In [230]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

df = pd.read_csv('/datasets/users_behavior.csv')
df.head()
df.info()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
calls       3214 non-null float64
minutes     3214 non-null float64
messages    3214 non-null float64
mb_used     3214 non-null float64
is_ultra    3214 non-null int64
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


Выборка состоит из 3214 объектов и 5 признаков, один из которых является целевым (is_ultra). Нам предстоит бинарная классификация - в зависимости от рекомендаций.

## 2. Разбейте данные на выборки

#### Исходя из стандартной пропорции 3:1:1 разобъем для начала на обучающую и валидационную, в пропорции 60/40.

In [231]:
df_train, df_valid_test=train_test_split(df,train_size=0.6,random_state=27)

#### Валидационную выборку разобъем пополам, на валидационную и тестовую

In [232]:
df_valid,df_test=train_test_split(df_valid_test,test_size=0.5,random_state=27)

#### Проверим размеры выборок

In [233]:
len(df_train)
len(df_valid)
len(df_test)

1928

643

643

#### Сформируем датасет объектов и целевого признака для трех выборок

In [234]:
features_train=df_train.drop(['is_ultra'],axis=1)
target_train=df_train['is_ultra']

In [235]:
features_valid=df_valid.drop(['is_ultra'],axis=1)
target_valid=df_valid['is_ultra'] 

In [236]:
features_test=df_valid.drop(['is_ultra'],axis=1)  
target_test=df_valid['is_ultra']

## 3. Исследуйте модели

#### Посмотрим на модель дерева решений

In [237]:
for depth in range(1,15,2):
    model_tree=DecisionTreeClassifier(criterion='gini',random_state=27, max_depth=depth) 
    model_tree.fit(features_train,target_train) 
    prediction_tree=model_tree.predict(features_valid) 
    print('max_depth:',depth,end='')
    print(' accuracy:',accuracy_score(prediction_tree,target_valid))
    

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 1 accuracy: 0.7682737169517885


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 3 accuracy: 0.807153965785381


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 5 accuracy: 0.8087091757387247


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 7 accuracy: 0.8118195956454122


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 9 accuracy: 0.8087091757387247


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 11 accuracy: 0.7931570762052877


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

max_depth: 13 accuracy: 0.7947122861586314


##### Лучший результат дерева решений: max_depth: 7 accuracy: 0.8118195956454122

#### Посмотрим на модель случайного леса

In [238]:
for depth in range(7,16,1):    
    model_forest=RandomForestClassifier(max_depth=depth,n_estimators=16,random_state=27,min_samples_leaf=1) 
    model_forest.fit(features_train,target_train)
    prediction_forest=model_forest.predict(features_valid) 
    print('max_depth:',depth,end='')
    print(' accuracy:',accuracy_score(prediction_forest,target_valid))
    

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 7 accuracy: 0.8195956454121306


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 8 accuracy: 0.8195956454121306


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 9 accuracy: 0.8180404354587869


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 10 accuracy: 0.8242612752721618


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 11 accuracy: 0.8149300155520995


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 12 accuracy: 0.8195956454121306


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=13, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 13 accuracy: 0.8211508553654744


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=14, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 14 accuracy: 0.8180404354587869


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

max_depth: 15 accuracy: 0.8258164852255054


##### Лучший результат максимальной глубины дерева - max_depth: 15 accuracy: 0.8258164852255054

In [239]:
for estim in range(17,22,1):    
    model_forest=RandomForestClassifier(max_depth=15,n_estimators=estim,random_state=27,min_samples_leaf=1) 
    model_forest.fit(features_train,target_train)
    prediction_forest=model_forest.predict(features_valid) 
    print('n_estimators:',estim,end="")
    print(' accuracy:',accuracy_score(prediction_forest,target_valid))
    

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=17,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

n_estimators: 17 accuracy: 0.8164852255054432


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=18,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

n_estimators: 18 accuracy: 0.8227060653188181


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

n_estimators: 19 accuracy: 0.8258164852255054


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

n_estimators: 20 accuracy: 0.8258164852255054


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=21,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

n_estimators: 21 accuracy: 0.8258164852255054


##### Лучший результат кол-ва деревьев в лесу - n_estimators: n_estimators: 19 accuracy: 0.8258164852255054

In [240]:
for leaf in range(1,8,1):    
    model_forest=RandomForestClassifier(max_depth=15,n_estimators=19,random_state=27,min_samples_leaf=leaf) 
    model_forest.fit(features_train,target_train)
    prediction_forest=model_forest.predict(features_valid) 
    print('min_samples_leaf:',leaf,end="")
    print(' accuracy:',accuracy_score(prediction_forest,target_valid))
    

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 1 accuracy: 0.8258164852255054


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 2 accuracy: 0.8242612752721618


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 3 accuracy: 0.8133748055987559


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 4 accuracy: 0.8211508553654744


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 5 accuracy: 0.8211508553654744


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 6 accuracy: 0.8133748055987559


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

min_samples_leaf: 7 accuracy: 0.8149300155520995


##### Лучший результат минимального количества выборок, необходимых для работы на листовом узле - min_samples_leaf: 1 accuracy: 0.8258164852255054

In [241]:
model_forest=RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None) 
model_forest.fit(features_train,target_train)
prediction_forest=model_forest.predict(features_valid) 
#print('min_samples_leaf:',leaf,end="")
print(' accuracy:',accuracy_score(prediction_forest,target_valid))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

 accuracy: 0.7900466562986003


##### Для модели случайного леса методом перебора интервалов подобрали следующие лучшие характиристики: max_depth=15, n_estimators=19, min_samples_leaf=1. Лучший результат: 0.82. Модель со значениями на режиме "авто" дает результат: 0,79 - старания не напрасны.

#### В данном методе выберем гиперпараметры penalty (l1, l2) и solver (liblinear, lbfgs)

In [242]:
model_logistic_Reg=LogisticRegression(penalty = 'l2' , solver='lbfgs') 
model_logistic_Reg.fit(features_train,target_train) 
predict_LogisticReg=model_logistic_Reg.predict(features_valid) 
print('accuracy:',accuracy_score(predict_LogisticReg,target_valid))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

accuracy: 0.7262830482115086


In [243]:
model_logistic_Reg=LogisticRegression(penalty = 'l1', solver='liblinear') 
model_logistic_Reg.fit(features_train,target_train) 
predict_LogisticReg=model_logistic_Reg.predict(features_valid) 
print('accuracy:',accuracy_score(predict_LogisticReg,target_valid))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

accuracy: 0.7729393468118196


##### Вывод: на данном наборе данных самой эффективной на обучающем и валидационным выборках стала модель случайного леса.

## 4. Проверьте модель на тестовой выборке

In [244]:
model_tree=DecisionTreeClassifier(criterion='gini',random_state=27, max_depth=7)
model_tree.fit(features_train,target_train)
prediction_tree=model_tree.predict(features_test)
#print('max_depth:',depth,end='')
print(' accuracy:',accuracy_score(prediction_tree,target_test))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=27, splitter='best')

 accuracy: 0.8118195956454122


In [245]:
model_forest=RandomForestClassifier(max_depth=15,n_estimators=19,random_state=27,min_samples_leaf=1)
model_forest.fit(features_train,target_train)
prediction_forest=model_forest.predict(features_test)
#print('n_estimators:',estim,end="")
print(' accuracy:',accuracy_score(prediction_forest,target_test))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=27, verbose=0,
                       warm_start=False)

 accuracy: 0.8258164852255054


In [246]:
model_logistic_Reg=LogisticRegression(penalty = 'l1', solver='liblinear')
model_logistic_Reg.fit(features_train,target_train)
predict_LogisticReg=model_logistic_Reg.predict(features_test)
print('accuracy:',accuracy_score(predict_LogisticReg,target_test))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

accuracy: 0.7729393468118196


##### Модели на тестовых выборках дают значения равные выборкам для кросс-валидации 

## Чек-лист готовности проекта

Поставьте 'x' в выполненных пунктах. Далее нажмите Shift+Enter.

- [x] Jupyter Notebook открыт
- [x] Весь код исполняется без ошибок
- [x] Ячейки с кодом расположены в порядке исполнения
- [x] Выполнено задание 1: данные загружены и изучены
- [x] Выполнено задание 2: данные разбиты на три выборки
- [x] Выполнено задание 3: проведено исследование моделей
    - [x] Рассмотрено больше одной модели
    - [x] Рассмотрено хотя бы 3 значения гипепараметров для какой-нибудь модели
    - [x] Написаны выводы по результатам исследования
- [x] Выполнено задание 3: Проведено тестирование
- [x] Удалось достичь accuracy не меньше 0.75
