# Итоговый проект по курсу от Megafon

### Задание
У нас появился запрос из отдела продаж и маркетинга. Как вы знаете «МегаФон» предлагает обширный набор различных услуг своим абонентам. При этом разным пользователям интересны разные услуги. Поэтому необходимо построить алгоритм, который для каждой пары пользователь-услуга определит вероятность подключения услуги.
### Данные
В качестве исходных данных вам будет доступна информация об отклике
абонентов на предложение подключения одной из услуг. Каждому пользователю может быть сделано несколько предложений в разное время, каждое из которых он может или принять, или отклонить.
Отдельным набором данных будет являться нормализованный
анонимизированный набор признаков, характеризующий профиль потребления абонента. Эти данные привязаны к определенному времени, поскольку профиль абонента может меняться с течением времени.

In [2]:
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import datetime as dtm
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")
import pickle

In [3]:
data_train = pd.read_csv('data_train.csv')
data_test = pd.read_csv('data_test.csv')

In [4]:
%%time
df_features = dd.read_csv('features.csv', sep='\t')

Wall time: 437 ms


#### Визуальный просмотр данных и их размерностей

In [5]:
df_features.head()

Unnamed: 0.1,Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,...,243,244,245,246,247,248,249,250,251,252
0,0,2013026,1531688400,18.910029,46.980888,4.969214,-1.386798,3.791754,-14.01179,-16.08618,...,-977.373846,-613.770792,-25.996269,-37.630448,-301.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
1,1,2014722,1539550800,36.690029,152.400888,448.069214,563.833202,463.841754,568.99821,-16.08618,...,-891.373846,-544.770792,-20.996269,48.369552,80.252276,-13.832889,-0.694428,-1.175933,-0.45614,0.0
2,2,2015199,1545598800,-67.019971,157.050888,-63.180786,178.103202,-68.598246,156.99821,3.51382,...,-977.373846,-613.770792,-12.996269,-37.630448,10829.252276,-25.832889,-0.694428,-12.175933,-0.45614,0.0
3,3,2021765,1534107600,7.010029,150.200888,-6.930786,216.213202,76.621754,351.84821,-16.08618,...,-973.373846,-613.770792,-23.996269,-37.630448,-205.747724,-24.832889,-0.694428,-11.175933,-0.45614,1.0
4,4,2027465,1533502800,-90.439971,134.220888,-104.380786,153.643202,-109.798246,132.53821,-16.08618,...,1643.626154,2007.229208,206.003731,-21.630448,6667.252276,92.167111,-0.694428,49.824067,47.54386,0.0


In [6]:
data_train.head()

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,540968,8.0,1537131600,0.0
1,1,1454121,4.0,1531688400,0.0
2,2,2458816,1.0,1534107600,0.0
3,3,3535012,5.0,1535922000,0.0
4,4,1693214,1.0,1535922000,0.0


In [7]:
data_train['buy_time'].unique()

array([1537131600, 1531688400, 1534107600, 1535922000, 1532898000,
       1544994000, 1545598800, 1534712400, 1541970000, 1538341200,
       1533502800, 1541365200, 1539550800, 1536526800, 1546203600,
       1532293200, 1544389200, 1542574800, 1537736400, 1535317200,
       1540760400, 1531083600, 1538946000, 1540155600, 1543784400,
       1543179600], dtype=int64)

In [8]:
df_features['buy_time'].drop_duplicates().compute()

0      1531688400
1      1539550800
2      1545598800
3      1534107600
4      1533502800
6      1544994000
7      1537736400
9      1544389200
10     1536526800
11     1535317200
13     1534712400
14     1542574800
16     1540760400
18     1540155600
23     1541365200
26     1537131600
28     1531083600
33     1538341200
34     1532293200
40     1546203600
46     1535922000
51     1538946000
55     1547413200
57     1543784400
64     1548018000
67     1541970000
69     1532898000
148    1546808400
165    1543179600
Name: buy_time, dtype: int64

In [9]:
data_train.shape

(831653, 5)

In [11]:
%%time
df_features.shape[0].compute()

Wall time: 8min 45s


4512528

#### Слияние данных

In [25]:
id_in_train = list(data_train.id)
id_in_test = list(data_test.id)

In [26]:
%%time
df_features_for_train = df_features[df_features.id.isin(id_in_train)]

Wall time: 13.3 s


In [27]:
%%time
df_features_for_train_in_pd = df_features_for_train.compute()

Wall time: 9min 24s


In [28]:
%%time
df_features_for_test = df_features[df_features.id.isin(id_in_test)]

Wall time: 2.23 s


In [29]:
%%time
df_features_for_test_in_pd = df_features_for_test.compute()

Wall time: 9min 15s


In [30]:
data_train.head()

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,540968,8.0,1537131600,0.0
1,1,1454121,4.0,1531688400,0.0
2,2,2458816,1.0,1534107600,0.0
3,3,3535012,5.0,1535922000,0.0
4,4,1693214,1.0,1535922000,0.0


In [31]:
df_features_for_test_in_pd.shape

(72552, 256)

In [32]:
sort_data_train = data_train.sort_values(by='id')
sort_df_features_for_train_in_pd = df_features_for_train_in_pd.sort_values(by='id')

In [33]:
sort_data_test = data_test.sort_values(by='id')
sort_df_features_for_test_in_pd = df_features_for_test_in_pd.sort_values(by='id')

In [34]:
%%time
full_train = pd.merge_asof(sort_data_train, sort_df_features_for_train_in_pd, on='id', by='buy_time', direction='nearest')

Wall time: 53.9 s
Parser   : 141 ms


In [35]:
%%time
full_test = pd.merge_asof(sort_data_test, sort_df_features_for_test_in_pd, on='id', by='buy_time', direction='nearest')

Wall time: 2.56 s


In [36]:
full_test.head()

Unnamed: 0,Unnamed: 0_x,id,vas_id,buy_time,Unnamed: 0_y,0,1,2,3,4,...,243,244,245,246,247,248,249,250,251,252
0,87,55,2.0,1547413200,2266581,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
1,98,64,4.0,1548018000,3102545,-93.799971,-337.249112,-107.740786,-360.686798,-113.158246,...,934.626154,-611.770792,-25.996269,3571.369552,-120.747724,25.167111,-0.694428,-12.175933,-0.45614,1.0
2,278,151,2.0,1547413200,2266581,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
3,552,274,4.0,1548018000,3102545,-93.799971,-337.249112,-107.740786,-360.686798,-113.158246,...,934.626154,-611.770792,-25.996269,3571.369552,-120.747724,25.167111,-0.694428,-12.175933,-0.45614,1.0
4,551,274,2.0,1547413200,2266581,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0


In [37]:
del df_features
del data_train
del df_features_for_train
del df_features_for_test
del df_features_for_train_in_pd
del df_features_for_test_in_pd
del sort_data_train
del sort_df_features_for_train_in_pd
del sort_data_test
del sort_df_features_for_test_in_pd

In [38]:
full_train.describe()

Unnamed: 0,Unnamed: 0_x,id,vas_id,buy_time,target,Unnamed: 0_y,0,1,2,3,...,243,244,245,246,247,248,249,250,251,252
count,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,...,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0,831653.0
mean,415826.0,2158304.0,2.686185,1538937000.0,0.072369,2254994.0,2.062392,5.161329,1.987254,5.766886,...,-42.733928,-28.101598,-2.797341,-4.402215,-11.587021,-1.660674,-0.039381,0.884418,0.013525,0.268059
std,240077.686048,1257907.0,1.952034,5019053.0,0.259098,1302122.0,302.308236,495.918888,309.148563,576.275542,...,3110.526314,2146.368014,837.827712,245.841644,1231.708277,189.573153,7.176362,149.893098,18.732853,2.067714
min,0.0,2.0,1.0,1531084000.0,0.0,13.0,-1151.889971,-1447.969112,-2651.840786,-1488.716798,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
25%,207913.0,1067319.0,1.0,1534108000.0,0.0,1127703.0,-96.799971,-204.789112,-110.740786,-238.766798,...,-977.373846,-613.770792,-25.996269,-37.630448,-305.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
50%,415826.0,2144574.0,2.0,1538341000.0,0.0,2251585.0,-84.089971,-56.489112,-83.620786,-75.166798,...,-960.373846,-613.770792,-25.996269,-37.630448,-240.747724,-23.832889,-0.694428,-12.175933,-0.45614,0.0
75%,623739.0,3244254.0,4.0,1544389000.0,0.0,3380386.0,12.960029,123.190888,28.699214,131.583202,...,-517.373846,-423.770792,-24.996269,-33.630448,-83.747724,-10.832889,-0.694428,-8.175933,-0.45614,1.0
max,831652.0,4362694.0,9.0,1546204000.0,1.0,4512484.0,211730.720029,212606.130888,211716.779214,213357.743202,...,161254.626154,145602.229208,214933.003731,37358.369552,141994.252276,34368.167111,2285.305572,33050.824067,9489.54386,1743.0


In [39]:
# Столбцы 'Unnamed: 0_x' и 'Unnamed: 0_y' удалим, т.к. они не являются признаками
full_train.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], inplace=True)
full_test.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], inplace=True)

In [45]:
name_columns_in_train = list(full_train.columns)

full_len = full_train.shape[0]
for column_name in name_columns_in_train:
    if full_train[column_name].count() != full_len:
        count_nan = full_len - full_train[column_name].count()
        print('В столбце {column_name} пропущено {count_nan} значений')

В тренировочном датасете нет пропущенных данных

In [46]:
full_train.target.value_counts()

0.0    771467
1.0     60186
Name: target, dtype: int64

В итоге видим сильный дисбаланс классов

In [47]:
sort_train = full_train.sort_values(by='buy_time')

In [48]:
print(dtm.datetime.fromtimestamp(min(sort_train.buy_time)))
print(dtm.datetime.fromtimestamp(max(sort_train.buy_time)))

2018-07-09 00:00:00
2018-12-30 23:00:00


In [49]:
max(sort_train.buy_time)

1546203600

In [50]:
int(dtm.datetime.strptime('01.12.2018 00:00:00', '%d.%m.%Y %H:%M:%S').timestamp())

1543615200

In [51]:
timestamp_separation = int(dtm.datetime.strptime('01.12.2018 00:00:00', '%d.%m.%Y %H:%M:%S').timestamp())
df_train = sort_train[sort_train.buy_time < timestamp_separation]
df_test = sort_train[sort_train.buy_time >= timestamp_separation]

In [52]:
df_train.target.value_counts()

0.0    546089
1.0     37984
Name: target, dtype: int64

In [53]:
df_test.target.value_counts()

0.0    225378
1.0     22202
Name: target, dtype: int64

In [54]:
X_train = df_train.drop(columns=['target'])
y_train = df_train.target
X_test = df_test.drop(columns=['target'])
y_test_real = df_test.target

In [55]:
del df_train
del df_test

In [57]:
list_name_features = list(X_train.columns)[3:]

In [58]:
# Функция создания, обучения модели, формирования предсказаний и подсчета метрик
def model_preds(model, X_train, y_train, X_test, y_test_real):
    model_class = RandomForestClassifier(random_state=21, class_weight={0 : 1, 1 : 10})
    model_class.fit(X_train, y_train)
    y_preds = model_class.predict(X_test)
    metrics = f1_score(y_test_real, y_preds, average='macro')
    return model_class, y_preds, metrics

In [60]:
# Датафрейм с результатами
columns_name = ['model', 'standart', 'with_std', 'with_PCA_and_std', 
                'for_slice', 'for_slice_with_std', 'for_slice_with_std_and_PCA']
df_metrics_for_models = pd.DataFrame(columns=columns_name)
df_metrics_for_models

Unnamed: 0,model,standart,with_std,with_PCA_and_std,for_slice,for_slice_with_std,for_slice_with_std_and_PCA


#### Рассмотрим Random Forest без дополнительных параметров

In [None]:
%%time
model_rf = RandomForestClassifier(random_state=21)
trained_model_rf, y_preds_rf, metrics_rf = model_preds(model_rf, X_train, y_train, X_test, y_test_real)
metrics_rf

In [None]:
# Проверка модели Random Forest с весами для классов
%%time
model_rf_with_std = RandomForestClassifier(random_state=21, class_weight={0 : 1, 1 : 10})
trained_model_rf_with_std, y_preds_rf_with_std, metrics_rf_with_std = model_preds(model_rf_with_std, X_train, y_train, \
                                                                                  X_test, y_test_real)
metrics_rf_with_std

In [None]:
# Стандартизируем данные
standard_scaler = StandardScaler()
X_std_train = X_train.copy()
X_std_train[list_name_features] = standard_scaler.fit_transform(X_std_train[list_name_features])

In [None]:
X_std_test = X_test.copy()
X_std_test[list_name_features] = standard_scaler.fit_transform(X_std_test[list_name_features])

In [None]:
%%time
model_rf_with_weight = RandomForestClassifier(random_state=21, class_weight={0.0 : 1, 1.0 : 10})
model_rf_with_weight.fit(X_std_train, y_train)
y_preds_rf_with_weight = model_rf_with_weight.predict(X_std_test)
f1_score(y_test_real, y_preds_rf_with_weight, average='macro')

#### При помощи Random Forest выявляем наиболее важные признаки

In [None]:
X_std_train.columns

In [None]:
df_feature_importance = pd.DataFrame({'feature': X_std_train.columns, 
                                      'importance': model_rf_with_weight.feature_importances_})

In [None]:
df_feature_importance.sort_values(by='importance', ascending=False).head(100)

In [None]:
X_std_train[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)]

Строим random forest для части признаков

In [None]:
%%time
model_rf_for_slice = RandomForestClassifier(random_state=21)
trained_model_rf_for_slice, y_preds_rf_for_slice, metrics_rf_for_slice = model_preds(model_rf_for_slice, 
                     X_std_train[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)], 
                     y_train, X_test[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)],
                     y_test_real)
metrics_rf_for_slice

Random forest для части признаков с переопределением весов целевой переменной

In [None]:
%%time
model_rf_for_slice_with_std = RandomForestClassifier(random_state=21, class_weight={0 : 1, 1 : 10})
trained_model_rf_for_slice_with_std, y_preds_rf_for_slice_with_std, metrics_rf_for_slice_with_std = model_preds(model_rf_for_slice_with_std, 
                     X_std_train[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)], 
                     y_train, X_test[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)], 
                     y_test_real)
metrics_rf_for_slice_with_std

In [None]:
# Функция обучения pipeline, формирования предсказаний и подсчета метрик
def pipeline_preds(pipeline, X_train, y_train, X_test, y_test_real):
    pipeline.fit(X_train, y_train)
    y_preds = pipeline.predict(X_test)
    metrics = f1_score(y_test_real, y_preds, average='macro')
    return pipeline, y_preds, metrics

In [None]:
X_train_slice = X_train[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)]
X_test_slice = X_test[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)]

In [None]:
# Добавим результаты вычисления метрик в общий датафрейм
metrics_rf_with_std_and_PCA = np.nan
metrics_rf_slice_with_std_and_PCA = np.nan
result_for_rf = ['Random Forest', metrics_rf, metrics_rf_with_std, metrics_rf_with_std_and_PCA, 
                metrics_rf_for_slice, metrics_rf_for_slice_with_std, metrics_rf_slice_with_std_and_PCA]
df_result_for_rf = pd.DataFrame([result_for_rf], columns=columns_name)
df_metrics_for_models = pd.concat([df_metrics_for_models, df_result_for_rf], axis=0, ignore_index=True)
df_metrics_for_models

#### Рассмотрим Gradient Boosting

In [None]:
%%time
pipeline_gb = Pipeline([('gradient_boosting', GradientBoostingClassifier(random_state=21))])
trained_model_gb, y_preds_gb, metrics_gb = pipeline_preds(pipeline_gb, X_train, y_train, X_test, y_test_real)
metrics_gb

Добавим стандартизацию данных в Gradient Boosting

In [None]:
%%time
pipeline_gb_with_std = Pipeline([('standard_scaler', StandardScaler()), 
                                ('gradient_boosting', GradientBoostingClassifier(random_state=21))])
trained_model_gb_with_std, y_preds_gb_with_std, metrics_gb_with_std = pipeline_preds(pipeline_gb_with_std, 
                                    X_train, y_train, X_test, y_test_real)
metrics_gb_with_std

In [None]:
# Рассмотрим Gradient Boosting для отобранной части признаков
%%time
pipeline_gb_slice = Pipeline([('gradient_boosting', GradientBoostingClassifier(random_state=21))])
trained_model_gb_slice, y_preds_gb_slice, metrics_gb_slice = pipeline_preds(pipeline_gb_slice, X_train_slice, 
                                                                            y_train, X_test_slice, y_test_real)
metrics_gb_slice

In [None]:
# Рассмотрим Gradient Boosting со стандартизацией данных для отобранной части признаков
%%time
pipeline_gb_slice_with_std = Pipeline([('standard_scaler', StandardScaler()), 
                                ('gradient_boosting', GradientBoostingClassifier(random_state=21))])
trained_model_gb_slice_with_std, y_preds_gb_slice_with_std, metrics_gb_slice_with_std = pipeline_preds(pipeline_gb_slice_with_std, 
                                    X_train_slice, y_train, X_test_slice, y_test_real)
metrics_gb_slice_with_std

In [None]:
metrics_gb_with_std_pca = np.nan
metrics_gb_slice_with_std_pca = np.nan
result_for_gb = ['Gradient Boosting', metrics_gb, metrics_gb_with_std, metrics_gb_with_std_pca, 
                metrics_gb_slice, metrics_gb_slice_with_std, metrics_gb_slice_with_std_pca]
df_result_for_gb = pd.DataFrame([result_for_gb], columns=columns_name)
df_metrics_for_models = pd.concat([df_metrics_for_models, df_result_for_gb], axis=0, ignore_index=True)
df_metrics_for_models

Наилучший результат дала модель GradientBoostingClassifier без стандартизации. Подберем гиперпараметры для этой модели

In [None]:
%%time
parameters = {'n_estimators': [50, 100, 200], 'max_depth': [1, 3, 5]}
model_gb = GradientBoostingClassifier(random_state=21)
f1 = make_scorer(f1_score, average='macro')
grid_search_for_gb = GridSearchCV(model_gb, parameters, scoring=f1)
grid_search_for_gb.fit(X_train, y_train)

In [None]:
grid_search_for_gb.best_params_

In [None]:
grid_search_for_gb.cv_results_

#### Финальное обучение модели, сохранение модели и выполнение предсказания

In [None]:
X_train_final = full_train.drop(columns=['target'])
y_train_final = full_train.target

In [None]:
del full_train

In [None]:
best_model = GradientBoostingClassifier(random_state=21, max_depth=1, n_estimators=100)

In [None]:
best_model.fit(X_train_final, y_train_final)

In [None]:
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [None]:
y_predict_final = best_model.predict(full_test)

In [None]:
full_test_result['target'] = y_predict_final

In [None]:
result = full_test_result[['id', 'vas_id', 'buy_time', 'target']].reset_index(drop=True)

In [None]:
result.to_csv('answers_test.csv')

In [None]:
with open('best_model.pkl', 'rb') as file: 
    pickle_model = pickle.load(file) 

In [None]:
pickle_model.predict(full_test)