In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from tqdm import tqdm

from sklearn.metrics import jaccard_score, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score

import optuna
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [35]:
def transform_to_features(df, name_type_features='begin'):
    
    # конструируем матрицу признаков
    stats = df.groupby(['flg_period','hash_id', 'lac']).count().reset_index()
    
    final_pivot = pd.pivot(
                      stats,
                      index=['flg_period', 'hash_id'],
                      columns=['lac'],
                      values=['ts']
                     ).reset_index()
    final_pivot.fillna(0, inplace=True)
    
    new_columns = ['flg_period']
    new_columns += [i + '_' + str(j) + '_'+ name_type_features for i, j in final_pivot.columns[1:]]
    
    name_features = [i + '_' + str(j) for i, j in final_pivot.columns[2:]]
    
    final_pivot.columns = new_columns
    final_pivot.drop(columns=['flg_period'], inplace=True)
    
    return final_pivot, name_features

### Транзакции по клиентам

In [182]:
df = pd.read_csv('01_data.csv', sep=';')
df.head(5)

Unnamed: 0,lac,cid,ts,fulldate,hash_id
0,7755,35950,1536391000.0,2018-09-13,1361396
1,7755,35950,1536391000.0,2018-09-14,1361396
2,7752,19203,1535967000.0,2018-09-04,1361396
3,7755,35950,1536391000.0,2018-09-09,1361396
4,7752,19203,1535967000.0,2018-09-05,1361396


In [183]:
df['datetime'] = pd.to_datetime(df['ts'], unit='s')
df['hour'] = df['datetime'].dt.hour


df['fulldate'] = df['fulldate'].astype('datetime64')
df['flg_period'] = df['fulldate'] < '2018-09-15'
df = df[df['fulldate'] != '2018-09-15']

### Размеченная выборка

In [184]:
df_res = pd.read_csv('02_etalon.csv', sep=';')
matching_hash_id_begin_month = df_res['id1'].unique()
matching_hash_id_end_month = df_res['id2'].unique()
df_res.head(5)

Unnamed: 0,id1,id2
0,1361396,2695335
1,1795864,2458905
2,1543059,2730453
3,1028066,2539971
4,1533076,2712514


### Статистика по выборкам

In [185]:
n_unique_users = df['hash_id'].nunique()
print(f'Кол-во уникальных хэшей : {n_unique_users}')

unique_users_begin_month = df[df['flg_period']]['hash_id'].unique()
n_unique_users_begin_month = len(unique_users_begin_month)
print(f'Кол-во уникальных хэшей в начале месяца: {n_unique_users_begin_month}')

unique_users_end_month = df[~df['flg_period']]['hash_id'].unique()
n_unique_users_end_month = len(unique_users_end_month)
print(f'Кол-во уникальных хэшей в конце месяца: {n_unique_users_end_month}')

Кол-во уникальных хэшей : 4542
Кол-во уникальных хэшей в начале месяца: 2248
Кол-во уникальных хэшей в конце месяца: 2294


In [10]:
n_unique_lac = df['lac'].nunique()
print(f'Кол-во уникальных lac : {n_unique_lac}')

n_unique_lac_begin_month = df[df['flg_period']]['lac'].nunique()
print(f'Кол-во уникальных lac в начале месяца: {n_unique_lac_begin_month}')

n_unique_lac_end_month = df[~df['flg_period']]['lac'].nunique()
print(f'Кол-во уникальных lac в конце месяца: {n_unique_lac_end_month}')

Кол-во уникальных lac : 364
Кол-во уникальных lac в начале месяца: 355
Кол-во уникальных lac в конце месяца: 340


### Выборка по фичам

In [142]:
final_pivot, features = transform_to_features(df)

In [150]:
# делаем негативное семплирование (добавляем отрицательных примеров)
dataset = df_res.copy()
negative_sample_size = 100 #кол-во негативных примеров на 1 позитивный

unique_hash_id = set(df_res['id1'].unique())
dataset_negative_examples = pd.DataFrame()
dct = {
       'id1' : [], 
       'id2' : []
      }
for index_ in range(len(dataset)):
    hash_id_begin, hash_id_end = dataset.loc[index_, 'id1'], dataset.loc[index_, 'id2']
    
    unique_hash_id_ = list(set(unique_hash_id) - set([hash_id_end]))
    negative_hash_id = np.random.choice(unique_hash_id_, size=negative_sample_size)
    
    dct['id1'].extend([hash_id_begin]*negative_sample_size)
    dct['id2'].extend(list(negative_hash_id))

In [151]:
%%time
# собираем две выборки вместе
dataset['target'] = 1
dataset_negative_examples = pd.DataFrame(data=dct)
dataset_negative_examples['target'] = 0

df_final = pd.concat([dataset, dataset_negative_examples])
ev = df_final['target'].mean()
print(f'Средний ev_rate : {ev:.4f}')

Средний ev_rate : 0.0099
CPU times: user 33 ms, sys: 160 µs, total: 33.2 ms
Wall time: 31.9 ms


In [152]:
%%time
final_pivot_begin, _ = transform_to_features(df, name_type_features='begin')
final_pivot_end, _ = transform_to_features(df, name_type_features='end')

df_final = pd.merge(df_final, final_pivot_begin,
                     how='left', left_on='id1', right_on='hash_id__begin')
# df_final.drop(columns=['hash_id'], inplace=True)

df_final = pd.merge(df_final, final_pivot_end,
                     how='left', left_on='id2', right_on='hash_id__end')

CPU times: user 2.45 s, sys: 1.15 s, total: 3.6 s
Wall time: 3.61 s


In [153]:
# добавляем дельты
for column in features:
    df_final[f'delta_{column}'] = df_final[f'{column}_begin'] - df_final[f'{column}_end']

### Модель
Задача - бинарная классификация. Используем подход, когда для пары ```id1-id2``` устанавливается метка 1, если матчинг верный и 0, если пары ```id1-id2``` сматчены неверно. Алгоритм - логистическая регрессия

In [156]:
%%time
X = df_final
# разделяем трейн и тест по id1
hash_id_ = X['id1'].unique()
test_hash_id = hash_id_[391:]

X_train, y_train = X[~X['id1'].isin(test_hash_id)], X[~X['id1'].isin(test_hash_id)]['target']
X_test, y_test = X[X['id1'].isin(test_hash_id)], X[X['id1'].isin(test_hash_id)]['target']

col_names = X_train.columns.tolist()
col_names.remove('id1')
col_names.remove('id2')
col_names.remove('target')
col_names.remove('hash_id__begin')
col_names.remove('hash_id__end')

clf = LogisticRegression(
#                         penalty='l1',
                        max_iter=10000,
                        class_weight='balanced',
#                          solver='saga',
#                          n_jobs=-1
                        )
clf.fit(X_train[col_names], y_train)
y_pred_proba_train = clf.predict_proba(X_train[col_names])[:, 1]
score_train = 2*roc_auc_score(y_train, y_pred_proba_train)-1
print('GINI score on train : {:.3f}'.format(score_train))

y_pred_proba_test = clf.predict_proba(X_test[col_names])[:, 1]
score_test = 2*roc_auc_score(y_test, y_pred_proba_test)-1
print('GINI score on test : {:.3f}'.format(score_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GINI score on train : 1.000
GINI score on test : 0.728
CPU times: user 39min 18s, sys: 23min 7s, total: 1h 2min 26s
Wall time: 4min 4s


In [157]:
# считаем метрику точности
X_test.loc[:, 'proba'] = y_pred_proba_test

X_test_ = X_test.sort_values(by=['id1', 'proba'], ascending=False)
presicion_1 = X_test_.drop_duplicates(subset=['id1'])['target'].mean()
print(f'Точность на тестовой выборке составила : {presicion_1}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Точность на тестовой выборке составила : 0.76


#### Подбор параметра регуляризации

In [159]:
base_params = {
    'max_iter' : 10000,
}
def objective(trial):
    global base_params
    
    params = {
            **base_params,
            'C' : trial.suggest_loguniform('C', 1e-5, 100),
    }
    clf = LogisticRegression(**params)
    
    n_splits=10
    cv = GroupKFold(n_splits=n_splits)
    groups_kfold = X_train['id1']
    scores = cross_val_score(clf, X=X_train[col_names],  y=y_train, 
                            groups=groups_kfold,
                            scoring='roc_auc', cv=cv, n_jobs=6
                             )
    
    score = 2*scores.mean() - 1
    return score

In [160]:
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction="maximize",
    sampler=sampler
)
study.optimize(objective, n_trials=20, timeout=3000, n_jobs=8)

[32m[I 2022-06-23 13:03:30,121][0m A new study created in memory with name: no-name-47d816dd-62c0-478f-9abe-0a0bd51ae214[0m
[32m[I 2022-06-23 13:46:49,316][0m Trial 1 finished with value: 0.7767862943786983 and parameters: {'C': 1.0031493002281022e-05}. Best is trial 1 with value: 0.7767862943786983.[0m
[32m[I 2022-06-23 13:49:39,123][0m Trial 6 finished with value: 0.7950740401051939 and parameters: {'C': 0.0001597919516734645}. Best is trial 6 with value: 0.7950740401051939.[0m
[32m[I 2022-06-23 13:57:08,710][0m Trial 3 finished with value: 0.7912055054240628 and parameters: {'C': 3.899167585210222e-05}. Best is trial 6 with value: 0.7950740401051939.[0m
[32m[I 2022-06-23 13:59:25,211][0m Trial 7 finished with value: 0.5227081328073635 and parameters: {'C': 54.56652370850856}. Best is trial 6 with value: 0.7950740401051939.[0m
[32m[I 2022-06-23 14:05:28,438][0m Trial 5 finished with value: 0.6136782380013148 and parameters: {'C': 1.7557221958888347}. Best is trial 6 

In [161]:
final_params = {**study.best_params, **base_params}
best_clf = LogisticRegression(**final_params)

best_clf.fit(X_train[col_names], y_train)
y_pred_proba_train = best_clf.predict_proba(X_train[col_names])[:, 1]
score_train = 2*roc_auc_score(y_train, y_pred_proba_train)-1
print('GINI score on train : {:.3f}'.format(score_train))

y_pred_proba_test = best_clf.predict_proba(X_test[col_names])[:, 1]
score_test = 2*roc_auc_score(y_test, y_pred_proba_test)-1
print('GINI score on test : {:.3f}'.format(score_test))

GINI score on train : 0.947
GINI score on test : 0.799


In [165]:
X_test.loc[:, 'proba'] = y_pred_proba_test

X_test_ = X_test.sort_values(by=['id1', 'proba'], ascending=False)
presicion_1 = X_test_.drop_duplicates(subset=['id1'])['target'].mean()
print(f'Точность на тестовой выборке составила : {presicion_1}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Точность на тестовой выборке составила : 0.73


In [167]:
coef = pd.DataFrame.from_dict({
                                'features' : col_names,
                                'coef_' : best_clf.coef_[0]
                            }
                            )

In [168]:
coef[(coef['coef_'] > 0.01) | (coef['coef_'] < -0.01)]

Unnamed: 0,features,coef_
102,ts_7742_begin,-0.012243
103,ts_7743_begin,-0.020298
110,ts_7752_begin,-0.010005
135,ts_7798_begin,0.010045
365,ts_5000_end,0.017893
...,...,...
1006,delta_ts_9749,0.010862
1009,delta_ts_9752,-0.119898
1014,delta_ts_9757,-0.042086
1024,delta_ts_9768,0.016232


### Скоринг

In [169]:
# создаем df с фичами
features_begin, _ = transform_to_features(df, name_type_features='begin')
features_end, _ = transform_to_features(df, name_type_features='end')

features_begin = features_begin[features_begin['hash_id__begin'].isin(unique_users_begin_month)]
features_end = features_end[features_end['hash_id__end'].isin(unique_users_end_month)]

In [170]:
# для каждого id1 (hash_id в первой части месяца) скорим все доступные id2 (hash_id во второй части месяца) 
# для матчинга выбираем id2 с наибольшей вероятностью
set_used_hash_id_end = set(matching_hash_id_end_month)
res_dct = {
           'id1' : [],
           'id2' : []
          }
users_to_matching = list(set(unique_users_begin_month) - set(matching_hash_id_begin_month))
print(f'Нужно сматчить {len(users_to_matching)} юзеров')
for hash_id_begin in tqdm(users_to_matching):
    features_begin_i = features_begin[features_begin['hash_id__begin'] == hash_id_begin]
    features_begin_i = pd.merge(
                                features_begin_i, 
                                features_end[~features_end['hash_id__end'].isin(set_used_hash_id_end)],
                                how='cross'
                               )

    for column in features:
        features_begin_i[f'delta_{column}'] = features_begin_i[f'{column}_begin'] - features_begin_i[f'{column}_end']
    
    # скорим всех сотрудников
    proba_i = clf.predict_proba(features_begin_i[col_names])[:, 1]
    hash_id_end = features_begin_i.loc[np.argmax(proba_i), 'hash_id__end']
    
    set_used_hash_id_end.add(hash_id_end)
    
    res_dct['id1'].append(hash_id_begin)
    res_dct['id2'].append(hash_id_end)

Нужно сматчить 1757 юзеров


100%|██████████| 1757/1757 [12:35<00:00,  2.32it/s]


In [171]:
df_res_model = pd.DataFrame(res_dct)
# res_df.to_csv('res.csv')

In [172]:
final_res = pd.concat([df_res, df_res_model])

assert len(final_res['id1'].unique()) == len(unique_users_begin_month)
assert len(final_res['id2'].unique()) == len(final_res)

final_res.to_csv('results.csv', index=False)

### Идеи:
 - нагенерить побольше фичей (поведение человека внутри дня, по дням недели);
 - покрутить разные варианты негативного семплирования (например, добавлять примеры, которые плохо различаются алгоритмом);
 - попробовать бустинг для задачи классификации;
 - попробовать KNN (используя, например, расстояние Жаккара);
 - перейти к задаче ранжирования -> отсеивая кандидатов более "легкими" методами (например, на основе колаборативной фильтрации);
 - попробовать сетки, которые будут рассматривать логи звонков как последовательность, а функцию потерь на основе ```triplet loss``` (например, сиамские сети)