In [1]:
%matplotlib inline

import gc
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns
from scipy.stats import mode

from catboost import CatBoostClassifier, Pool
from catboost import Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
import platform
from collections import Counter
from sklearn.metrics import recall_score, matthews_corrcoef
from lightgbm import LGBMClassifier, log_evaluation
from lightgbm import early_stopping

import matplotlib
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('display.max_columns', None)

# если у вас есть CUDA, то она понадобится там для экспериментов в catboost
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

task_type = 'GPU'
if platform.node() == 'VLAD2016':
    task_type = 'CPU'
    
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None

from warnings import simplefilter
simplefilter(action="ignore", category = pd.errors.PerformanceWarning)

DEBUG = False
target = 'blocked'

SEEDS = 3

In [2]:
dtrain = pd.read_csv('../input/train_dataset_train.csv')
dtrain.head()

Unnamed: 0,contract_id,blocked
0,7780,0
1,7785,0
2,7794,0
3,7795,0
4,7798,0


In [3]:
for u in sorted(dtrain[target].unique()):
    print(u, dtrain[dtrain[target] == u].shape[0])

0 5294
1 698


In [4]:
dtest = pd.read_csv('../input/sample_solution.csv')
dtest.head()

Unnamed: 0,contract_id,blocked
0,3453,0
1,3454,0
2,3455,0
3,3456,0
4,3457,0


In [5]:
named = pd.read_csv(
    "../input/named.csv", 
    parse_dates = ['date']
)

type_contract = pd.read_csv("../input/type_contract.csv")
log = pd.read_csv(
    "../input/log.csv",
    parse_dates = ['event_date']
)

In [6]:
named.head()

Unnamed: 0,date,url,contract_id
0,2021-04-17,webmail.sampo.ru,101397
1,2021-04-17,webmail.sampo.ru,179624
2,2021-04-17,tvip-provision.sampo.ru,190335
3,2021-04-17,tvip-provision.sampo.ru,61670
4,2021-04-17,tvip-provision.sampo.ru,39370


In [7]:
type_contract.head()

Unnamed: 0,contract_id,day_or_month_contract
0,7780,0
1,3996,0
2,7785,0
3,7794,0
4,7795,1


In [8]:
dtrain = pd.merge(
    dtrain, 
    type_contract, 
    on = "contract_id", 
    how = 'left').drop_duplicates(subset = "contract_id")

dtrain.head(3)

Unnamed: 0,contract_id,blocked,day_or_month_contract
0,7780,0,0.0
1,7785,0,0.0
2,7794,0,0.0


In [9]:
dtest = pd.merge(
    dtest, 
    type_contract, 
    on = "contract_id", 
    how = 'left')

dtest.head(3)

Unnamed: 0,contract_id,blocked,day_or_month_contract
0,3453,0,0
1,3454,0,1
2,3455,0,1


In [10]:
log.head()

Unnamed: 0,contract_id,event_date,event_type
0,36294,2021-03-28 16:24:30,Добавление в Обращались с номеров
1,36294,2021-03-28 16:27:41,Обращение в службу заботы о клиентах
2,36294,2021-03-28 16:29:56,Выключение IPTV-пакета
3,36294,2021-03-28 16:29:56,Включение IPTV-пакета
4,36294,2021-03-28 16:35:04,Обращение в службу заботы о клиентах


In [11]:
# log
for df in [dtrain, dtest]:
    df['log']  = ''
    df['log2'] = ''
    df['log3'] = ''
    df['log4'] = ''
    df['log5'] = ''
    df['log6'] = ''
    df['log7'] = ''
    df['log8'] = ''
    df['log9'] = ''
    df['log_mode'] = ''

for df in [dtrain, dtest]:
    for contract_id in tqdm.tqdm(df['contract_id'].unique()):
        temp = log[log['contract_id'] == contract_id]
        temp.sort_values(by = ['event_date'], ascending = False, inplace = True)
        temp.reset_index(drop = True, inplace = True)
        try:
            url = temp.loc[0, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log'] = url
        except:
            pass
        try:
            url = temp.loc[1, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log2'] = url
        except:
            pass
        try:
            url = temp.loc[2, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log3'] = url
        except:
            pass
        try:
            url = temp.loc[3, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log4'] = url
        except:
            pass
        try:
            url = temp.loc[4, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log5'] = url
        except:
            pass 
        try:
            url = temp.loc[5, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log6'] = url
        except:
            pass
        try:
            url = temp.loc[6, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log7'] = url
        except:
            pass
        try:
            url = temp.loc[7, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log8'] = url
        except:
            pass
        try:
            url = temp.loc[8, 'event_type']
            df.loc[df['contract_id'] == contract_id, 'log9'] = url
        except:
            pass
        try:
            temp = temp[temp['event_type'] == temp['event_type']]
            url = temp['event_type'].mode()[0]
            df.loc[df['contract_id'] == contract_id, 'log_mode'] = url
        except:
            pass

100%|██████████████████████████████████████████| 5992/5992 [00:07<00:00, 799.69it/s]
100%|████████████████████████████████████████████| 810/810 [00:00<00:00, 813.95it/s]


In [12]:
dtrain.head(3)

Unnamed: 0,contract_id,blocked,day_or_month_contract,log,log2,log3,log4,log5,log6,log7,log8,log9,log_mode
0,7780,0,0.0,Перенос денежных средств,Смена схемы оплаты,Смена схемы оплаты,Смена схемы оплаты,Обращение в службу заботы о клиентах,Обращение в службу заботы о клиентах,Смена схемы оплаты,Обращение в службу заботы о клиентах,Отключение услуги Автоплатёж,Смена схемы оплаты
1,7785,0,0.0,Реестр договоров,,,,,,,,,Реестр договоров
2,7794,0,0.0,,,,,,,,,,


In [13]:
# url
for df in [dtrain, dtest]:
    df['url']  = ''
    df['url2'] = ''
    df['url3'] = ''
    df['url4'] = ''
    df['url5'] = ''
    df['url6'] = ''
    df['url7'] = ''
    df['url_mode'] = ''

for df in [dtrain, dtest]:
    for contract_id in tqdm.tqdm(df['contract_id'].unique()):
        temp = named[named['contract_id'] == contract_id]
        temp.sort_values(by = ['date'], ascending = False, inplace = True)
        temp.reset_index(drop = True, inplace = True)
        try:
            url = temp.loc[0, 'url']
            df.loc[df['contract_id'] == contract_id, 'url'] = url
        except:
            pass
        try:
            url = temp.loc[1, 'url']
            df.loc[df['contract_id'] == contract_id, 'url2'] = url
        except:
            pass
        try:
            url = temp.loc[2, 'url']
            df.loc[df['contract_id'] == contract_id, 'url3'] = url
        except:
            pass
        try:
            url = temp.loc[3, 'url']
            df.loc[df['contract_id'] == contract_id, 'url4'] = url
        except:
            pass
        try:
            url = temp.loc[4, 'url']
            df.loc[df['contract_id'] == contract_id, 'url5'] = url
        except:
            pass
        try:
            url = temp.loc[5, 'url']
            df.loc[df['contract_id'] == contract_id, 'url6'] = url
        except:
            pass
        try:
            url = temp.loc[6, 'url']
            df.loc[df['contract_id'] == contract_id, 'url7'] = url
        except:
            pass
        try:
            temp = temp[temp['url'] == temp['url']]
            url = temp['url'].mode()[0]
            df.loc[df['contract_id'] == contract_id, 'url_mode'] = url
        except:
            pass

100%|██████████████████████████████████████████| 5992/5992 [00:44<00:00, 135.57it/s]
100%|████████████████████████████████████████████| 810/810 [00:05<00:00, 139.02it/s]


In [14]:
dtrain.head(3)

Unnamed: 0,contract_id,blocked,day_or_month_contract,log,log2,log3,log4,log5,log6,log7,log8,log9,log_mode,url,url2,url3,url4,url5,url6,url7,url_mode
0,7780,0,0.0,Перенос денежных средств,Смена схемы оплаты,Смена схемы оплаты,Смена схемы оплаты,Обращение в службу заботы о клиентах,Обращение в службу заботы о клиентах,Смена схемы оплаты,Обращение в службу заботы о клиентах,Отключение услуги Автоплатёж,Смена схемы оплаты,start.sampo.ru,pay.sampo.ru,vitrina.sampo.ru,ice.sampo.ru,wifi.sampo.ru,swap.sampo.ru,rupor.sampo.ru,webmail.sampo.ru
1,7785,0,0.0,Реестр договоров,,,,,,,,,Реестр договоров,fnc.rt.ru,fnc.rt.ru,fnc.rt.ru,static01.rupor.sampo.ru,static01.rupor.sampo.ru,fdb00.sampo.ru,profile.sampo.ru,fnc.rt.ru
2,7794,0,0.0,,,,,,,,,,,,,,,,,,


In [15]:
vectorizer = TfidfVectorizer(
    lowercase = True, 
    preprocessor = None, 
    tokenizer = None, 
    analyzer = 'char', 
    stop_words = None, 
    token_pattern = None, 
    ngram_range = (3, 5), 
    max_df = 1.0, 
    min_df = 1, 
    max_features = 500000,
)
train = vectorizer.fit_transform(dtrain['url'].values)
test  = vectorizer.transform(dtest['url'].values)
#vectorizer.get_feature_names_out()
train.shape

(5992, 3035)

In [16]:
for i in range(train.shape[1]):
    dtrain['v_' + str(i)] = train[:, i].toarray()
    dtest['v_' + str(i)] = test[:, i].toarray()

In [17]:
vectorizer = TfidfVectorizer(
    lowercase = True, 
    preprocessor = None, 
    tokenizer = None, 
    analyzer = 'char', 
    stop_words = None, 
    token_pattern = None, 
    ngram_range = (3, 5), 
    max_df = 1.0, 
    min_df = 1, 
    max_features = 500000,
)
train = vectorizer.fit_transform(dtrain['log'].values)
test  = vectorizer.transform(dtest['log'].values)
#vectorizer.get_feature_names_out()
train.shape

(5992, 2755)

In [18]:
for i in range(train.shape[1]):
    dtrain['v2_' + str(i)] = train[:, i].toarray()
    dtest['v2_' + str(i)] = test[:, i].toarray()

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

enc = OneHotEncoder(handle_unknown = 'ignore')
le  = LabelEncoder()
L = dtrain.shape[0]
categorical_columns = [
    'log', 'log2', 'log3', 'log4', 'log5', 'log6', 'log7', 'log8', 'log9',
    'log_mode', 
    'url', 'url2', 'url3', 'url4', 'url5', 'url6', 'url7', 
    'url_mode'
]

for u in categorical_columns:
    print(u, len(dtrain[u].unique()))
    
    if len(dtrain[u].unique()) < 1:
        df = pd.DataFrame(pd.concat([dtrain[u], dtest[u]], axis = 0))
        df[u] = df[u].fillna('none')
        df[u] = df[u].astype(str)
        df[u] = df[u] + '_' + u
        
        temp = enc.fit_transform(df[u].values.reshape(-1, 1)).toarray()
        for i in range(temp.shape[1]):
            n = u + '_' + str(i)
            dtrain[n] = temp[:, i][:L]
            dtest[n] = temp[:, i][L:]
            if n not in use:
                use.append(n)        
        if u in use:
            use.remove(u)       
    
        dtrain.drop(u, axis = 1, inplace = True)
        dtest.drop(u, axis = 1, inplace = True)
    
    else:
        df = pd.DataFrame(pd.concat([dtrain[u], dtest[u]], axis = 0))
        df[u] = df[u].fillna('none')
        df[u] = df[u].astype(str)
        le.fit(df[u].values.ravel())
        temp = le.transform(df[u].values.ravel())
        dtrain[u] = temp[:L]
        dtest[u]  = temp[L:]        
    
gc.collect()

log 73
log2 74
log3 76
log4 71
log5 72
log6 68
log7 65
log8 68
log9 61
log_mode 53
url 183
url2 180
url3 160
url4 160
url5 157
url6 149
url7 147
url_mode 180


0

In [20]:
use = [f for f in dtrain.columns if f not in ['contract_id', target]]
len(use)

5809

In [21]:
from sklearn.feature_selection import mutual_info_classif

trg  = dtrain[[target]]

for u in tqdm.tqdm(use):
    
    temp = dtrain[[u]]
    temp = temp.fillna(0)
    mif = mutual_info_classif(temp, trg, n_neighbors = 5, copy = True, random_state = 0)[0]
    
    if mif == 0:
        use.remove(u)

 69%|█████████████████████████████▋             | 4017/5809 [00:58<00:26, 68.58it/s]


In [22]:
print(dtrain[use].shape)
dtypes = dtrain.dtypes

dtrain['hash'] = dtrain[use].apply(lambda x: hash(tuple(x)), axis = 1)
temp = pd.DataFrame(columns = dtrain.columns)

for h in tqdm.tqdm(dtrain['hash'].unique()):
    temp2 = dtrain[dtrain['hash'] == h]
    
    if temp2.shape[0] > 1:
        dtrain = dtrain[dtrain['hash'] != h]
    
    if temp2.shape[0] > 3:
        stats = Counter(temp2[target])
        m = max(stats, key = stats.get)
        temp2 = temp2.head(1)
        temp2[target] = m
        temp = pd.concat([temp, temp2], ignore_index = True)
        
        
dtrain.reset_index(drop = True, inplace = True)
temp.reset_index(drop = True, inplace = True)
del dtrain['hash']
del temp['hash']

dtrain = pd.concat([dtrain, temp], ignore_index = True)
dtrain.reset_index(drop = True, inplace = True)
del temp, temp2; gc.collect()

#dtrain.dtypes = dtypes
print(dtrain[use].shape)

(5992, 5811)


100%|██████████████████████████████████████████| 3571/3571 [00:10<00:00, 340.68it/s]

(3417, 5811)





In [23]:
params = {
        'max_depth': 5, 
        'num_leaves': 31, 
        'learning_rate': 0.1, 
        'reg_alpha': 0.1, 
        'reg_lambda': 0.1, 
        'n_estimators': 10000, 
        'subsample': 0.99, 
        'subsample_freq': 5, 
        'colsample_bytree': 0.99, 
        'random_state': 0, 
        'verbose': -1, 
        #'metric': 'custom',
}

In [24]:
X_train, X_val, = train_test_split(
    dtrain,
    test_size = 0.1, 
    random_state = 0
)
X_train.shape, X_val.shape

((3075, 5811), (342, 5811))

In [35]:
def lgb_mcc(y_true, y_pred):
    y_pred_proba = y_pred.reshape(1, -1)
    
    best = 0
    k = 0
    
    for i in np.linspace(0.01, 0.99, 2500):
        preds = np.where(y_pred_proba > i, 1, 0)
        sc = matthews_corrcoef(X_val[target].astype(int), preds.ravel())
        if sc > best:
            best = sc
            k = i
    y_pred = np.where(y_pred_proba > k, 1, 0)  
    
    mcc = matthews_corrcoef(y_true, y_pred.ravel())
    return 'metric', mcc, True


models = []
params['metric'] = 'custom'

for seed in range(SEEDS):
    params['random_state'] = seed
    model = LGBMClassifier(**params)
    model.fit(
        X_train[use].values,
        X_train[target].astype(int),
        eval_set = (X_val[use].values, X_val[target].astype(int)),
        callbacks = [
            early_stopping(100), 
            log_evaluation(100)
        ],
        eval_metric = lgb_mcc,
    )
    models.append(model)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's metric: 0.408675
Early stopping, best iteration is:
[98]	valid_0's metric: 0.408675
Training until validation scores don't improve for 100 rounds
[100]	valid_0's metric: 0.353462
Early stopping, best iteration is:
[8]	valid_0's metric: 0.382724
Training until validation scores don't improve for 100 rounds
[100]	valid_0's metric: 0.386067
[200]	valid_0's metric: 0.35618
Early stopping, best iteration is:
[114]	valid_0's metric: 0.408675


In [38]:
trg = []

for model in models:
    y_pred_proba = model.predict_proba(X_val[use].values)[:, 1]
    
    best = 0
    k = 0
    for i in np.linspace(0.01, 0.99, 2500):
        preds = np.where(y_pred_proba > i, 1, 0)
        sc = recall_score(X_val[target].astype(int), preds.ravel(), average = 'macro')
        if sc > best:
            best = sc
            k = i
            
    y_pred = np.where(y_pred_proba > k, 1, 0)
    
    recall = recall_score(X_val[target].astype(int), y_pred.ravel(), average = 'macro')
    print(np.round(recall, 6), np.round(k, 6))
    
    y_pred_proba = model.predict_proba(dtest[use].values)[:, 1]
    y_pred = np.where(y_pred_proba > k, 1, 0)
    trg.append(y_pred)

0.739832 0.084629
0.734471 0.086593
0.755428 0.108196


In [39]:
dtest[target] = mode(trg)[0][0]
dtest[target] = dtest[target].astype(int)

dtest[['contract_id', 'blocked']].to_csv('final.csv', index  = False)
dtest[['contract_id', 'blocked']].head()

Unnamed: 0,contract_id,blocked
0,3453,0
1,3454,1
2,3455,0
3,3456,1
4,3457,1


https://lk.hacks-ai.ru/758288/champ