In [1]:
import pandas as pd
import numpy as np
import catboost as catb 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sc import Score_model_dev, get_input
import holidays
from datetime import date
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.simplefilter("ignore")

In [2]:
# Load the competition tables from the local ./date directory.
sample = pd.read_csv('./date/sample_submission.csv')
y_train = pd.read_csv('./date/y_train.csv')
X_train = pd.read_csv('./date/X_train.csv')
X_valid = pd.read_csv('./date/X_valid.csv')

In [3]:
def processing(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare a raw feature table for modelling.

    The request timestamp is parsed and expanded into calendar features, and
    the identifier columns are cast to integers. Missing ``City_ID`` values
    are replaced with 0 before the cast.

    The input frame is left untouched: a new DataFrame is returned, so the
    function can be re-applied to the same raw data without side effects.
    Note that this holds a second copy of the table in memory while the
    caller still references the original.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``DBM_Request_Time``, ``City_ID``, ``DBM_Creative_ID``,
        ``DBM_Operating_System_ID``, ``DBM_Browser_Platform_ID`` and
        ``DBM_Device_Type``. Any other columns are passed through unchanged.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``DBM_Request_Time`` converted to datetime, the
        identifier columns cast to ``int``, and the new columns
        ``dayofweek``, ``month``, ``day`` and ``hour`` appended in that order.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If an identifier column other than ``City_ID`` contains values that
        cannot be cast to ``int`` (for example NaN).
    """
    request_time = pd.to_datetime(df['DBM_Request_Time'])

    # Identifier columns that are cast as-is; City_ID is handled separately
    # because it is the only one that needs its gaps filled first.
    int_columns = [
        'DBM_Creative_ID',
        'DBM_Operating_System_ID',
        'DBM_Browser_Platform_ID',
        'DBM_Device_Type',
    ]

    # assign() works on a copy, so the caller's frame is never modified.
    return df.assign(
        DBM_Request_Time=request_time,
        dayofweek=request_time.dt.dayofweek,
        month=request_time.dt.month,
        day=request_time.dt.day,
        hour=request_time.dt.hour,
        City_ID=df['City_ID'].fillna(0).astype(int),
        **{column: df[column].astype(int) for column in int_columns},
    )

In [4]:
# Number of fully duplicated rows (every occurrence after the first one).
duplicates = int(X_train.duplicated().sum())
print(f'Number of repeated lines {duplicates}')

Number of repeated lines 0


In [5]:
# Missing values per column in the training features.
X_train.isna().sum() 

ID                               0
Country_Code                     0
City_ID                    1188164
DBM_Request_Time                 0
DBM_Creative_ID                  0
DBM_Language                     0
DBM_Operating_System_ID          0
DBM_Browser_Platform_ID          0
DBM_Device_Type                  0
DBM_Mobile_Model_ID              0
domen                            0
dtype: int64

In [6]:
# Missing values per column in the validation features.
X_valid.isna().sum() 

ID                              0
Country_Code                    0
City_ID                    791289
DBM_Request_Time                0
DBM_Creative_ID                 0
DBM_Language                    0
DBM_Operating_System_ID         0
DBM_Browser_Platform_ID         0
DBM_Device_Type                 0
DBM_Mobile_Model_ID             0
domen                           0
dtype: int64

In [7]:
# Class balance of the target; the counts below show it is heavily skewed towards 0.
y_train['TARGET'].value_counts()

0    9962580
1      35150
Name: TARGET, dtype: int64

In [8]:
# Run the same preprocessing on both feature tables so they share one schema.
X_train = processing(X_train)
X_valid = processing(X_valid)

In [9]:
# Balance the classes by undersampling: keep every positive row and draw an
# equally sized random subset of the negatives. The draw is seeded (21, the
# same seed the classifiers in this notebook use) so that Restart & Run All
# reproduces the same training set.
minority = y_train[y_train["TARGET"] == 1]
majority = y_train[y_train["TARGET"] == 0].sample(n=len(minority), random_state=21)

In [10]:
# Stack the two equally sized class subsets and shuffle them with a fixed seed
# so the row order (and therefore any downstream fold split) is identical on
# every run. NOTE: this overwrites y_train with the balanced subset.
y_train = pd.concat([minority, majority])
y_train = shuffle(y_train, random_state=21).reset_index(drop=True)

In [11]:
# Attach the features to the labels by ID, then remove the row identifier and
# the raw timestamp from both the training and the scoring frame.
unused_columns = ['ID', 'DBM_Request_Time']
train = pd.merge(y_train, X_train, how="left", on="ID").drop(columns=unused_columns)
test = X_valid.drop(columns=unused_columns)

In [12]:
# Preview the assembled training frame (target plus features).
train.head(3)

Unnamed: 0,TARGET,Country_Code,City_ID,DBM_Creative_ID,DBM_Language,DBM_Operating_System_ID,DBM_Browser_Platform_ID,DBM_Device_Type,DBM_Mobile_Model_ID,domen,dayofweek,month,day,hour
0,1,RU,29356,215163790,RU,1175,3,2,7016.0,mbapp_url,4,12,27,3
1,1,RU,29364,215162880,Other,1181,3,2,6459.0,other,2,12,25,8
2,1,RU,29349,215162880,RU,1205,3,2,7978.0,other,0,12,23,22


In [13]:
# Model setup
cat_columns = [
    'Country_Code',
    'City_ID',
    'DBM_Creative_ID',
    'DBM_Language',
    'DBM_Operating_System_ID',
    'DBM_Browser_Platform_ID',
    'DBM_Device_Type',
    'DBM_Mobile_Model_ID',
    'domen',
    'month',
    'day',
    'hour',
    'dayofweek',
]

# Settings shared by all three classifiers; every feature column is declared
# categorical.
shared_params = dict(
    eval_metric='AUC',
    silent=True,
    iterations=1000,
    random_state=21,
    early_stopping_rounds=200,
    cat_features=cat_columns,
)

# No explicit grow_policy (library default), shallow trees.
catbm = catb.CatBoostClassifier(depth=3, **shared_params)

# Depthwise growth with a minimum leaf size.
catbm_xgb = catb.CatBoostClassifier(
    grow_policy='Depthwise',
    depth=3,
    min_data_in_leaf=500,
    **shared_params,
)

# Lossguide growth capped by leaf count, with a minimum leaf size.
catbm_lgb = catb.CatBoostClassifier(
    grow_policy='Lossguide',
    max_leaves=100,
    min_data_in_leaf=400,
    **shared_params,
)

In [14]:
TARGET_NAME = 'TARGET'

# Registry of the candidate models; the keys reappear as row labels in the
# fold report and as column names in the prediction output.
models = dict(
    catbm=catbm,
    catbm_lgb=catbm_lgb,
    catbm_xgb=catbm_xgb,
)

In [15]:
# Scoring wrapper from the local `sc` module (implementation not shown here).
# KFold=10 presumably sets the number of cross-validation folds — confirm in sc.
scd = Score_model_dev(
    df=train,
    TARGET_NAME=TARGET_NAME,
    KFold=10,
    models=models,
)

In [16]:
# Run the wrapper's fitting step; the output below is its per-fold metrics
# report for each registered model.
scd.fit()


FOLD 1 REPORT
[4mModel                                                                     f1 score    Recall  Precission   ROC AUC      Gini[0m
catbm                                                                       0.6146    0.5911        0.64    0.6296    0.2592
catbm_lgb                                                                   0.6123    0.5931      0.6327    0.6247    0.2494
catbm_xgb                                                                   0.6107    0.5931      0.6293    0.6222    0.2444

FOLD 2 REPORT
[4mModel                                                                     f1 score    Recall  Precission   ROC AUC      Gini[0m
catbm                                                                       0.6163    0.5927      0.6418    0.6309    0.2618
catbm_lgb                                                                   0.6226    0.6041      0.6423    0.6337    0.2674
catbm_xgb                                                                   0.6

#### Gini = 2 * ROC_AUC - 1

In [17]:
# Score the validation features. Judging by the preview below, the result is
# the input frame with one score column per model appended.
res = scd.predict(test)

In [18]:
# Preview the per-model scores next to the features they were computed from.
res.head(3)

Unnamed: 0,Country_Code,City_ID,DBM_Creative_ID,DBM_Language,DBM_Operating_System_ID,DBM_Browser_Platform_ID,DBM_Device_Type,DBM_Mobile_Model_ID,domen,dayofweek,month,day,hour,catbm,catbm_lgb,catbm_xgb
0,RU,29378,215162880,RU,1175,3,2,7016.0,kp.ru,1,12,24,23,0.415448,0.406832,0.415179
1,RU,0,215163790,RU,1175,3,2,6450.0,expert.ru,4,12,27,4,0.67167,0.709363,0.702242
2,RU,29459,215162880,RU,1181,3,2,6459.0,drive2.ru,0,12,30,22,0.394591,0.407058,0.408807


In [19]:
# Final score: unweighted mean of the three model outputs.
model_columns = ['catbm', 'catbm_lgb', 'catbm_xgb']
res['TARGET'] = sum(res[column] for column in model_columns) / len(model_columns)

In [20]:
# Pair each validation ID with its ensemble score. The two frames are matched
# on their row index, so this relies on `res` keeping the index of X_valid.
result = X_valid[['ID']].join(res[['TARGET']], how="left")

In [21]:
# Preview the submission table (ID and ensemble score).
result.head(3)

Unnamed: 0,ID,TARGET
0,10757267,0.412486
1,6100469,0.694425
2,3054264,0.403485


In [22]:
# Write the submission file next to the input data, without the row index.
result.to_csv('./date/result.csv', index=False)