## Выберем данные для теста демки

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from notebooks.helpers import explore_data_modern, load_latest_params
from catboost import Pool,CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

Настройки отображения

In [2]:
# Notebook-wide display configuration: render up to 50 DataFrame columns
# and draw matplotlib figures with the 'ggplot' style sheet.
pd.options.display.max_columns = 50
plt.style.use('ggplot')

Загрузка данных

In [3]:
# Read the cross-validation frame from parquet into CHOICE.
# On failure the error is still reported in the same format, but it is
# re-raised: every later cell needs CHOICE, so continuing after a failed load
# would only surface as an unrelated NameError further down.
try:
    CHOICE = pd.read_parquet('../data/datasets/cross.parquet')
except Exception as e:
    print(f"❌ Ошибка: {e}")
    raise
else:
    # Only reached when the read succeeded, so CHOICE is guaranteed to exist.
    print("✅ Данные загружены!")
    print(f"Cross validation frame: {CHOICE.shape[0]} строк")

✅ Данные загружены!
Cross validation frame: 1385812 строк


In [4]:
# Build the per-column overview of the full frame with the project helper,
# then render it with the notebook's rich display.
full_summary = explore_data_modern(CHOICE, 'Choice')
display(full_summary)


🔍 Анализ датафрейма: Choice


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
client_id,object,1091884,0,0.0%,1338357431.1640964866
device_screen_resolution,object,4414,0,0.0%,1792x1120
brand_tier,object,3,0,0.0%,other
is_returning,int32,2,0,0.0%,0
visit_time_minutes,int64,1440,0,0.0%,1114
has_utm_keyword,int32,2,0,0.0%,1
utm_keyword_campaign_fill,object,1164,0,0.0%,qUcotcWimEOQiboVPcCx
utm_keyword_notset_fill,object,1143,0,0.0%,qUcotcWimEOQiboVPcCx
target,int32,2,0,0.0%,0
geo_city,object,2201,0,0.0%,Tula


In [8]:
# Narrow the full frame to six feature columns plus the `target` label.
choice = CHOICE.loc[
    :,
    [
        'utm_source',
        'utm_medium',
        'device_brand',
        'visit_number',
        'utm_campaign',
        'utm_keyword',
        'target',
    ],
]

In [9]:
# Same per-column overview as for the full frame, now for the reduced
# column subset held in `choice`.
subset_summary = explore_data_modern(choice, 'Choice')
display(subset_summary)


🔍 Анализ датафрейма: Choice


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
utm_source,object,263,0,0.0%,fgymSoTvjKPEgaIJqsiH
utm_medium,object,50,0,0.0%,cpm
device_brand,object,194,0,0.0%,unknown
visit_number,int64,502,0,0.0%,1
utm_campaign,object,400,0,0.0%,FTjNLDyTrXaWYgZymFkV
utm_keyword,object,1143,0,0.0%,qUcotcWimEOQiboVPcCx
target,int32,2,0,0.0%,0


In [10]:
# Partition the subset by label: `yes` holds rows with target == 1,
# `no` holds rows with target == 0.
yes = choice.loc[choice['target'].eq(1)]
no = choice.loc[choice['target'].eq(0)]

In [11]:
# Display the rows whose target equals 1 (bare expression -> rich cell output).
yes

Unnamed: 0,utm_source,utm_medium,device_brand,visit_number,utm_campaign,utm_keyword,target
19,ZpYIoDJMcFzVoPFsHGJL,banner,Huawei,1,LEoPHuyFvzoNfnzGgfcd,puhZPIYqKXeFPaUviSjo,1
35,ZpYIoDJMcFzVoPFsHGJL,banner,Xiaomi,1,SgIUDYUKnyWHVowUOqid,unknown,1
83,kjsLglQLzykiRbcDiGcD,cpc,Apple,3,unknown,unknown,1
111,bByPQxmDaMXgpHeypKSM,referral,,1,LTuZkdKfxRGVceoWkVyg,unknown,1
112,kjsLglQLzykiRbcDiGcD,cpc,Asus,1,bJJuEXRheRIxXEaYIXqM,unknown,1
...,...,...,...,...,...,...,...
1385585,QxAxdyPLuQMEcrdZWdWb,cpc,Samsung,1,IWLzPAfDfwypTzMxtiQE,JTBldRAXvttfVmCNgppl,1
1385611,ZpYIoDJMcFzVoPFsHGJL,banner,Huawei,1,LEoPHuyFvzoNfnzGgfcd,puhZPIYqKXeFPaUviSjo,1
1385652,ZpYIoDJMcFzVoPFsHGJL,banner,Apple,1,WiILFRDQbcHDHNvHzGpX,unknown,1
1385684,ZpYIoDJMcFzVoPFsHGJL,banner,Huawei,1,LEoPHuyFvzoNfnzGgfcd,puhZPIYqKXeFPaUviSjo,1


In [13]:
# Print the first five target == 1 rows, without the label column, as
# indented JSON records. Taking head() before drop() yields the same rows
# while avoiding a copy of the whole frame.
print(
    yes.head(5)
    .drop(columns='target')
    .to_json(orient='records', indent=2)
)

[
  {
    "utm_source":"ZpYIoDJMcFzVoPFsHGJL",
    "utm_medium":"banner",
    "device_brand":"Huawei",
    "visit_number":1,
    "utm_campaign":"LEoPHuyFvzoNfnzGgfcd",
    "utm_keyword":"puhZPIYqKXeFPaUviSjo"
  },
  {
    "utm_source":"ZpYIoDJMcFzVoPFsHGJL",
    "utm_medium":"banner",
    "device_brand":"Xiaomi",
    "visit_number":1,
    "utm_campaign":"SgIUDYUKnyWHVowUOqid",
    "utm_keyword":"unknown"
  },
  {
    "utm_source":"kjsLglQLzykiRbcDiGcD",
    "utm_medium":"cpc",
    "device_brand":"Apple",
    "visit_number":3,
    "utm_campaign":"unknown",
    "utm_keyword":"unknown"
  },
  {
    "utm_source":"bByPQxmDaMXgpHeypKSM",
    "utm_medium":"referral",
    "device_brand":"",
    "visit_number":1,
    "utm_campaign":"LTuZkdKfxRGVceoWkVyg",
    "utm_keyword":"unknown"
  },
  {
    "utm_source":"kjsLglQLzykiRbcDiGcD",
    "utm_medium":"cpc",
    "device_brand":"Asus",
    "visit_number":1,
    "utm_campaign":"bJJuEXRheRIxXEaYIXqM",
    "utm_keyword":"unknown"
  }
]


In [15]:
# Print the first five target == 0 rows, without the label column, as
# indented JSON records. Taking head() before drop() yields the same rows
# while avoiding a copy of the whole frame.
print(
    no.head(5)
    .drop(columns='target')
    .to_json(orient='records', indent=2)
)

[
  {
    "utm_source":"fgymSoTvjKPEgaIJqsiH",
    "utm_medium":"cpm",
    "device_brand":"unknown",
    "visit_number":1,
    "utm_campaign":"FTjNLDyTrXaWYgZymFkV",
    "utm_keyword":"qUcotcWimEOQiboVPcCx"
  },
  {
    "utm_source":"fDLlAcSmythWSCVMvqvL",
    "utm_medium":"(none)",
    "device_brand":"Apple",
    "visit_number":2,
    "utm_campaign":"LTuZkdKfxRGVceoWkVyg",
    "utm_keyword":"unknown"
  },
  {
    "utm_source":"fDLlAcSmythWSCVMvqvL",
    "utm_medium":"(none)",
    "device_brand":"",
    "visit_number":1,
    "utm_campaign":"LTuZkdKfxRGVceoWkVyg",
    "utm_keyword":"unknown"
  },
  {
    "utm_source":"fDLlAcSmythWSCVMvqvL",
    "utm_medium":"(none)",
    "device_brand":"Xiaomi",
    "visit_number":1,
    "utm_campaign":"LTuZkdKfxRGVceoWkVyg",
    "utm_keyword":"unknown"
  },
  {
    "utm_source":"fDLlAcSmythWSCVMvqvL",
    "utm_medium":"(none)",
    "device_brand":"",
    "visit_number":1,
    "utm_campaign":"LTuZkdKfxRGVceoWkVyg",
    "utm_keyword":"unknown"
  }
]


In [23]:
# Feature columns whose distinct values are listed below.
features = [
    'utm_source',
    'utm_medium',
    'device_brand',
    'visit_number',
    'utm_campaign',
    'utm_keyword',
]
# Map each feature name to the list of its distinct values in the full frame
# (in order of first appearance, as returned by Series.unique) and print it.
print(dict((name, CHOICE[name].unique().tolist()) for name in features))

{'utm_source': ['fgymSoTvjKPEgaIJqsiH', 'fDLlAcSmythWSCVMvqvL', 'ZpYIoDJMcFzVoPFsHGJL', 'MvfHsxITijuriZxsqZqt', 'kjsLglQLzykiRbcDiGcD', 'bByPQxmDaMXgpHeypKSM', 'fbFKcMumlScApQMqFIqp', 'BHcvLfOaCWvWTykYqHVe', 'QxAxdyPLuQMEcrdZWdWb', 'hTjLvqNxGggkGnxSCaTm', 'PlbkrSYoHuZBWfYjYnfw', 'ghoaGAksqhKomdFrxgyJ', 'eLzNJHzPelJpEyBwMrKo', 'geDcueAOghDzHkGMmdOq', 'jaSOmLICuBzCFqHfBdRg', 'RmEBuqrriAfAVsLQQmhk', 'zwpKjjsMoRVCdipntaHt', 'aXQzDWsJuGXeBXexNHjc', 'SzZERoLMmrEUEhDaYcyN', 'qVXuCoVQtPxcUkAXiXBa', 'nmfptFmSirEqNzAzqbXA', 'TTtiRKFZIaQpIWggfCoF', 'NwLFDlNWnYxuLZEAZppl', 'gDBGzjFKYabGgSPZvrDH', 'IZEXUFLARCUMynmHNBGo', 'juYouSPHPfKdXivEPJjt', 'iNFgfQPqHPBuvGCYtrQE', 'nSReTmyFtbSjlPrTKoaX', 'oCqKpnSZJeYOVZTgTmKR', 'gVRrcxiDQubJiljoTbGm', 'NwuIyBhuPCXhJVPLtXCC', 'TxKUcPpthBDPieTGmVhx', 'FTAuYVNoYYxgvKMpKSLW', 'KgicpPxiEQfzPlPwQZJq', 'vFcAhRxLfOWKhvxjELkx', 'GpAkIXsclxDGyILfNlrR', 'oZCzWSykfixnjMPDNjSU', 'ISrKoXQCxqqYvAZICvjs', 'BKeImrJuRDZcHiSSTdzm', 'maiZOsuEAMdeoRVsYoFk', 'RxecHElWobBxIeAkqFXV', 

In [ ]:
# Drop the notebook's references to the loaded frame and its derived subsets
# so the kernel can reclaim their memory.
del CHOICE
del choice
del yes
del no