# 📊 EDA для «СберАвтоподписки» — Первичный анализ данных
**Что делаем:**
1. Смотрим на структуру данных.
2. Проверяем пропуски и аномалии.
3. Анализируем целевые события (конверсии).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

Настройки отображения

In [2]:
pd.set_option('display.max_columns', 50)
plt.style.use('ggplot') 

Загрузка данных

In [3]:
# Load the raw session and hit tables.
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
try:
    sessions = pd.read_pickle('../data/raw/ga_sessions.pkl')
    hits = pd.read_pickle('../data/raw/ga_hits.pkl')
except Exception as e:
    print(f"❌ Ошибка: {e}")
    # Re-raise after reporting: every later cell needs `sessions` and `hits`,
    # so continuing would only turn the real cause into a confusing NameError.
    raise
else:
    print("✅ Данные загружены!")
    print(f"Сессии: {sessions.shape[0]} строк, {sessions.shape[1]} колонок")
    print(f"События: {hits.shape[0]} строк, {hits.shape[1]} колонок")

✅ Данные загружены!
Сессии: 1860042 строк, 18 колонок
События: 15726470 строк, 11 колонок


## 2. Обзор данных

In [4]:
def head_to_file(df, file_name):
    """Write the first 10 rows of ``df`` to ``../data/temp/<file_name>_head.csv``.

    Args:
        df: frame whose leading rows are exported (index is not written).
        file_name: prefix of the output file name.
    """
    out_dir = '../data/temp'
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(out_dir, exist_ok=True)
    df.head(10).to_csv(f'{out_dir}/{file_name}_head.csv', index=False)

In [5]:
def explore_data(df, name, n=5):
    """Print a quick overview of a frame: its first rows and its column info.

    Args:
        df: frame to inspect.
        name: label shown in the section header.
        n: number of leading rows to show.

    ``display`` is not defined in this file; it is expected to be supplied by
    the notebook environment.
    """
    print(f"\n🔍 Анализ: {name}")
    print(f"\nПервые {n} строк:")
    display(df.head(n))
    print("\nИнформация о колонках:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()
        
def explore_data_modern(df, name):
    """Show a per-column summary of ``df`` and return it as a DataFrame.

    For every column the summary holds its dtype, number of unique values,
    number and percentage of missing values, and the value from the first
    row as an example. Rows are ordered by missing-value count, descending.

    Args:
        df: frame to summarise.
        name: label shown in the section header.

    Returns:
        The unstyled summary DataFrame (one row per column of ``df``).
    """
    print(f"\n🔍 Анализ датафрейма: {name}")

    null_mask = df.isnull()
    # An empty frame has no first row to take examples from.
    first_row = df.iloc[0] if len(df) > 0 else None

    summary = {
        'Тип данных': df.dtypes,
        'Уникальных': df.nunique(),
        'Пропусков': null_mask.sum(),
        '% Пропусков': (null_mask.mean() * 100).round(1),
        'Пример значения': first_row,
    }
    analysis = pd.DataFrame(summary)
    analysis = analysis.sort_values('Пропусков', ascending=False)

    # Highlight the missing-value columns with a red gradient and render the
    # percentage with a trailing "%" sign.
    styler = analysis.style
    styler = styler.background_gradient(subset=['Пропусков', '% Пропусков'], cmap='Reds')
    styler = styler.format({'% Пропусков': '{:.1f}%'})

    print("\n📊 Основные характеристики колонок:")
    display(styler)

    return analysis

def print_unique(df, column):
    """Print the distinct values of ``df[column]`` as a numbered list (from 1)."""
    position = 0
    for item in df[column].unique():
        position += 1
        print(f"{position}. {item}")
    
    

## Выделим данные об устройствах для последующего заполнения пропусков в device_brand

In [6]:
# Rows of `sessions` whose device_brand is missing (all columns kept).
missing_brand = sessions[sessions['device_brand'].isna()]

# Keep only the device-related columns
device_cols = ['device_category', 'device_os', 'device_screen_resolution', 'device_browser', 'device_brand', 'device_model']
missing_devices = missing_brand[device_cols].copy()

# Store each row's index label from `sessions` as a regular column as well.
missing_devices['original_index'] = missing_brand.index

# Persist the subset to ../data/temp and report how many rows it contains.
missing_devices.to_pickle('../data/temp/missing_devices.pkl')
print(f"Найдено строк с пропущенным брендом: {len(missing_devices)}")

Найдено строк с пропущенным брендом: 118678


In [7]:
explore_data_modern(missing_devices, 'missing_devices')


🔍 Анализ датафрейма: missing_devices

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,118678,100.0%,
device_model,object,0,118678,100.0%,
device_category,object,3,0,0.0%,desktop
device_os,object,5,0,0.0%,Windows
device_screen_resolution,object,1038,0,0.0%,1536x864
device_browser,object,24,0,0.0%,Chrome
original_index,int64,118678,0,0.0%,28


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,118678,100.0,
device_model,object,0,118678,100.0,
device_category,object,3,0,0.0,desktop
device_os,object,5,0,0.0,Windows
device_screen_resolution,object,1038,0,0.0,1536x864
device_browser,object,24,0,0.0,Chrome
original_index,int64,118678,0,0.0,28


In [8]:
print(missing_devices['device_category'].value_counts())

device_category
desktop    118585
mobile         76
tablet         17
Name: count, dtype: int64


In [9]:
explore_data(sessions, "Сессии (ga_sessions)")
explore_data(hits, "События (ga_hits)")


🔍 Анализ: Сессии (ga_sessions)

Первые 5 строк:


Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637757,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.16368672,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.164065,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk
3,9055447046360770272.1622255328.1622255328,2108385564.1622252,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622252,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow



Информация о колонках:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1860042 entries, 0 to 1860041
Data columns (total 18 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   session_id                object
 1   client_id                 object
 2   visit_date                object
 3   visit_time                object
 4   visit_number              int64 
 5   utm_source                object
 6   utm_medium                object
 7   utm_campaign              object
 8   utm_adcontent             object
 9   utm_keyword               object
 10  device_category           object
 11  device_os                 object
 12  device_brand              object
 13  device_model              object
 14  device_screen_resolution  object
 15  device_browser            object
 16  geo_country               object
 17  geo_city                  object
dtypes: int64(1), object(17)
memory usage: 255.4+ MB
None

🔍 Анализ: События (ga_hits)

Первые 5 строк:

Unnamed: 0,session_id,hit_date,hit_time,hit_number,hit_type,hit_referer,hit_page_path,event_category,event_action,event_label,event_value
0,5639623078712724064.1640254056.1640254056,2021-12-23,597864.0,30,event,,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show,,
1,7750352294969115059.1640271109.1640271109,2021-12-23,597331.0,41,event,,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show,,
2,885342191847998240.1640235807.1640235807,2021-12-23,796252.0,49,event,,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show,,
3,142526202120934167.1640211014.1640211014,2021-12-23,934292.0,46,event,,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show,,
4,3450086108837475701.1640265078.1640265078,2021-12-23,768741.0,79,event,,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show,,



Информация о колонках:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15726470 entries, 0 to 15726469
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   session_id      object 
 1   hit_date        object 
 2   hit_time        float64
 3   hit_number      int64  
 4   hit_type        object 
 5   hit_referer     object 
 6   hit_page_path   object 
 7   event_category  object 
 8   event_action    object 
 9   event_label     object 
 10  event_value     object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.3+ GB
None


### Очистка. 
## Шаг подготовительный.

**Функция сохранения фреймов**

In [10]:
SESSIONS_FRAME = 'sessions'
HITS_FRAME = 'hits'
def save_frame(df, frame_name, comment):
    """Pickle a snapshot of ``df`` and append a line to its change log.

    The snapshot is written to
    ``../data/processed/<frame_name>_<YYYYmmdd_HHMM>.pkl`` and the comment is
    appended to ``../data/processed/log_<frame_name>.txt``.

    Args:
        df: frame to snapshot.
        frame_name: prefix used for both the snapshot and the log file.
        comment: free-text description stored in the log next to the timestamp.
    """
    timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M")
    path = '../data/processed/'
    # Create the output directory on first use instead of failing on a fresh
    # checkout.
    os.makedirs(path, exist_ok=True)
    # NOTE: the timestamp has minute resolution, so two snapshots of the same
    # frame taken within one minute share a file name and the later one
    # overwrites the earlier one.
    filename = f'{path}{frame_name}_{timestamp}.pkl'
    df.to_pickle(filename)
    log_file = f'{path}log_{frame_name}.txt'
    with open(log_file, "a", encoding="utf-8") as file:
        file.write(f"{timestamp}: {comment}\n")

    print(f'Сохранено: {filename} | Лог: {log_file}')
    
def save_session(comment):
    """Snapshot the module-level ``sessions`` frame via ``save_frame``."""
    save_frame(df=sessions, frame_name=SESSIONS_FRAME, comment=comment)

def save_hits(comment):
    """Snapshot the module-level ``hits`` frame via ``save_frame``."""
    save_frame(df=hits, frame_name=HITS_FRAME, comment=comment)
    

**Сохраним фреймы перед реформами**

In [11]:
save_frame(sessions, SESSIONS_FRAME, 'Стартовое состояние фрейма перед очисткой')
save_frame(hits, HITS_FRAME, 'Стартовое состояние фрейма перед очисткой')

Сохранено: ../data/processed/sessions_20250707_1453.pkl | Лог: ../data/processed/log_sessions.txt
Сохранено: ../data/processed/hits_20250707_1453.pkl | Лог: ../data/processed/log_hits.txt


## Шаг 1. 
**Удаляем device_model, utm_keyword, device_os (очень много пропусков, определяем как мусор)**

In [12]:
# device_os is removed from `sessions` here; the earlier `missing_brand`
# snapshot still carries it and is what the later Macintosh fill reads.
sessions = sessions.drop(columns=['device_model', 'utm_keyword', 'device_os'])
save_session('Удалены столбцы: "device_model", "utm_keyword", "device_os"')

Сохранено: ../data/processed/sessions_20250707_1455.pkl | Лог: ../data/processed/log_sessions.txt


**Удаляем event_value по тем же мотивам**

In [13]:
hits = hits.drop(columns=['event_value'])
save_hits('Удалён столбец: "event_value"')

Сохранено: ../data/processed/hits_20250707_1455.pkl | Лог: ../data/processed/log_hits.txt


**Смотрим что получилось**

In [14]:
analys_session = explore_data_modern(sessions, "Сессии (ga_sessions)")
analys_hit = explore_data_modern(hits, "События (ga_hits)")
analys_hit.to_csv('../data/temp/analys_hits.csv')
analys_session.to_csv('../data/temp/analys_sessions.csv')


🔍 Анализ датафрейма: Сессии (ga_sessions)

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
utm_adcontent,object,286,335615,18.0%,vCIpmpaGBnIQhyYNkXqp
utm_campaign,object,412,219603,11.8%,LEoPHuyFvzoNfnzGgfcd
device_brand,object,206,118678,6.4%,Huawei
utm_source,object,293,97,0.0%,ZpYIoDJMcFzVoPFsHGJL
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner



🔍 Анализ датафрейма: События (ga_hits)

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2%,597864.000000
hit_referer,object,37873,6274804,39.9%,
event_label,object,39825,3760184,23.9%,
session_id,object,1734610,0,0.0%,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0%,2021-12-23
hit_number,int64,500,0,0.0%,30
hit_type,object,1,0,0.0%,event
hit_page_path,object,342715,0,0.0%,sberauto.com/cars?utm_source_initial=google&utm_medium_initial=cpc&utm_campaign_initial=bJJuEXRheRIxXEaYIXqM|BmWKiKCvBVipoWOpNUIL&utm_content_initial=yXzSAJWSuRbYeLFyMVFl|iJxXDTgPAXGQYOdgCzqe|xAEYTlJDbQNwQTLGxICQ|pmvTbUlmnwAkuHIbvKUg|lFpAoUqEvHoyZfCNRuCR|oxjYxrkwnRSTTYcVZzdb|dtYJKUplNmmKCOfiziff|NgTCwCMgdzcCqxiALuwY|aHSPdWXGkJravzWuwcJA|HbolMJUevblAbkHClEQa|kqZPVvZmXADsCZxIQwbv|QaGunexmXrztCWbLcYxs|OfdDOXSwHYclonQJpPxh|htxLmqvAkmISCliXGMzb|HbolMJUevblAbkHClEQa&utm_term_initial=&city=1&rental_page=rental_only&rental_car=rental_only&city=18
event_category,object,52,0,0.0%,quiz
event_action,object,230,0,0.0%,quiz_show


**Посмотрим, есть ли что-нибудь внятное в столбцах utm_adcontent, utm_campaign, utm_source, utm_medium**

In [17]:
print("utm_adcontent:")
print(sessions['utm_adcontent'].value_counts())
print("utm_campaign:")
print(sessions['utm_campaign'].value_counts())
print("utm_source:")
print(sessions['utm_source'].value_counts())
print("utm_medium:")
print(sessions['utm_medium'].value_counts())

utm_adcontent:
utm_adcontent
JNHcPlZPxEMWDnRiyoBf    1006599
vCIpmpaGBnIQhyYNkXqp     181048
xhoenQgDQsgfEPYNPwKO     113072
PkybGvWbaqORmxjNunqZ      60316
LLfCasrxQzJIyuldcuWy      24222
                         ...   
ryswqHdLvbezpDLjokYF          1
gQEnaeQmOHisHSTEyzhp          1
PlanrlymnpCeYvUrUTVJ          1
WbXQsVOKLOcJrHikuQTZ          1
oJivRDNIrrOckRBIKOtG          1
Name: count, Length: 286, dtype: int64
utm_campaign:
utm_campaign
LTuZkdKfxRGVceoWkVyg    463481
LEoPHuyFvzoNfnzGgfcd    324044
FTjNLDyTrXaWYgZymFkV    247360
gecBYcKZCPMcVYdSSzKP    134042
TmThBvoCcwkCZZUWACYq     26597
                         ...  
cqgnjDbqrtCipVvzhxqa         1
AdtHYglxfCpTpwZeTAuW         1
pcvPxfVFaAmhwFmvIeYd         1
qPDTdivQVeflLjTYIJnG         1
cXxuwXPoQCvAXPHpFcZl         1
Name: count, Length: 412, dtype: int64
utm_source:
utm_source
ZpYIoDJMcFzVoPFsHGJL    578290
fDLlAcSmythWSCVMvqvL    300575
kjsLglQLzykiRbcDiGcD    266354
MvfHsxITijuriZxsqZqt    186199
BHcvLfOaCWvWTykYqHVe    11

**Теперь удалим столбец hit_type: в нём единственное уникальное значение**

In [106]:
hits = hits.drop(columns=['hit_type'])
save_hits('Удалён столбец: "hit_type"')

Сохранено: ../data/processed/sessions_20250706_1958.pkl | Лог: ../data/processed/log_sessions.txt
Сохранено: ../data/processed/hits_20250706_1959.pkl | Лог: ../data/processed/log_hits.txt


**Посмотрим что вышло**

In [107]:
explore_data(sessions, SESSIONS_FRAME)
explore_data(hits, HITS_FRAME)

In [108]:
explore_data_modern(sessions, SESSIONS_FRAME)
explore_data_modern(hits, HITS_FRAME)


🔍 Анализ датафрейма: sessions

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,206,118678,6.4%,Huawei
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner
device_category,object,3,0,0.0%,mobile
device_screen_resolution,object,5039,0,0.0%,360x720
device_browser,object,57,0,0.0%,Chrome



🔍 Анализ датафрейма: hits

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2%,597864.000000
hit_referer,object,37873,6274804,39.9%,
event_label,object,39825,3760184,23.9%,
session_id,object,1734610,0,0.0%,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0%,2021-12-23
hit_number,int64,500,0,0.0%,30
hit_page_path,object,342715,0,0.0%,sberauto.com/cars?utm_source_initial=google&utm_medium_initial=cpc&utm_campaign_initial=bJJuEXRheRIxXEaYIXqM|BmWKiKCvBVipoWOpNUIL&utm_content_initial=yXzSAJWSuRbYeLFyMVFl|iJxXDTgPAXGQYOdgCzqe|xAEYTlJDbQNwQTLGxICQ|pmvTbUlmnwAkuHIbvKUg|lFpAoUqEvHoyZfCNRuCR|oxjYxrkwnRSTTYcVZzdb|dtYJKUplNmmKCOfiziff|NgTCwCMgdzcCqxiALuwY|aHSPdWXGkJravzWuwcJA|HbolMJUevblAbkHClEQa|kqZPVvZmXADsCZxIQwbv|QaGunexmXrztCWbLcYxs|OfdDOXSwHYclonQJpPxh|htxLmqvAkmISCliXGMzb|HbolMJUevblAbkHClEQa&utm_term_initial=&city=1&rental_page=rental_only&rental_car=rental_only&city=18
event_category,object,52,0,0.0%,quiz
event_action,object,230,0,0.0%,quiz_show


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2,597864.0
hit_referer,object,37873,6274804,39.9,
event_label,object,39825,3760184,23.9,
session_id,object,1734610,0,0.0,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0,2021-12-23
hit_number,int64,500,0,0.0,30
hit_page_path,object,342715,0,0.0,sberauto.com/cars?utm_source_initial=google&ut...
event_category,object,52,0,0.0,quiz
event_action,object,230,0,0.0,quiz_show


**Удалим hit_page_path, ну потому что инфа совершенно бесполезна**

In [109]:
hits = hits.drop(columns=['hit_page_path'])
save_hits('Удалён столбец: "hit_page_path"')

Сохранено: ../data/processed/hits_20250706_2001.pkl | Лог: ../data/processed/log_hits.txt


**Традиционно посмотрим**

In [110]:
explore_data_modern(hits, HITS_FRAME)


🔍 Анализ датафрейма: hits

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2%,597864.000000
hit_referer,object,37873,6274804,39.9%,
event_label,object,39825,3760184,23.9%,
session_id,object,1734610,0,0.0%,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0%,2021-12-23
hit_number,int64,500,0,0.0%,30
event_category,object,52,0,0.0%,quiz
event_action,object,230,0,0.0%,quiz_show


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2,597864.0
hit_referer,object,37873,6274804,39.9,
event_label,object,39825,3760184,23.9,
session_id,object,1734610,0,0.0,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0,2021-12-23
hit_number,int64,500,0,0.0,30
event_category,object,52,0,0.0,quiz
event_action,object,230,0,0.0,quiz_show


In [None]:
print("Категории:")
print(hits['event_category'].value_counts())

**event_category не содержит ничего интересного, а event_label при наличии 40 000 уникальных значений, как-то связанных с event, — это мусор**

In [111]:
hits = hits.drop(columns=['event_label', 'event_category'])
save_hits('Удалён столбец: "event_label", "event_category"')

Сохранено: ../data/processed/hits_20250706_2003.pkl | Лог: ../data/processed/log_hits.txt


**Снова посмотрим**

In [112]:
explore_data_modern(sessions, SESSIONS_FRAME)
explore_data_modern(hits, HITS_FRAME)


🔍 Анализ датафрейма: sessions

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,206,118678,6.4%,Huawei
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner
device_category,object,3,0,0.0%,mobile
device_screen_resolution,object,5039,0,0.0%,360x720
device_browser,object,57,0,0.0%,Chrome



🔍 Анализ датафрейма: hits

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2%,597864.000000
hit_referer,object,37873,6274804,39.9%,
session_id,object,1734610,0,0.0%,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0%,2021-12-23
hit_number,int64,500,0,0.0%,30
event_action,object,230,0,0.0%,quiz_show


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
hit_time,float64,925887,9160322,58.2,597864.0
hit_referer,object,37873,6274804,39.9,
session_id,object,1734610,0,0.0,5639623078712724064.1640254056.1640254056
hit_date,object,226,0,0.0,2021-12-23
hit_number,int64,500,0,0.0,30
event_action,object,230,0,0.0,quiz_show


## Попробуем заполнить пропуски device_brand
### Ранее мы отложили missing_brand
**Сначала заполним категорию desktop значением NoName**

In [114]:
# Desktop sessions with a missing brand get the placeholder 'NoName';
# the updated frame is then snapshotted with a log entry.
sessions.loc[sessions['device_brand'].isna() & (sessions['device_category'] == 'desktop'), 'device_brand'] = 'NoName'
save_session('Заполнение пропусков в device_brand значениями NoName для категории desktop')

Сохранено: ../data/processed/sessions_20250706_2012.pkl | Лог: ../data/processed/log_sessions.txt


**Посмотрим**

In [117]:
explore_data_modern(sessions, SESSIONS_FRAME)
missing_devices = missing_devices[missing_devices['device_category'] != 'desktop']
explore_data_modern(missing_devices, 'missing_devices')


🔍 Анализ датафрейма: sessions

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,207,93,0.0%,Huawei
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner
device_category,object,3,0,0.0%,mobile
device_screen_resolution,object,5039,0,0.0%,360x720
device_browser,object,57,0,0.0%,Chrome



🔍 Анализ датафрейма: missing_devices

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,93,100.0%,
device_model,object,0,93,100.0%,
device_category,object,2,0,0.0%,tablet
device_os,object,3,0,0.0%,Macintosh
device_screen_resolution,object,26,0,0.0%,810x1080
device_browser,object,7,0,0.0%,Safari
original_index,int64,93,0,0.0%,69321


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,93,100.0,
device_model,object,0,93,100.0,
device_category,object,2,0,0.0,tablet
device_os,object,3,0,0.0,Macintosh
device_screen_resolution,object,26,0,0.0,810x1080
device_browser,object,7,0,0.0,Safari
original_index,int64,93,0,0.0,69321


In [118]:
print(missing_devices['device_os'].value_counts())

device_os
Linux        57
Macintosh    20
Windows      16
Name: count, dtype: int64


In [119]:
print(sessions['device_brand'].value_counts())

device_brand
Apple       551088
Samsung     332194
Xiaomi      288367
            248500
Huawei      185853
             ...  
Vodafone         1
Wexler           1
KingSing         1
Star             1
Opera            1
Name: count, Length: 207, dtype: int64


**Разберёмся с макинтошами**

In [121]:
# device_os was dropped from `sessions`, so the Macintosh rows are looked up in
# the `missing_brand` snapshot and written back by index label.
# NOTE(review): `missing_brand` was taken before the desktop fill, so it is not
# limited to rows that are still missing. If any desktop rows have
# device_os == 'Macintosh', their 'NoName' value is overwritten with 'Apple'
# here — confirm this is intended.
mac_indices = missing_brand[missing_brand['device_os'] == 'Macintosh'].index
sessions.loc[mac_indices, 'device_brand'] = 'Apple'
save_session('Заполнение пропусков в device_brand значениями Apple для os Macintosh')

Сохранено: ../data/processed/sessions_20250706_2039.pkl | Лог: ../data/processed/log_sessions.txt


In [123]:
explore_data_modern(sessions, SESSIONS_FRAME)
missing_devices = missing_devices[missing_devices['device_os'] != 'Macintosh']
explore_data_modern(missing_devices, 'missing_devices')


🔍 Анализ датафрейма: sessions

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,207,73,0.0%,Huawei
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner
device_category,object,3,0,0.0%,mobile
device_screen_resolution,object,5039,0,0.0%,360x720
device_browser,object,57,0,0.0%,Chrome



🔍 Анализ датафрейма: missing_devices

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,73,100.0%,
device_model,object,0,73,100.0%,
device_category,object,2,0,0.0%,mobile
device_os,object,2,0,0.0%,Linux
device_screen_resolution,object,19,0,0.0%,393x851
device_browser,object,5,0,0.0%,Chrome
original_index,int64,73,0,0.0%,119227


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,73,100.0,
device_model,object,0,73,100.0,
device_category,object,2,0,0.0,mobile
device_os,object,2,0,0.0,Linux
device_screen_resolution,object,19,0,0.0,393x851
device_browser,object,5,0,0.0,Chrome
original_index,int64,73,0,0.0,119227


In [128]:
print(missing_devices['device_browser'].value_counts())

device_browser
Chrome              63
Samsung Internet     6
Edge                 2
Firefox              1
UC Browser           1
Name: count, dtype: int64


In [129]:
# Rows still missing a brand whose browser is 'Samsung Internet' are
# attributed to Samsung; the updated frame is then snapshotted.
sessions.loc[(sessions['device_browser'] == 'Samsung Internet') & (sessions['device_brand'].isna()), 'device_brand'] = 'Samsung'
save_session('Заполнение пропусков в device_brand значениями "Samsung" для браузера "Samsung Internet"')

Сохранено: ../data/processed/sessions_20250706_2103.pkl | Лог: ../data/processed/log_sessions.txt


In [131]:
explore_data_modern(sessions, SESSIONS_FRAME)
missing_devices = missing_devices[missing_devices['device_browser'] != 'Samsung Internet']


🔍 Анализ датафрейма: sessions

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,207,67,0.0%,Huawei
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner
device_category,object,3,0,0.0%,mobile
device_screen_resolution,object,5039,0,0.0%,360x720
device_browser,object,57,0,0.0%,Chrome


In [140]:
explore_data_modern(missing_devices, 'missing_devices')
unknown_indices = missing_devices.index
print(unknown_indices)


🔍 Анализ датафрейма: missing_devices

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
device_brand,object,0,67,100.0%,
device_model,object,0,67,100.0%,
device_category,object,2,0,0.0%,mobile
device_os,object,2,0,0.0%,Linux
device_screen_resolution,object,15,0,0.0%,393x851
device_browser,object,4,0,0.0%,Chrome
original_index,int64,67,0,0.0%,119227


Index([ 119227,  121378,  137577,  265642,  276129,  276130,  276131,  412904,
        429723,  483498,  537622,  537623,  537624,  553620,  553622,  553624,
        605930,  605931,  607704,  607706,  624841,  638499,  652853,  671507,
        671508,  787164,  843906,  922551,  922553, 1014397, 1019406, 1057636,
       1152139, 1180190, 1193771, 1199700, 1199702, 1272918, 1346304, 1362164,
       1395819, 1419719, 1421352, 1421353, 1433829, 1434062, 1471626, 1471631,
       1471638, 1471640, 1471641, 1471679, 1471685, 1478907, 1489509, 1489510,
       1524279, 1524526, 1531030, 1542872, 1561882, 1671088, 1671089, 1704802,
       1716387, 1724652, 1747305],
      dtype='int64')


In [141]:
# `unknown_indices` is the index of what is left in `missing_devices` after the
# earlier filters; those rows of `sessions` get the placeholder 'unknown'.
sessions.loc[unknown_indices, 'device_brand'] = 'unknown'
save_session('Заполнение оставшихся пропусков в device_brand значениями unknown')

Сохранено: ../data/processed/sessions_20250706_2137.pkl | Лог: ../data/processed/log_sessions.txt


In [142]:
explore_data_modern(sessions, SESSIONS_FRAME)


🔍 Анализ датафрейма: sessions

📊 Основные характеристики колонок:


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
session_id,object,1860042,0,0.0%,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0%,2108382700.1637753791
visit_date,object,226,0,0.0%,2021-11-24
visit_time,object,85318,0,0.0%,14:36:32
visit_number,int64,537,0,0.0%,1
utm_medium,object,56,0,0.0%,banner
device_category,object,3,0,0.0%,mobile
device_brand,object,208,0,0.0%,Huawei
device_screen_resolution,object,5039,0,0.0%,360x720
device_browser,object,57,0,0.0%,Chrome


Unnamed: 0,Тип данных,Уникальных,Пропусков,% Пропусков,Пример значения
session_id,object,1860042,0,0.0,9055434745589932991.1637753792.1637753792
client_id,object,1391719,0,0.0,2108382700.1637753791
visit_date,object,226,0,0.0,2021-11-24
visit_time,object,85318,0,0.0,14:36:32
visit_number,int64,537,0,0.0,1
utm_medium,object,56,0,0.0,banner
device_category,object,3,0,0.0,mobile
device_brand,object,208,0,0.0,Huawei
device_screen_resolution,object,5039,0,0.0,360x720
device_browser,object,57,0,0.0,Chrome


## 3. Анализ целевых событий
Определяем, какие события считать конверсиями.

In [19]:
def get_target_events(df, min_freq=50):
    """Return the ``event_action`` values occurring at least ``min_freq`` times.

    Prints how many distinct events pass the threshold and how many fall
    below it.

    Args:
        df: frame with an ``event_action`` column.
        min_freq: minimum number of occurrences for an event to be kept.

    Returns:
        Index of the kept event names, in ``value_counts`` order.
    """
    counts = df['event_action'].value_counts()
    is_frequent = counts >= min_freq
    valid_events = counts.loc[is_frequent].index
    # Every distinct event is either kept or rare, so the rare count is the rest.
    rare_total = len(counts) - len(valid_events)

    print(f"Допустимые события: {len(valid_events)}")
    print(f"Редкие события (игнорируются): {rare_total}")
    return valid_events


Допустимые события: 146
Редкие события (игнорируются): 84


Топ событий в hits

In [18]:
print("\nТоп-10 событий:")
print(hits['event_action'].value_counts().head(10))


Топ-10 событий:
event_action
view_card                  3558985
view_new_card              3551009
sub_landing                1798117
go_to_car_card              973666
sub_view_cars_click         791515
search_form_region          512069
search_form_search_btn      433748
sap_search_form_cost_to     356596
showed_number_ads           326274
pagination_click            248944
Name: count, dtype: int64


Попробуем вывести претендентов на целевые события.

In [23]:
target_events = get_target_events(hits, 100)
for idx, event in enumerate(target_events, 1):
    print(f"{idx}. {event}")

Допустимые события: 146
Редкие события (игнорируются): 84
1. view_card
2. view_new_card
3. sub_landing
4. go_to_car_card
5. sub_view_cars_click
6. search_form_region
7. search_form_search_btn
8. sap_search_form_cost_to
9. showed_number_ads
10. pagination_click
11. search_form_mark_select
12. quiz_show
13. search_form_model_select
14. sap_search_form_cost_from
15. photos_all
16. search_form_search_car_type_select
17. photos
18. sub_car_page
19. view_more_click
20. search_kpp
21. sub_header_link_click
22. proactive invitation shown
23. sub_offer_click
24. start_chat
25. listing_ads_sort
26. search_form_rental
27. sub_faq
28. quiz_start
29. sub_view_faq_click
30. search_form_cost_to
31. analystlabel
32. start_auth
33. search_body_type
34. sub_car_claim_click
35. search_form_model_multiselec
36. search_form_year_from
37. show_phone_input
38. toggle_leasing_cars_show
39. click_on_logo
40. search_km_to
41. search_form_modification_select
42. sub_open_dialog_click
43. search_engine
44. search

Выкинем в файл.

In [27]:
# Write the candidate event names to a text file, one per line.
# The encoding is given explicitly so the result does not depend on the
# platform default (the log writer in this notebook also uses UTF-8).
with open("../data/temp/target_events_list.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(target_events))
print("✅ Список сохранён в target_events_list.csv")

✅ Список сохранён в target_events_list.csv


In [30]:
head_to_file(sessions, 'sessions')
head_to_file(hits, 'hits')
print("✅ Хеды выгружены в файлы")

✅ Хеды выгружены в файлы


Ручной ввод целевых событий.

In [31]:
target_actions = ['start_chat']