In [1]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import pyarrow.feather as ft

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

import ipywidgets as widgets
from ipywidgets.embed import embed_minimal_html
from IPython.display import display
from IPython.core.display import HTML

from tqdm import tqdm
from pprint import pprint

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation warnings from pandas — consider narrowing the filter.
warnings.simplefilter('ignore')

In [2]:
# Directory holding the input files, relative to the notebook.
PATH = '../data'

# File names of the train set, the public test set and the option-name table.
TRAIN_DATA, TEST_DATA, OPTION = 'project_train.f', 'project_test_public.f', 'option_names.csv'

# Full paths, derived once from the directory and the file names.
TRAIN_PATH, TEST_PATH, OPTION_PATH = (
    os.path.join(PATH, file_name) for file_name in (TRAIN_DATA, TEST_DATA, OPTION)
)

# The two datasets are read as Feather files; the option table is a CSV indexed by `id`.
train_df, test_df = (ft.read_feather(file_path) for file_path in (TRAIN_PATH, TEST_PATH))
options = pd.read_csv(OPTION_PATH, index_col='id')

In [3]:
# (rows, columns) of the train and test frames, side by side.
tuple(frame.shape for frame in (train_df, test_df))

((1424484, 70), (9780, 66))

In [4]:
# Columns that exist in the train frame but not in the test frame.
train_df.columns.difference(test_df.columns)

Index(['actual_price', 'close_date', 'price', 'start_date'], dtype='object')

Данные содержат очень много колонок, поэтому попробуем сгруппировать их по типам данных.

In [5]:
# Map each dtype name to the list of columns having that dtype. The mapping is built
# from `DataFrame.dtypes` directly because `groupby(..., axis=1)` is deprecated in
# recent pandas releases.
type_dct = {}
for column_name, column_dtype in train_df.dtypes.items():
    type_dct.setdefault(str(column_dtype), []).append(column_name)
# Sort by dtype name so the mapping order is deterministic.
type_dct = dict(sorted(type_dct.items()))
pprint(type_dct, compact=True)

{'datetime64[ns]': ['start_date', 'close_date', 'sale_end_date'],
 'float64': ['actual_price', 'latitude', 'longitude', 'crashes', 'is_taxi',
             'is_pledged', 'is_restrictions', 'is_carsharing', 'audiosistema',
             'diski', 'electropodemniki', 'fary', 'salon',
             'upravlenie_klimatom', 'usilitel_rul'],
 'int64': ['doors_number', 'year', 'mileage'],
 'object': ['price', 'brand', 'model', 'generation', 'modification',
            'equipment', 'body_type', 'drive_type', 'transmission_type',
            'engine_type', 'color', 'pts', 'owners_count', 'steering_wheel',
            'description', 'Передняя левая фара', 'Передняя правая фара',
            'Правое зеркало', 'Левое зеркало', 'Капот', 'Заднее правое крыло',
            'Заднее левое крыло', 'Передняя левая дверь', 'Крыша',
            'Переднее правое крыло', 'Заднее стекло', 'Задний бампер',
            'Передняя правая дверь', 'Задняя правая фара', 'Дверь багажника',
            'Переднее левое крыл

In [6]:
# `price` is loaded with dtype object; convert it to a numeric dtype.
train_df['price'] = pd.to_numeric(train_df['price'])

In [7]:
def get_info(df: pd.DataFrame):
    """Build a per-column summary of ``df``.

    Every column is assigned one coarse kind:

    * more than 100 distinct values: 'geolocation' (columns named latitude/longitude),
      'numeric' (float64/int64), 'date' (datetime64[ns]) or 'text' (column named
      description);
    * remaining object columns: 'binary', 'category' or 'multicategory'. A column is
      'multicategory' when splitting its distinct values on commas yields fewer
      distinct tokens than there are distinct values;
    * everything else: 'binary' (at most two distinct values) or 'category'.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to describe. Only this frame is read; no global state is used.

    Returns
    -------
    pd.DataFrame
        Indexed by the columns of ``df`` with the columns ``type``, ``n_uniq``
        (distinct non-null values), ``multi_uniq`` (number of distinct comma-separated
        tokens minus one, or '---' when not computed), ``values`` (min/max pair or a
        preview of at most ten distinct values) and ``null_ratio``.
    """

    def _preview(items, count):
        # Small sets are shown in full; larger ones as the first ten items plus a marker.
        return items if count <= 10 else np.append(items[:10], '...')

    def _split_tokens(raw):
        # Non-empty strings have surrounding brackets removed and are split on commas.
        # Anything else (None, '', non-string objects) is kept as one token.
        # NOTE(review): tokens are not whitespace-stripped, so ' b' and 'b' count as
        # different tokens — confirm this is intended for ', '-separated values.
        if isinstance(raw, str) and raw:
            return raw.strip('[]').split(',')
        return [str(raw)]

    def _flat_kind(distinct, count):
        # Classification used when the column is not treated as multi-valued.
        if count <= 2:
            return 'binary', distinct
        return 'category', _preview(distinct, count)

    info_df = pd.DataFrame(columns=['type', 'n_uniq', 'multi_uniq', 'values', 'null_ratio'], index=df.columns)

    for column in df.columns:
        series = df[column]
        null_ratio = series.isna().sum() / len(series)
        n_uniq = series.nunique()
        uniq = series.unique()
        column_type = series.dtype
        type_name = ''
        values = '---'
        multi_n_uniq = '---'

        if n_uniq > 100:
            if column in ['latitude', 'longitude']:
                type_name = 'geolocation'
            elif column_type in ['float64', 'int64']:
                type_name = 'numeric'
            elif column_type in ['datetime64[ns]']:
                type_name = 'date'
            elif column in ['description']:
                type_name = 'text'
            # Range-like kinds are summarised by their extremes; text keeps '---'.
            if type_name in ('geolocation', 'numeric', 'date'):
                values = [series.min(), series.max()]

        if type_name == '' and column_type == 'object':
            # Tokens come from the distinct values of *this* frame's column (the previous
            # version read a global frame here, ignoring the `df` argument).
            multi_uniq = np.unique([token for raw in uniq for token in _split_tokens(raw)])
            multi_n_uniq = len(multi_uniq) - 1
            is_multi = len(uniq) > len(multi_uniq)
            if n_uniq > 2 and is_multi:
                type_name = 'multicategory'
                values = _preview(multi_uniq, multi_n_uniq)
            else:
                type_name, values = _flat_kind(uniq, n_uniq)

        if type_name == '':
            type_name, values = _flat_kind(uniq, n_uniq)

        info_df.loc[column] = [type_name, n_uniq, multi_n_uniq, values, null_ratio]
    return info_df

In [8]:
# Number of distinct (brand, model, generation) triples. Grouping on the three columns
# avoids gluing the strings together without a separator, where different triples can
# produce the same concatenated text. Rows with a missing key are left out, as they
# were by `nunique()`.
train_df.groupby(['brand', 'model', 'generation']).ngroups

1546

In [9]:
# Lift the row limit and widen the cells so the whole summary table is visible.
for option_name, option_value in (('display.max_rows', None), ('display.max_colwidth', 1000)):
    pd.set_option(option_name, option_value)

info_df = get_info(train_df)
info_df

Unnamed: 0,type,n_uniq,multi_uniq,values,null_ratio
actual_price,numeric,13964,---,"[15000.0, 23500000.0]",0.942838
price,numeric,32423,---,"[10000.0, 50000000.0]",0.0
start_date,date,1343058,---,"[2022-02-28 12:34:52.658711, 2023-02-27 23:33:21.435794]",0.057162
close_date,date,1343056,---,"[2022-02-28 13:01:38.549945, 2023-02-28 00:41:28.522994]",0.057162
sale_end_date,date,240,---,"[2022-06-11 00:00:00, 2023-02-05 00:00:00]",0.942838
brand,category,76,75,"[Volkswagen, Renault, Mercedes-Benz, Hyundai, Jaguar, ГАЗ, Skoda, Toyota, ZOTYE, BMW, ...]",0.0
model,category,779,778,"[Passat, Kaptur, M-класс, GL-класс AMG, Tucson, E-Pace, 24 Волга, Viano, Superb, GLC-класс Coupe, ...]",0.0
generation,category,1183,1182,"[B8 (2014—2020), I рестайлинг (2020—2023), W164 рестайлинг (2008—2011), X166 (2012—2016), IV (2020—2023), I (2017—2023), 2410 (1985—1992), W639 рестайлинг (2010—2014), II (2008—2013), C253 рестайлинг (2019—2023), ...]",0.0
modification,category,1795,1794,"[2.0 TDI DSG (150 л.с.), 1.3 TCe 4WD CVT (150 л.с.), ML 320d 3.0 4MATIC 7G-Tronic (211 л.с.), GL 63 5.5 4MATIC 7G-Tronic (557 л.с.), 2.0 4WD AT (150 л.с.), 2.0 4WD AT (249 л.с.), 2.4 MT (100 л.с.), 2.1 CDI 4MATIC AT (163 л.с.) Lang, 1.8 TSI DSG (160 л.с.), GLC 300d 2.0 4MATIC 9G-Tronic (245 л.с.), ...]",0.0
equipment,category,1102,1103,"[Highline, None, Conceptline, Trendline, Life Plus, Comfortline, Life, Style TCe 150, Edition One, Drive, ...]",0.229607


In [10]:
# Restore only the two display options changed for the summary table. Resetting 'all'
# would also discard any unrelated pandas option set earlier in the session
# (NOTE(review): it may additionally emit warnings for deprecated options in some
# pandas versions — confirm against the installed release).
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')

In [11]:
# Hand-picked list of columns.
for_vanya = [
    'generation', 'modification', 'equipment', 'description',
    'audiosistema', 'diski', 'electropodemniki', 'fary', 'salon',
    'upravlenie_klimatom', 'usilitel_rul',
    'audiosistema_mult', 'shini_i_diski_mult',
]
# Every column that get_info classified as 'multicategory'.
for_sasha = info_df.loc[info_df['type'] == 'multicategory'].index.to_list()
# Whatever remains once both lists above are removed.
for_maxim = info_df.index.difference(for_vanya + for_sasha).to_list()
for_dict = {'Ваня': for_vanya, 'Саша': for_sasha, 'Максим': for_maxim}

In [12]:
# Print which columns are assigned to each person.
pprint(for_dict, compact=True)

{'Ваня': ['generation', 'modification', 'equipment', 'description',
          'audiosistema', 'diski', 'electropodemniki', 'fary', 'salon',
          'upravlenie_klimatom', 'usilitel_rul', 'audiosistema_mult',
          'shini_i_diski_mult'],
 'Максим': ['actual_price', 'body_type', 'brand', 'close_date', 'color',
            'crashes', 'doors_number', 'drive_type', 'engine_type',
            'is_carsharing', 'is_pledged', 'is_restrictions', 'is_taxi',
            'latitude', 'longitude', 'mileage', 'model', 'owners_count',
            'price', 'pts', 'sale_end_date', 'start_date', 'steering_wheel',
            'transmission_type', 'year'],
 'Саша': ['Передняя левая фара', 'Передняя правая фара', 'Правое зеркало',
          'Левое зеркало', 'Капот', 'Заднее правое крыло', 'Заднее левое крыло',
          'Передняя левая дверь', 'Крыша', 'Переднее правое крыло',
          'Заднее стекло', 'Задний бампер', 'Передняя правая дверь',
          'Задняя правая фара', 'Дверь багажника', 'Передн

Колонки с типом **datetime64[ns]**:
- 'start_date' - дата открытия объявления
- 'close_date' - дата закрытия объявления
- 'sale_end_date' - дата продажи авто дилером

In [13]:
# Aggregations for the date columns: earliest value, latest value, their span, the
# number of non-null entries and the non-null share. 'min' and 'max' are passed by
# name because handing the Python builtins to `agg` is deprecated in recent pandas.
funcs = ['min', 'max', lambda x: x.max() - x.min(), 'count', lambda x: x.count() / len(x)]
# The row labels are assigned positionally, in the same order as `funcs`.
train_df[['start_date', 'close_date', 'sale_end_date']].agg(funcs).set_index(pd.Index(['start date', 'end date', 'range', 'count', 'ratio']))

Unnamed: 0,start_date,close_date,sale_end_date
start date,2022-02-28 12:34:52.658711,2022-02-28 13:01:38.549945,2022-06-11 00:00:00
end date,2023-02-27 23:33:21.435794,2023-02-28 00:41:28.522994,2023-02-05 00:00:00
range,364 days 10:58:28.777083,364 days 11:39:49.973049,239 days 00:00:00
count,1343058,1343058,81426
ratio,0.942838,0.942838,0.057162


In [14]:
# Check that every row has either both advert dates or a sale end date. Column-wise
# boolean operations replace the row-wise `apply`, which is slow on a frame this size
# and used positional `x[0]` access on a label-indexed Series.
has_date = train_df[['start_date', 'close_date', 'sale_end_date']].notnull()
((has_date['start_date'] & has_date['close_date']) | has_date['sale_end_date']).all()

True

Все строки относятся либо к объявлениям, либо к продажам дилеров.

Разделим train_df на данные дилеров и объявления.

In [15]:
# Rows with a sale_end_date go to the dealer frame, all other rows to the advert frame.
is_dealer = train_df['sale_end_date'].notna()

dealer_df, advert_df = train_df.loc[is_dealer], train_df.loc[~is_dealer]

In [16]:
# (rows, columns) of the dealer and advert frames, side by side.
tuple(frame.shape for frame in (dealer_df, advert_df))

((81426, 70), (1343058, 70))

In [17]:
# Dealer columns holding a single distinct value (a missing value counts as a value).
dealer_df.columns[dealer_df.nunique(dropna=False) == 1]

Index(['start_date', 'close_date', 'Заднее стекло'], dtype='object')

In [18]:
# Advert columns holding a single distinct value (a missing value counts as a value).
advert_df.columns[advert_df.nunique(dropna=False) == 1]

Index(['actual_price', 'sale_end_date', 'crashes', 'is_taxi', 'is_pledged',
       'is_restrictions', 'is_carsharing'],
      dtype='object')

In [19]:
def get_multicat_info(values: np.array, column: str, dfs: list, names: list):
    """Tabulate how often each observed token combination occurs in several frames.

    The raw values are tokenised (surrounding brackets stripped, split on commas) and
    every distinct combination found in ``values`` becomes one row of the result.
    Tokens that are numeric strings are replaced in the row labels by the matching
    entry of the module-level ``options`` table.

    Parameters
    ----------
    values : array-like
        Raw values used to fit the vocabulary and to enumerate the combinations.
    column : str
        Column read from every frame in ``dfs``.
    dfs : list
        Frames to compute the statistics for.
    names : list
        One label per frame, used as the top level of the result columns.

    Returns
    -------
    pd.DataFrame
        One row per combination and, per frame, the columns ``percentage`` (rows whose
        encoding equals the combination) and ``multi percentage`` (rows where no token
        count is exactly one below the combination's), both in percent rounded to four
        decimals.
    """
    vectorizer = CountVectorizer(
        preprocessor=lambda raw: str(raw).strip('[]'),
        tokenizer=lambda text: text.replace(', ', ',').split(','),
    )
    vectorizer.fit(values)

    def _combo_order(row):
        # Order by the sum of token counts; the tiny weighted term only breaks ties.
        return sum(row) + np.dot(np.array(row), 2 ** np.arange(len(row))) * 1e-8

    distinct_rows = np.unique(vectorizer.transform(values).toarray(), axis=0)
    token_rows = vectorizer.inverse_transform(sorted(distinct_rows, key=_combo_order))

    # Numeric tokens are option ids and get translated for display.
    has_option_ids = any(token.isnumeric() for tokens in token_rows for token in tokens)

    # `keys` are re-encoded by the vectorizer below; `labels` are what the reader sees.
    keys = [', '.join(tokens) for tokens in token_rows]
    if has_option_ids:
        labels = [
            ', '.join(options.loc[int(token)].item() if token.isnumeric() else token for token in tokens)
            for tokens in token_rows
        ]
    else:
        labels = keys

    stat_columns = ['total', 'percentage', 'multi total', 'multi percentage']
    result_info = pd.DataFrame(index=labels)

    for data, name in zip(dfs, names):
        multicat_info = pd.DataFrame(columns=stat_columns, index=labels)
        encoded = vectorizer.transform(data[column]).toarray()
        n_rows = data.shape[0]

        for label, key in zip(labels, keys):
            target = vectorizer.transform([key]).toarray()[0]
            exact = (encoded == target).all(axis=1).sum()
            superset = ((encoded - target) != -1).all(axis=1).sum()
            multicat_info.loc[label] = [
                exact,
                np.round(exact / n_rows * 100, 4),
                superset,
                np.round(superset / n_rows * 100, 4),
            ]

        multicat_info.columns = pd.MultiIndex.from_product([[name], multicat_info.columns])
        result_info = pd.concat([result_info, multicat_info], axis=1)

    result_info.columns = pd.MultiIndex.from_tuples(result_info.columns)
    # Only the percentages are reported; the absolute counts are dropped.
    return result_info.drop(['total', 'multi total'], axis=1, level=1)

In [20]:
damage_features = [
    'Передняя левая фара', 'Передняя правая фара', 'Правое зеркало',
    'Левое зеркало', 'Капот', 'Заднее правое крыло', 'Заднее левое крыло',
    'Передняя левая дверь', 'Крыша', 'Переднее правое крыло',
    'Заднее стекло', 'Задний бампер', 'Передняя правая дверь',
    'Задняя правая фара', 'Дверь багажника', 'Переднее левое крыло',
    'Лобовое стекло', 'Задняя правая дверь', 'Передний бампер',
    'Задняя левая фара', 'Задняя левая дверь',
]

# One output widget per feature; each becomes a section of the accordion.
outs = {name: widgets.Output() for name in damage_features}

# Per-feature table, built from every value seen in either the train or the test frame.
dict_info = {}
for feature in damage_features:
    observed = pd.unique(np.concatenate([frame[feature].unique() for frame in (train_df, test_df)]))
    dict_info[feature] = get_multicat_info(
        observed,
        feature,
        dfs=[train_df, dealer_df, advert_df, test_df],
        names=['train_df', 'dealer_df', 'advert_df', 'test_df'],
    )

accordion = widgets.Accordion(children=list(outs.values()))
for position, feature in enumerate(damage_features):
    accordion.set_title(position, feature)

# Render each table into its own output widget.
for feature, output in outs.items():
    with output:
        display(dict_info[feature])

# Start with every section collapsed.
accordion.selected_index = None

In [21]:
# Write the accordion to 'damage_features.html' using ipywidgets' minimal HTML embedding.
embed_minimal_html('damage_features.html', views=[accordion], title='damage_features report export')

In [22]:
# Show the exported HTML file in the notebook.
display(HTML(filename='./damage_features.html'))

In [23]:
component_features = [
    'aktivnaya_bezopasnost_mult', 'electroprivod_mult', 'fary_mult',
    'multimedia_navigacia_mult', 'obogrev_mult', 'pamyat_nastroek_mult',
    'podushki_bezopasnosti_mult', 'pomosh_pri_vozhdenii_mult',
    'protivoygonnaya_sistema_mult', 'salon_mult',
    'upravlenie_klimatom_mult',
]

# One output widget per feature; each becomes a section of the accordion.
outs = {name: widgets.Output() for name in component_features}

# Per-feature table, built from every value seen in either the train or the test frame.
dict_info = {}
for feature in component_features:
    observed = pd.unique(np.concatenate([frame[feature].unique() for frame in (train_df, test_df)]))
    dict_info[feature] = get_multicat_info(
        observed,
        feature,
        dfs=[train_df, dealer_df, advert_df, test_df],
        names=['train_df', 'dealer_df', 'advert_df', 'test_df'],
    )

accordion = widgets.Accordion(children=list(outs.values()))
for position, feature in enumerate(component_features):
    accordion.set_title(position, feature)

# Render each table into its own output widget.
for feature, output in outs.items():
    with output:
        display(dict_info[feature])

# Start with every section collapsed.
accordion.selected_index = None

In [24]:
# Write the accordion to 'component_features.html' using ipywidgets' minimal HTML embedding.
embed_minimal_html('component_features.html', views=[accordion], title='component_features report export')

In [25]:
# Show the exported HTML file in the notebook.
display(HTML(filename='./component_features.html'))

In [26]:
# Breakdown for a single column, using only the values seen in the train frame.
get_multicat_info(
    train_df['protivoygonnaya_sistema_mult'].unique(),
    'protivoygonnaya_sistema_mult',
    dfs=[train_df, dealer_df, advert_df],
    names=['train_df', 'dealer_df', 'advert_df'],
)

Unnamed: 0_level_0,train_df,train_df,dealer_df,dealer_df,advert_df,advert_df
Unnamed: 0_level_1,percentage,multi percentage,percentage,multi percentage,percentage,multi percentage
Сигнализация,6.2133,37.2477,1.3141,34.0469,6.5104,37.4418
Центральный замок,9.7555,48.7137,12.2995,73.0123,9.6013,47.2406
Иммобилайзер,0.6528,28.7416,1.1286,57.5639,0.6239,26.9942
Спутник,0.1012,4.5103,0.0086,0.3783,0.1068,4.7609
,43.3574,43.3574,23.9948,23.9948,44.5313,44.5313
"Сигнализация, Центральный замок",10.9717,30.0924,4.7638,32.1949,11.3481,29.9649
"Сигнализация, Иммобилайзер",0.6421,19.3303,0.5219,27.9223,0.6494,18.8094
"Центральный замок, Иммобилайзер",8.5199,27.3645,28.4909,55.9047,7.3091,25.6342
"Сигнализация, Спутник",0.2372,4.0436,0.0111,0.339,0.2509,4.2682
"Центральный замок, Спутник",0.1268,4.0897,0.0086,0.35,0.1339,4.3164


In [27]:
# from dataprep.eda import plot, plot_correlation, plot_missing, plot_diff, create_report

# create_report(dealer_df.drop(['start_date', 'close_date', 'Заднее стекло'], axis=1), progress=False).save('report_dealer_df.html')