In [None]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import pyarrow.feather as ft

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

import ipywidgets as widgets
from ipywidgets.embed import embed_minimal_html
from IPython.display import display
from IPython.core.display import HTML

from tqdm import tqdm
from pprint import pprint

%matplotlib inline
warnings.simplefilter('ignore')

In [None]:
# Directory holding the input files, relative to the notebook's working directory.
PATH = '../data'

# File names of the train set, the public test set and the option lookup table.
TRAIN_DATA, TEST_DATA, OPTION = 'project_train.f', 'project_test_public.f', 'option_names.csv'

TRAIN_PATH, TEST_PATH, OPTION_PATH = (
    os.path.join(PATH, file_name) for file_name in (TRAIN_DATA, TEST_DATA, OPTION)
)

# Both datasets are read as Feather files; the CSV is indexed by its `id` column.
train_df, test_df = (ft.read_feather(feather_path) for feather_path in (TRAIN_PATH, TEST_PATH))
options = pd.read_csv(OPTION_PATH, index_col='id')

In [None]:
# (rows, columns) of the train and test frames.
train_df.shape, test_df.shape

In [None]:
# Columns that exist in train but are absent from test.
train_df.columns.difference(test_df.columns)

Данные содержат очень много колонок, поэтому попробуем сгруппировать их по типам данных.

In [None]:
# Map each dtype name to the list of columns that have it.
# Iterating over `dtypes` avoids `DataFrame.groupby(..., axis=1)`, which is
# deprecated in recent pandas, and merges columns whose dtypes share the same
# string name instead of letting a later group overwrite an earlier one.
type_dct = {}
for column_name, column_dtype in train_df.dtypes.items():
    type_dct.setdefault(str(column_dtype), []).append(column_name)
pprint(type_dct, compact=True)

In [None]:
# Replace the `price` column with its numeric conversion.
train_df['price'] = pd.to_numeric(train_df['price'])

In [None]:
def get_info(df: pd.DataFrame):
    """Build a per-column summary table for ``df``.

    Every column gets a heuristic type label (``geolocation``, ``numeric``,
    ``date``, ``text``, ``binary``, ``category`` or ``multicategory``), its
    number of unique values, a sample of those values and its share of
    missing entries.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to describe. Only this frame is inspected.

    Returns
    -------
    pd.DataFrame
        Indexed by the columns of ``df`` with the columns ``type``, ``n_uniq``,
        ``multi_uniq``, ``values`` and ``null_ratio``. ``'---'`` marks a field
        that does not apply to the column.
    """
    info_df = pd.DataFrame(columns=['type', 'n_uniq', 'multi_uniq', 'values', 'null_ratio'], index=df.columns)

    for column in df.columns:
        series = df[column]
        null_ratio = series.isna().sum() / len(series)
        n_uniq = series.nunique()
        uniq = series.unique()
        column_type = series.dtype
        type_name = ''
        values = '---'
        multi_n_uniq = '---'

        # High-cardinality columns are classified by name or dtype first.
        if n_uniq > 100:
            if column in ['latitude', 'longitude']:
                type_name = 'geolocation'
            elif column_type in ['float64', 'int64']:
                type_name = 'numeric'
            elif column_type in ['datetime64[ns]']:
                type_name = 'date'
            elif column in ['description']:
                type_name = 'text'
            # Ranged types are summarised by [min, max]; text keeps '---'.
            if type_name in ('geolocation', 'numeric', 'date'):
                values = [series.min(), series.max()]

        if type_name == '':
            is_multi = False
            if column_type == 'object':
                # Split list-like strings ("[a,b]") into their tokens. Values
                # that are not non-empty strings (None, NaN, '') contribute
                # their str() form as a single token instead of raising.
                tokens = []
                for value in uniq:
                    if isinstance(value, str) and value:
                        tokens.extend(value.strip('[]').split(','))
                    else:
                        tokens.append(str(value))
                multi_uniq = np.unique(tokens)
                # NOTE(review): the "- 1" presumably discounts the token that
                # stands for a missing value — confirm against the data.
                multi_n_uniq = len(multi_uniq) - 1
                # Fewer distinct tokens than distinct raw values means the raw
                # values are combinations of a smaller token vocabulary.
                is_multi = len(uniq) > len(multi_uniq)

            if n_uniq <= 2:
                type_name = 'binary'
                values = uniq
            elif is_multi:
                type_name = 'multicategory'
                values = multi_uniq if multi_n_uniq <= 10 else np.append(multi_uniq[:10], '...')
            else:
                type_name = 'category'
                # Show at most ten values; '...' signals truncation.
                values = uniq if n_uniq <= 10 else np.append(uniq[:10], '...')

        info_df.loc[column] = [type_name, n_uniq, multi_n_uniq, values, null_ratio]
    return info_df

In [None]:
# Number of distinct values of brand + model + generation combined with `+`.
# NOTE(review): presumably these are string columns, so `+` concatenates without
# a separator and rows with a missing part are not counted — confirm.
(train_df.brand + train_df.model + train_df.generation).nunique()

In [None]:
info_df = get_info(train_df)
# Lift the row and column-width limits only while rendering this table, so the
# global pandas display settings are left as they were for the rest of the notebook.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', 1000):
    display(info_df)

In [None]:
# Reset only the two display options this notebook touches;
# `reset_option('all')` would also reset every unrelated pandas option.
pd.reset_option('display.max_rows')
pd.reset_option('display.max_colwidth')

In [None]:
# Split the columns of `info_df` between three people: a fixed hand-picked list,
# every column labelled 'multicategory', and whatever remains.
for_vanya = [
    'generation', 'modification', 'equipment', 'description',
    'audiosistema', 'diski', 'electropodemniki', 'fary', 'salon',
    'upravlenie_klimatom', 'usilitel_rul',
    'audiosistema_mult', 'shini_i_diski_mult',
]
for_sasha = info_df.index[info_df['type'] == 'multicategory'].tolist()
remaining_columns = info_df.index.difference(for_vanya).difference(for_sasha)
for_maxim = remaining_columns.tolist()
for_dict = {'Ваня': for_vanya, 'Саша': for_sasha, 'Максим': for_maxim}

In [None]:
# Print the per-person column assignment in compact form.
pprint(for_dict, compact=True)

Колонки с типом **datetime64[ns]**:
- 'start_date' - дата открытия объявления
- 'close_date' - дата закрытия объявления
- 'sale_end_date' - дата продажи авто дилером

In [None]:
# Earliest/latest value, covered range, non-null count and non-null share per date column.
# 'min'/'max' are passed by name: handing the Python builtins to `agg` relies on
# pandas silently swapping them for Series.min/Series.max, which is deprecated.
date_columns = ['start_date', 'close_date', 'sale_end_date']
funcs = ['min', 'max', lambda x: x.max() - x.min(), 'count', lambda x: x.count() / len(x)]
train_df[date_columns].agg(funcs).set_index(pd.Index(['start date', 'end date', 'range', 'count', 'ratio']))

In [None]:
# Check that every row has either both advert dates or a dealer sale date.
# Columns are combined by name in one vectorised expression instead of a
# row-wise `apply` with positional `x[0]` lookups on a label-indexed Series.
dates_present = train_df[['start_date', 'close_date', 'sale_end_date']].notnull()
((dates_present['start_date'] & dates_present['close_date']) | dates_present['sale_end_date']).all()

Все строки относятся либо к объявлениям, либо к продажам дилеров.

Разделим train_df на данные дилеров и объявления.

In [None]:
# Rows with a sale end date go to the dealer frame, all others to the advert frame.
is_dealer = train_df['sale_end_date'].notna()

dealer_df = train_df.loc[is_dealer]
advert_df = train_df.loc[~is_dealer]

In [None]:
# (rows, columns) of the dealer and advert subsets.
dealer_df.shape, advert_df.shape

In [None]:
# Columns holding a single distinct value (NaN counted as a value) in the dealer subset.
dealer_df.columns[dealer_df.nunique(dropna=False).eq(1)]

In [None]:
# Columns holding a single distinct value (NaN counted as a value) in the advert subset.
advert_df.columns[advert_df.nunique(dropna=False).eq(1)]

In [None]:
def get_multicat_info(values: np.array, column: str, dfs: list, names: list):
    """Tabulate how often each token combination of a multi-value column occurs.

    A ``CountVectorizer`` is fitted on ``values`` after stripping surrounding
    square brackets and splitting on commas. Every distinct token-count vector
    found in ``values`` becomes one row of the result. Tokens that are numeric
    strings are replaced in the row labels by the entry of the module-level
    ``options`` table with that integer id.

    Parameters
    ----------
    values : np.array
        Raw column values used to fit the vocabulary and list the combinations.
    column : str
        Name of the column read from each frame in ``dfs``.
    dfs : list
        Frames to compute the statistics on.
    names : list
        Labels for ``dfs``; used as the top level of the result's columns.

    Returns
    -------
    pd.DataFrame
        One row per combination and, for each name, the columns ``percentage``
        (rows whose token vector equals the combination) and
        ``multi percentage`` (rows where no token count falls exactly one
        below the combination's), both in percent rounded to 4 decimals.
    """
    def _strip_brackets(raw):
        return str(raw).strip('[]')

    def _split_tokens(text):
        return text.replace(', ', ',').split(',')

    def _sort_key(vector):
        # Order primarily by the number of tokens; the tiny weighted term only
        # breaks ties between combinations of equal size.
        return sum(vector) + np.dot(np.array(vector), 2 ** np.arange(len(vector))) * 1e-8

    cv = CountVectorizer(preprocessor=_strip_brackets, tokenizer=_split_tokens)
    cv.fit(values)

    distinct_vectors = np.unique(cv.transform(values).toarray(), axis=0)
    combos = cv.inverse_transform(sorted(distinct_vectors, key=_sort_key))

    # Numeric tokens are looked up in `options` for the row labels.
    flag = any(token.isnumeric() for combo in combos for token in combo)
    if flag:
        options_index = [
            ', '.join([options.loc[int(token)].item() if token.isnumeric() else token for token in combo])
            for combo in combos
        ]

    index = [', '.join(combo) for combo in combos]
    row_labels = options_index if flag else index
    result_info = pd.DataFrame(index=row_labels)

    stat_columns = ['total', 'percentage', 'multi total', 'multi percentage']
    for data, name in zip(dfs, names):
        multicat_info = pd.DataFrame(columns=stat_columns, index=row_labels)
        transformed_column = cv.transform(data[column]).toarray()
        n_rows = data.shape[0]

        for position, multicat in enumerate(index):
            combo_vector = cv.transform([multicat]).toarray()[0]
            # Rows whose token vector equals the combination.
            total = (transformed_column == combo_vector).all(axis=1).sum()
            # Rows where no count is exactly one below the combination's count.
            multi_total = ((transformed_column - combo_vector) != -1).all(axis=1).sum()
            multicat_info.loc[row_labels[position]] = [
                total,
                np.round(total / n_rows * 100, 4),
                multi_total,
                np.round(multi_total / n_rows * 100, 4),
            ]

        multicat_info.columns = pd.MultiIndex.from_product([[name], multicat_info.columns])
        result_info = pd.concat([result_info, multicat_info], axis=1)

    result_info.columns = pd.MultiIndex.from_tuples(result_info.columns)
    # Only the percentage columns are reported.
    return result_info.drop(['total', 'multi total'], axis=1, level=1)

In [None]:
damage_features = ['Передняя левая фара', 'Передняя правая фара', 'Правое зеркало',
          'Левое зеркало', 'Капот', 'Заднее правое крыло', 'Заднее левое крыло',
          'Передняя левая дверь', 'Крыша', 'Переднее правое крыло',
          'Заднее стекло', 'Задний бампер', 'Передняя правая дверь',
          'Задняя правая фара', 'Дверь багажника', 'Переднее левое крыло',
          'Лобовое стекло', 'Задняя правая дверь', 'Передний бампер',
          'Задняя левая фара', 'Задняя левая дверь']

# Every feature is summarised over the same four frames.
report_dfs = [train_df, dealer_df, advert_df, test_df]
report_names = ['train_df', 'dealer_df', 'advert_df', 'test_df']

outs = {}
dict_info = {}

for name in damage_features:
    outs[name] = widgets.Output()
    # Fit on the values seen in either train or test so both share one vocabulary.
    seen_values = pd.unique(np.concatenate([train_df[name].unique(), test_df[name].unique()]))
    dict_info[name] = get_multicat_info(seen_values, name, dfs=report_dfs, names=report_names)

# One accordion section per feature, titled with the feature name and holding its table.
accordion = widgets.Accordion(children=list(outs.values()))

for i, name in enumerate(damage_features):
    accordion.set_title(i, name)
    with outs[name]:
        display(dict_info[name])

# Start with every section collapsed. The accordion is not displayed here.
accordion.selected_index = None

In [None]:
# Write the accordion widget to the HTML file 'damage_features.html'.
embed_minimal_html('damage_features.html', views=[accordion], title='damage_features report export')

In [None]:
# Render the exported HTML file inline.
display(HTML(filename='./damage_features.html'))

In [None]:
component_features = ['aktivnaya_bezopasnost_mult', 'electroprivod_mult', 'fary_mult',
          'multimedia_navigacia_mult', 'obogrev_mult', 'pamyat_nastroek_mult',
          'podushki_bezopasnosti_mult', 'pomosh_pri_vozhdenii_mult',
          'protivoygonnaya_sistema_mult', 'salon_mult',
          'upravlenie_klimatom_mult']

# Every feature is summarised over the same four frames.
report_dfs = [train_df, dealer_df, advert_df, test_df]
report_names = ['train_df', 'dealer_df', 'advert_df', 'test_df']

outs = {}
dict_info = {}

for name in component_features:
    outs[name] = widgets.Output()
    # Fit on the values seen in either train or test so both share one vocabulary.
    seen_values = pd.unique(np.concatenate([train_df[name].unique(), test_df[name].unique()]))
    dict_info[name] = get_multicat_info(seen_values, name, dfs=report_dfs, names=report_names)

# One accordion section per feature, titled with the feature name and holding its table.
accordion = widgets.Accordion(children=list(outs.values()))

for i, name in enumerate(component_features):
    accordion.set_title(i, name)
    with outs[name]:
        display(dict_info[name])

# Start with every section collapsed. The accordion is not displayed here.
accordion.selected_index = None

In [None]:
# Write the accordion widget to the HTML file 'component_features.html'.
embed_minimal_html('component_features.html', views=[accordion], title='component_features report export')

In [None]:
# Render the exported HTML file inline.
display(HTML(filename='./component_features.html'))

In [None]:
# Summary for one column, with the combinations taken from train values only
# and statistics computed on the train, dealer and advert frames (no test frame).
get_multicat_info(train_df['protivoygonnaya_sistema_mult'].unique(), 'protivoygonnaya_sistema_mult', dfs=[train_df, dealer_df, advert_df], names=['train_df', 'dealer_df', 'advert_df'])

In [None]:
# from dataprep.eda import plot, plot_correlation, plot_missing, plot_diff, create_report

# create_report(dealer_df.drop(['start_date', 'close_date', 'Заднее стекло'], axis=1), progress=False).save('report_dealer_df.html')