In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

from sklearn.cluster import DBSCAN
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.cluster import DBSCAN

In [None]:
%matplotlib inline

In [None]:
# Load the training set from its Feather file.
train = pd.read_feather(path='../data/project_train.f')

In [None]:
# Boolean row mask: True where `actual_price` is present.
DEALER = train.actual_price.notna()
# Keep the same flag as a column of the frame.
train['is_dealer'] = DEALER

In [None]:
# Lookup table with option names; preview the first rows.
options = pd.read_csv(filepath_or_buffer='../data/option_names.csv')
options.head(5)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Report the frame dimensions (rows, columns).
n_rows, n_cols = train.shape
'Число записей: {}, колонок: {}'.format(n_rows, n_cols)

In [None]:
# Column dtypes and non-null counts of the frame as loaded.
train.info()

In [None]:
# Columns describing car options.
option_cols = [
    'audiosistema',
    'diski',
    'electropodemniki',
    'fary',
    'salon',
    'upravlenie_klimatom',
    'usilitel_rul',
]
# Flag-like columns describing the car's history.
categories = [
    'crashes',
    'is_taxi',
    'is_pledged',
    'is_restrictions',
    'is_carsharing',
]

# Every column that has to be converted to an integer dtype.
int_cols = [*option_cols, *categories]

In [None]:
# Cast option/flag columns to pandas' nullable Int32 dtype.
# Whole-column assignment is used on purpose: `train.loc[:, col] = ...`
# writes into the existing column and can keep its original dtype,
# which silently turns the cast into a no-op.
for col in int_cols:
    train[col] = train[col].astype('Int32')

In [None]:
# Store the listed price as a floating-point column.
train['price'] = train['price'].astype('float64')

In [None]:
# Parse each date column and keep only the calendar date (datetime.date objects).
for date_col in ('start_date', 'close_date', 'sale_end_date'):
    train[date_col] = pd.to_datetime(train[date_col]).dt.date

In [None]:
# Re-inspect dtypes and non-null counts at this point of the notebook.
train.info()

## EDA

### Распределение цен на авто: у юзеров и у дилеров

In [None]:
# Long-form table of prices for rows selected by DEALER:
# `actual_price` is labelled 'close', `price` is labelled 'start'.
actual_price_dist = pd.DataFrame(
    {
        'price': train.loc[DEALER, 'actual_price'].copy(),
        'price_type': 'close'
    }
)

user_price_dist = pd.DataFrame(
    {
        'price': train.loc[DEALER, 'price'].copy(),
        'price_type': 'start'
    }
)

# Both halves carry the same row labels from `train`; reset the index so the
# stacked frame has unique labels (duplicate labels can break seaborn's
# long-form handling when `hue` is used).
price_dist = pd.concat([actual_price_dist, user_price_dist], axis=0, ignore_index=True)

In [None]:
# Price densities per price type: linear x-axis on the left, log x-axis on the right.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for axis, use_log in ((ax1, False), (ax2, True)):
    sns.kdeplot(data=price_dist, x='price', hue='price_type', log_scale=use_log, ax=axis)

In [None]:
# Price box plots per price type: linear y-axis on the left, log y-axis on the right.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

for axis in (ax1, ax2):
    b = sns.boxplot(data=price_dist, x='price_type', y='price', ax=axis)

# `b` now refers to the right-hand axes; switch it to a log scale.
b.set_yscale('log')

In [None]:
from scipy.stats import ttest_ind

# One-sided two-sample t-test on log prices: is the listed `price`
# greater on average than `actual_price` for the DEALER rows?
# NOTE(review): both samples come from the same rows, so a paired test
# (scipy.stats.ttest_rel) may be the better fit — confirm the intent.
dealer_rows = train[DEALER]
log_start_price = np.log(dealer_rows.price)
log_close_price = np.log(dealer_rows.actual_price)

t, p = ttest_ind(log_start_price, log_close_price, equal_var=True, alternative='greater')
t, p

### Даты

In [None]:
# Check column dtypes again before working with the date columns.
train.info()

In [None]:
# Listing lifetime in whole days (close date minus start date).
# The date columns hold plain `datetime.date` objects (object dtype), so they are
# converted back to datetime64 first: the subtraction then yields a proper
# timedelta64 series and `.dt.days` is guaranteed to be available.
train['duration'] = (
    pd.to_datetime(train.close_date) - pd.to_datetime(train.start_date)
).dt.days

Есть ошибки — пара объявлений, у которых дата закрытия раньше даты публикации

In [None]:
# Rows whose close date precedes the start date.
train.loc[train.duration.lt(0)]

#### Распределение длительности продажи

In [None]:
# Histogram of non-negative durations in 5-day bins.
valid_duration = train.loc[train.duration >= 0]
h = sns.histplot(data=valid_duration, x='duration', binwidth=5)

Посмотрим на количество открытых объявлений по месяцам

In [None]:
def get_month(x):
    """Return the `month` attribute of a date-like value."""
    return x.month


# Derive a month column for every date column.
for stage in ('start', 'close', 'sale_end'):
    train[f'{stage}_month'] = train[f'{stage}_date'].apply(get_month)

In [None]:
# Number of listings opened per calendar month.
# Months are derived as monthly periods so the bars can be ordered chronologically;
# without an explicit `order` the categories appear in first-appearance order.
start_period = pd.to_datetime(train.start_date).dt.to_period('M').dropna()


def _month_label(period):
    """Format a monthly period as '<month>-<year>' (same labels as before)."""
    return f"{period.month}-{period.year}"


h = sns.countplot(
    x=start_period.map(_month_label),
    order=[_month_label(period) for period in sorted(start_period.unique())],
);
plt.xticks(rotation=45);

С ноября по январь открывалось меньше объявлений

#### Бренды

In [None]:
# Count brand names that differ only by case or surrounding whitespace.
# Normalisation must happen AFTER taking the unique raw names: normalising first
# and then calling .unique() removes the duplicates, so the count is always 0.
unique_brands = pd.Series(train.brand.dropna().unique())
n_brand_duplicates = unique_brands.str.strip().str.lower().duplicated().sum()
f'{n_brand_duplicates} дубликатов '

In [None]:
# Listings per brand, most frequent brand first.
brand_order = train.brand.value_counts().index
f, ax = plt.subplots(1, 1, figsize=(7, 12))
g = sns.countplot(data=train, y='brand', order=brand_order, ax=ax)
plt.xticks(rotation=45);

Мы можем заметить, что есть очень редкие бренды (тесла, бентли и пр.) - посмотрим, сколько объявлений с 15 самыми редкими из них:

In [None]:
# Listings for the 15 least frequent brands.
rarest_brands = train.brand.value_counts().index[-15:]
f, ax = plt.subplots(1, 1, figsize=(7, 12))
g = sns.countplot(data=train, y='brand', order=rarest_brands, ax=ax)
plt.xticks(rotation=45);

#### Такси

Среди обычных пользовательских сделок нет информации по принадлежности машины к таксопарку

In [None]:
# How many rows outside the DEALER mask have a non-missing `is_taxi` value.
train.loc[~DEALER, 'is_taxi'].notna().sum()

In [None]:
# Re-store the flag as a float column.
train['is_taxi'] = train['is_taxi'].astype('float64')

In [None]:
# Distribution of the flag among DEALER rows, missing values included.
train.loc[DEALER, 'is_taxi'].value_counts(dropna=False)

Посмотрим на то, как принадлежность к таксопарку влияет на цену авто:

In [None]:
# Actual price split by the taxi flag (DEALER rows only).
dealer_rows = train.loc[DEALER]
sns.boxplot(data=dealer_rows, x='is_taxi', y='actual_price')

#### Машины под залогом

Среди пользовательских машин нет информации о нахождении под залогом

In [None]:
# How many rows outside the DEALER mask have a non-missing `is_pledged` value.
train.loc[~DEALER, 'is_pledged'].notna().sum()

In [None]:
# Re-store the flag as float, then show its distribution among DEALER rows.
train['is_pledged'] = train['is_pledged'].astype('float64')
train.loc[DEALER, 'is_pledged'].value_counts(dropna=False)

In [None]:
# Actual price split by the pledge flag (DEALER rows only).
dealer_rows = train.loc[DEALER]
sns.boxplot(data=dealer_rows, x='is_pledged', y='actual_price')

#### Машины с ограничениями

In [None]:
# How many rows outside the DEALER mask have a non-missing `is_restrictions` value.
train.loc[~DEALER, 'is_restrictions'].notna().sum()

In [None]:
# Re-store the flag as float, then show its distribution among DEALER rows.
train['is_restrictions'] = train['is_restrictions'].astype('float64')
train.loc[DEALER, 'is_restrictions'].value_counts(dropna=False)

In [None]:
# Actual price split by the restrictions flag (DEALER rows only).
dealer_rows = train.loc[DEALER]
sns.boxplot(data=dealer_rows, x='is_restrictions', y='actual_price')

#### Машины из каршеринга

In [None]:
# How many rows outside the DEALER mask have a non-missing `is_carsharing` value.
train.loc[~DEALER, 'is_carsharing'].notna().sum()

In [None]:
# Re-store the flag as float, then show its distribution among DEALER rows.
train['is_carsharing'] = train['is_carsharing'].astype('float64')
train.loc[DEALER, 'is_carsharing'].value_counts(dropna=False)

In [None]:
# Actual price split by the carsharing flag (DEALER rows only).
dealer_rows = train.loc[DEALER]
sns.boxplot(data=dealer_rows, x='is_carsharing', y='actual_price')

#### Crashes

Пользователи не указывают информацию об авариях авто:

In [None]:
# How many rows outside the DEALER mask have a non-missing `crashes` value.
train.loc[~DEALER, 'crashes'].notna().sum()

In [None]:
# Re-store the crash count as float, then show its distribution among DEALER rows.
train['crashes'] = train['crashes'].astype('float64')
train.loc[DEALER, 'crashes'].value_counts(dropna=False)

In [None]:
# Crash counts for DEALER rows, with missing values shown as the sentinel -1:
# frequency of each value on top, actual price per value below.
crashes_filled = train.loc[DEALER, 'crashes'].fillna(-1)
dealer_actual_price = train.loc[DEALER, 'actual_price']

f, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))
sns.countplot(x=crashes_filled, ax=ax1)
sns.boxplot(x=crashes_filled, y=dealer_actual_price, ax=ax2)


In [None]:
# Annotated correlation matrix of crash count vs. actual price.
crash_price_corr = train.loc[:, ['crashes', 'actual_price']].corr()
sns.heatmap(crash_price_corr, annot=True)

#### Пробег

In [None]:
# Correlation between mileage and the actual price.
train.loc[:, ['mileage', 'actual_price']].corr()

In [None]:
# Correlation between mileage and the listed price.
train.loc[:, ['mileage', 'price']].corr()

In [None]:
# Correlation between inverse mileage and the actual price.
# Non-positive mileage is masked first: 1 / 0 would produce `inf`, and a single
# infinite value turns the whole correlation into NaN.
positive_mileage = train.mileage.where(train.mileage > 0)
pd.concat([1 / positive_mileage, train.actual_price], axis=1).corr()

In [None]:
# Scatter of log-mileage against log-actual-price.
log_mileage = np.log(train.mileage)
log_actual_price = np.log(train.actual_price)
sns.scatterplot(x=log_mileage, y=log_actual_price)

#### Геолокация

In [None]:
# Coordinates-only view of the data.
geo = train.loc[:, ['latitude', 'longitude']]

In [None]:
# Reference table of Russian towns, cached locally after the first download.
url = ("https://raw.githubusercontent.com/"
      "epogrebnyak/ru-cities/main/assets/towns.csv")

# save file locally
p = Path("towns.csv")
if not p.exists():
    # Fail loudly on HTTP errors or a hung connection instead of caching
    # an error page as towns.csv.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    p.write_text(response.text, encoding="utf-8")

# read as dataframe
towns = pd.read_csv(p)
towns.sample(5)

In [None]:
# Preview the coordinate columns.
train.loc[:, ['latitude', 'longitude']].head()

In [None]:
# Sanity check: reverse-geocode one hard-coded coordinate pair.
# The geocoder is created in this cell so it runs on a fresh kernel; it used to
# rely on `geolocator` from a cell further down (NameError on Restart & Run All).
# NOTE(review): Nominatim expects an application-specific user agent — consider
# replacing this generic one.
geolocator = Nominatim(user_agent="geoapiExercises")

# Latitude & Longitude input
Latitude = "53.60782"
Longitude = "32.59346"

location = geolocator.reverse(Latitude + "," + Longitude)

# Display (reverse() returns None when nothing is found)
location.raw['address'] if location is not None else None

In [None]:
geolocator = Nominatim(user_agent="geoapiExercises")
# Throttle calls to one per second, as the public Nominatim service requires.
# With RateLimiter's defaults a request that keeps failing yields None
# instead of raising.
_rate_limited_reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)


def city_state_country(row):
    """Return the 'state' of the address found for a row's coordinates.

    Parameters
    ----------
    row : mapping with 'latitude' and 'longitude' entries
        Typically one row of a DataFrame passed by ``apply(..., axis=1)``.

    Returns
    -------
    str or float
        The ``state`` field of the reverse-geocoded address, or ``np.nan`` when
        a coordinate is missing, nothing is found, or the address has no
        ``state`` field.
    """
    latitude, longitude = row['latitude'], row['longitude']
    # Do not query the service with missing coordinates.
    if pd.isna(latitude) or pd.isna(longitude):
        return np.nan

    location = _rate_limited_reverse(f"{latitude}, {longitude}", exactly_one=True)
    # No match (or a swallowed service error) comes back as None.
    if location is None:
        return np.nan

    return location.raw.get('address', {}).get('state', np.nan)


In [None]:
# Reverse-geocode the DEALER rows and store the result in a new `region` column.
# `.loc` is required: `train[DEALER, 'region'] = ...` treats the tuple as a single
# column key and fails instead of selecting rows by the mask.
train.loc[DEALER, 'region'] = train.loc[DEALER].apply(city_state_country, axis=1)

In [None]:
# Plot the listings' coordinates (longitude on x, latitude on y).
sns.scatterplot(data=geo, x='longitude', y='latitude')

Попробую кластеризовать геоданные

In [None]:
# Density-based clustering model (not fitted in this cell).
# With min_samples=1 every point is a core point, so no point is labelled as noise.
# NOTE(review): eps=1 is presumably in raw coordinate units (degrees) — confirm.
dbscan = DBSCAN(eps=1, min_samples=1)