In [6]:
import datetime

import pandas as pd
import warnings
import dill
import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector
from sklearn.metrics import accuracy_score

In [2]:
# Feature selection: drop the columns the model does not use
def filter_data(df):
    """Return a new frame without the columns excluded from modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without any of the columns listed in ``columns_to_drop``.
        Listed columns that are absent from ``df`` are skipped silently.
    """
    columns_to_drop = [
        'id',
        'url',
        'region',
        'region_url',
        'price',
        'manufacturer',
        'image_url',
        'description',
        'posting_date',
        'lat',
        'long'
    ]
    # errors='ignore': frames passed at prediction time may not carry every
    # one of these columns, and a missing column must not abort the pipeline.
    return df.drop(columns=columns_to_drop, errors='ignore')

# Outlier handling for the `year` feature
def clean_year(df):
    """Clamp outliers in the ``year`` column to the 1.5 * IQR fences.

    The fences are computed from the ``year`` values of the frame passed in.
    Values below the lower fence are replaced by the rounded lower fence and
    values above the upper fence by the rounded upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with clamped ``year`` values. If ``year`` holds no
        non-missing values the copy is returned unchanged.
    """
    df_clean = df.copy()
    q25 = df_clean['year'].quantile(0.25)
    q75 = df_clean['year'].quantile(0.75)

    # With no non-missing years (empty frame, or a single row whose year is
    # missing) the quantiles are NaN and round() below would raise ValueError.
    if pd.isna(q25) or pd.isna(q75):
        return df_clean

    iqr = q75 - q25
    boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
    # The comparison uses the exact fence; the replacement is the rounded fence.
    df_clean.loc[df_clean['year'] < boundaries[0], 'year'] = round(boundaries[0])
    df_clean.loc[df_clean['year'] > boundaries[1], 'year'] = round(boundaries[1])
    return df_clean

# Feature engineering: derive new columns from `model` and `year`
def create_features(df):
    """Return a copy of ``df`` with ``short_model`` and ``age_category`` added.

    ``short_model`` is the lower-cased first space-separated token of
    ``model``; values that are not exactly of type ``str`` are passed through
    unchanged. ``age_category`` is ``'new'`` for ``year > 2013``, ``'old'``
    for ``year < 2006`` and ``'average'`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``model`` and ``year`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The enriched copy.
    """
    def first_word(model_name):
        if type(model_name) == str:
            return model_name.lower().split(' ')[0]
        # Non-string entries (e.g. missing values) are kept as they are.
        return model_name

    def age_bucket(year):
        if year > 2013:
            return 'new'
        if year < 2006:
            return 'old'
        # Reached for 2006..2013, and for NaN because both comparisons are False.
        return 'average'

    enriched = df.copy()
    enriched.loc[:, 'short_model'] = enriched['model'].apply(first_word)
    enriched.loc[:, 'age_category'] = enriched['year'].apply(age_bucket)
    return enriched

# Numeric columns: fill missing values with the median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# to the legacy one so the notebook runs on both old and new installations.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', one_hot_encoder)
])

# Route columns to a transformer by dtype: int64/float64 -> numeric branch,
# object -> categorical branch.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, make_column_selector(dtype_include=['int64', 'float64'])),
    ('categorical', categorical_transformer, make_column_selector(dtype_include=['object']))
])

# Load the previously saved estimator; it is used as the 'classifier' step of
# the pipeline built further down.
# NOTE(review): dill.load, like pickle.load, can execute arbitrary code while
# deserialising — only load files that come from a trusted source.
with open('models/best_model.pkl', 'rb') as file:
    model = dill.load(file)

In [4]:
# Suppress all warnings from this point on.
warnings.simplefilter(action='ignore')

# Read the dataset and separate the target column from the features.
df = pd.read_csv('data/main_dataset.csv')
y = df['price_category']
X = df.drop(columns=['price_category'])

# Full pipeline: column filtering -> year outlier clamping -> feature
# engineering -> dtype-based preprocessing -> classifier.
pipe = Pipeline([
    ('filter', FunctionTransformer(filter_data)),
    ('clean', FunctionTransformer(clean_year)),
    ('new_feats', FunctionTransformer(create_features)),
    ('preprocessor', preprocessor),
    ('classifier', model),
])
pipe.fit(X, y)

In [7]:
# Predictions on the data the pipeline was fitted on. The resulting accuracy
# is optimistic, so it is kept only as a reference value.
y_pred = pipe.predict(X)
train_score = accuracy_score(y, y_pred)

# `score` is later saved as the model's accuracy, so estimate it on held-out
# folds instead of the training data. cross_val_score fits clones of `pipe`,
# which leaves the already fitted `pipe` untouched.
cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
score = cv_scores.mean()

In [9]:

# Build the payload BEFORE opening the target file: mode 'wb' truncates the
# file as soon as it is opened, so an error while evaluating the dict (for
# example `score` or `pipe` being undefined after a partial run) would
# otherwise wipe a previously saved artifact.
payload = {
    'model': pipe,
    'metadata': {
        'name': 'Car price prediction model',
        'author': 'Yaroslav Kryazhev',
        'version': 1,
        'date': datetime.datetime.now(),
        # Class name of the estimator in the final pipeline step.
        'type': type(pipe.named_steps['classifier']).__name__,
        'accuracy': score
    }
}

# Serialise the fitted pipeline together with its metadata using dill.
with open('models/price_category_predict_pipe.pkl', 'wb') as file:
    dill.dump(payload, file)