In [1]:
import os

import numpy as np
import pandas as pd

import joblib

from typing import Dict

import yaml
import json

import warnings
warnings.filterwarnings("ignore")

In [2]:
config_path = '../config/params.yml'
# Parse the YAML config inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left to the garbage collector.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Sections of the config used by the rest of the notebook.
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']


# Load the JSON stored at preproc['unique_values_path']; the same file is read
# by check_columns_evaluate, which uses its keys as the expected column order.
column_sequence_path = preproc['unique_values_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

In [3]:
# Display the 'preprocessing' section of the config that is later unpacked into pipeline_preprocess.
preproc

{'map_bins_columns': {'age': [35, 60],
  'MonthlyIncome': [4000, 8000],
  'NumberOfDependents': [0, 6]},
 'drop_columns': ['id'],
 'train_path': '../data/raw/train.csv',
 'unique_values_path': '../data/processed/unique_values.json',
 'train_path_proc': '../data/processed/train.csv',
 'test_path_proc': '../data/processed/test.csv',
 'test_size': 0.25,
 'target_column': 'SeriousDlqin2yrs',
 'random_state': 10}

# Load test data

In [4]:
# Project-relative location of the test file, in the same '../data/...' style
# as the paths in the config, so the notebook does not depend on one user's
# absolute directory layout.
# NOTE(review): assumes the notebook runs one level below the project root,
# like '../config/params.yml' does - confirm.
test_data_path = '../data/check/test.csv'
data_test = pd.read_csv(test_data_path)
data_test.head(4)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,Revolving,age,NumberOfTime30,DebtRatio,MonthlyIncome,NumberOfOpen,NumberOfTimes90,NumberRealEstate,NumberOfTime60,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0


In [5]:
# Give the 'Unnamed: 0' column read from the CSV a meaningful name ('id').
data_test = data_test.rename({'Unnamed: 0': 'id'}, axis='columns')

In [7]:
# Fill missing 'MonthlyIncome' values with the mean of the values present in this frame.
# NOTE(review): the mean comes from the test file itself - confirm this matches how train was imputed.
mean_M = data_test['MonthlyIncome'].mean()
data_test = data_test.assign(MonthlyIncome=data_test['MonthlyIncome'].fillna(mean_M))

In [8]:
# Fill missing 'NumberOfDependents' values with the mean of the values present in this frame.
# NOTE(review): the mean comes from the test file itself - confirm this matches how train was imputed.
mean_N = data_test['NumberOfDependents'].mean()
data_test = data_test.assign(
    NumberOfDependents=data_test['NumberOfDependents'].fillna(mean_N)
)

In [9]:
# Remove the target column before preprocessing. errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
data_test = data_test.drop(columns=['SeriousDlqin2yrs'], errors='ignore')

In [10]:
# Check column dtypes and non-null counts after renaming, imputation and dropping the target.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101503 entries, 0 to 101502
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  101503 non-null  int64  
 1   Revolving           101503 non-null  float64
 2   age                 101503 non-null  int64  
 3   NumberOfTime30      101503 non-null  int64  
 4   DebtRatio           101503 non-null  float64
 5   MonthlyIncome       101503 non-null  float64
 6   NumberOfOpen        101503 non-null  int64  
 7   NumberOfTimes90     101503 non-null  int64  
 8   NumberRealEstate    101503 non-null  int64  
 9   NumberOfTime60      101503 non-null  int64  
 10  NumberOfDependents  101503 non-null  float64
dtypes: float64(4), int64(7)
memory usage: 8.5 MB


In [11]:
# Display the cleaned test frame.
data_test

Unnamed: 0,id,Revolving,age,NumberOfTime30,DebtRatio,MonthlyIncome,NumberOfOpen,NumberOfTimes90,NumberRealEstate,NumberOfTime60,NumberOfDependents
0,1,0.885519,43,0,0.177513,5700.00000,4,0,0,0,0.000000
1,2,0.463295,57,0,0.527237,9141.00000,15,0,4,0,2.000000
2,3,0.043275,59,0,0.687648,5083.00000,12,0,1,0,2.000000
3,4,0.280308,38,1,0.925961,3200.00000,7,0,2,0,0.000000
4,5,1.000000,27,0,0.019917,3865.00000,4,0,0,0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
101498,101499,0.282653,24,0,0.068522,1400.00000,5,0,0,0,0.000000
101499,101500,0.922156,36,3,0.934217,7615.00000,8,0,2,0,4.000000
101500,101501,0.081596,70,0,836.000000,6855.03559,3,0,0,0,0.769046
101501,101502,0.335457,56,0,3568.000000,6855.03559,8,0,2,1,3.000000


# Preprocessing

In [14]:
def transform_types(data: pd.DataFrame, change_type_columns: dict) -> pd.DataFrame:
    """
    Cast the listed columns to the requested dtypes.

    :param data: dataset
    :param change_type_columns: mapping {column name: target dtype}
    :return: dataset with the listed columns converted; a failed conversion
        raises (errors="raise")
    """
    converted = data.astype(dtype=change_type_columns, errors="raise")
    return converted


def get_bins(
    data: float, first_val: float, second_val: float
) -> str:
    """
    Map a single numeric value to one of three bin labels.

    :param data: scalar value to classify (built-in or NumPy integer/float)
    :param first_val: inclusive upper bound of the "small" bin
    :param second_val: inclusive upper bound of the "medium" bin
    :return: "small" if data <= first_val, "medium" if
        first_val < data <= second_val, otherwise "large"
    :raises AssertionError: if data is not an integer or float scalar
    """
    # NumPy scalars such as np.int64 or np.float32 are not subclasses of the
    # built-in int / float, so they are accepted explicitly.
    assert isinstance(
        data, (int, float, np.integer, np.floating)
    ), "Проблема с типом данных в признаке"

    if data <= first_val:
        return "small"
    if first_val < data <= second_val:
        return "medium"
    # Everything else lands here, including NaN (all of its comparisons are False).
    return "large"


def check_columns_evaluate(data: pd.DataFrame, unique_values_path: str) -> pd.DataFrame:
    """
    Verify that the frame has exactly the columns recorded in a JSON file and
    return it with the columns reordered to follow the JSON key order.

    :param data: dataset to validate
    :param unique_values_path: path to a JSON object whose keys are the
        expected column names
    :return: the dataset with its columns in the order of the JSON keys
    :raises AssertionError: if the column sets differ; the message lists the
        missing and the unexpected columns
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # Materialise the keys as a list so the frame is indexed with a plain
    # list of labels rather than a dict view.
    column_sequence = list(unique_values)

    expected = set(column_sequence)
    actual = set(data.columns)
    missing = sorted(expected - actual, key=str)
    unexpected = sorted(actual - expected, key=str)
    assert expected == actual, (
        f"Разные признаки (missing: {missing}, unexpected: {unexpected})"
    )
    return data[column_sequence]

In [13]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs):
    """
    Preprocessing pipeline: drop columns, add bin features, cast text columns
    to the category dtype.

    :param data: dataset
    :param flg_evaluate: evaluate-mode flag; accepted but not read by this body
    :param kwargs: must contain "drop_columns" (columns to remove) and
        "map_bins_columns" ({column: [first_val, second_val]}); other keys
        are ignored
    :return: preprocessed dataset
    """
    # Columns listed for removal but absent from the frame are skipped
    # (errors="ignore").
    data = data.drop(kwargs["drop_columns"], axis=1, errors="ignore")

    # Add one "<column>_bins" label column per configured source column.
    bins_config = kwargs["map_bins_columns"]
    for column in bins_config.keys():
        data[f"{column}_bins"] = data[column].apply(
            lambda value, column=column: get_bins(
                value,
                first_val=bins_config[column][0],
                second_val=bins_config[column][1],
            )
        )

    # Every object-dtype column present at this point is cast to "category".
    object_columns = data.select_dtypes(["object"]).columns
    dict_category = dict.fromkeys(object_columns, "category")
    return transform_types(data=data, change_type_columns=dict_category)

In [15]:
# Show the first row as a {column: value} mapping.
dict(data_test.iloc[0])

{'id': 1.0,
 'Revolving': 0.88551908,
 'age': 43.0,
 'NumberOfTime30': 0.0,
 'DebtRatio': 0.177512717,
 'MonthlyIncome': 5700.0,
 'NumberOfOpen': 4.0,
 'NumberOfTimes90': 0.0,
 'NumberRealEstate': 0.0,
 'NumberOfTime60': 0.0,
 'NumberOfDependents': 0.0}

In [16]:
# Display the frame that is about to be passed to pipeline_preprocess.
data_test

Unnamed: 0,id,Revolving,age,NumberOfTime30,DebtRatio,MonthlyIncome,NumberOfOpen,NumberOfTimes90,NumberRealEstate,NumberOfTime60,NumberOfDependents
0,1,0.885519,43,0,0.177513,5700.00000,4,0,0,0,0.000000
1,2,0.463295,57,0,0.527237,9141.00000,15,0,4,0,2.000000
2,3,0.043275,59,0,0.687648,5083.00000,12,0,1,0,2.000000
3,4,0.280308,38,1,0.925961,3200.00000,7,0,2,0,0.000000
4,5,1.000000,27,0,0.019917,3865.00000,4,0,0,0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
101498,101499,0.282653,24,0,0.068522,1400.00000,5,0,0,0,0.000000
101499,101500,0.922156,36,3,0.934217,7615.00000,8,0,2,0,4.000000
101500,101501,0.081596,70,0,836.000000,6855.03559,3,0,0,0,0.769046
101501,101502,0.335457,56,0,3568.000000,6855.03559,8,0,2,1,3.000000


In [17]:
# Run the preprocessing pipeline on the test frame; the 'preprocessing' config
# section supplies the keyword settings (drop_columns, map_bins_columns, ...).
data_proc_test = pipeline_preprocess(data_test, **preproc)

In [19]:
# Check dtypes and non-null counts of the preprocessed frame, including the added *_bins columns.
data_proc_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101503 entries, 0 to 101502
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   Revolving                101503 non-null  float64 
 1   age                      101503 non-null  int64   
 2   NumberOfTime30           101503 non-null  int64   
 3   DebtRatio                101503 non-null  float64 
 4   MonthlyIncome            101503 non-null  float64 
 5   NumberOfOpen             101503 non-null  int64   
 6   NumberOfTimes90          101503 non-null  int64   
 7   NumberRealEstate         101503 non-null  int64   
 8   NumberOfTime60           101503 non-null  int64   
 9   NumberOfDependents       101503 non-null  float64 
 10  age_bins                 101503 non-null  category
 11  MonthlyIncome_bins       101503 non-null  category
 12  NumberOfDependents_bins  101503 non-null  category
dtypes: category(3), float64(4), int64(6)
memory 

In [None]:
# Leftover cleanup cell: drop a column named '' if one exists. errors='ignore'
# makes this a no-op instead of a KeyError when no such column is present, so
# "Run All" does not stop here.
data_proc_test = data_proc_test.drop(columns=[''], errors='ignore')

# Evaluate

In [20]:
# NOTE: joblib.load unpickles arbitrary Python objects - only load model files
# that come from a trusted source.
model = joblib.load(training['model_path'])

# Build the feature frame without any 'predict' column left by a previous run,
# so re-running this cell does not feed earlier predictions back to the model.
features_test = data_proc_test.drop(columns=['predict'], errors='ignore')
data_proc_test['predict'] = model.predict(features_test)

In [21]:
# Display the preprocessed test frame together with the 'predict' column.
data_proc_test

Unnamed: 0,Revolving,age,NumberOfTime30,DebtRatio,MonthlyIncome,NumberOfOpen,NumberOfTimes90,NumberRealEstate,NumberOfTime60,NumberOfDependents,age_bins,MonthlyIncome_bins,NumberOfDependents_bins,predict
0,0.885519,43,0,0.177513,5700.00000,4,0,0,0,0.000000,medium,medium,small,0
1,0.463295,57,0,0.527237,9141.00000,15,0,4,0,2.000000,medium,large,medium,0
2,0.043275,59,0,0.687648,5083.00000,12,0,1,0,2.000000,medium,medium,medium,0
3,0.280308,38,1,0.925961,3200.00000,7,0,2,0,0.000000,medium,small,small,1
4,1.000000,27,0,0.019917,3865.00000,4,0,0,0,1.000000,small,small,medium,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101498,0.282653,24,0,0.068522,1400.00000,5,0,0,0,0.000000,small,small,small,0
101499,0.922156,36,3,0.934217,7615.00000,8,0,2,0,4.000000,medium,medium,medium,1
101500,0.081596,70,0,836.000000,6855.03559,3,0,0,0,0.769046,large,medium,medium,0
101501,0.335457,56,0,3568.000000,6855.03559,8,0,2,1,3.000000,medium,medium,medium,1
