# Import

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, \
f1_score, log_loss,plot_confusion_matrix, confusion_matrix
from sklearn.preprocessing import label_binarize, LabelEncoder

import joblib

import optuna
from optuna.visualization.matplotlib import plot_param_importances
from optuna.visualization import plot_optimization_history

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)

In [2]:
# Load the project configuration and split it into the sections used below.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
config_path = 'E:/Pet_project/MLOps/Pet_MLOps/config/parameters.yml'
# The context manager closes the file handle as soon as parsing is done.
with open(config_path) as config_file:
    # NOTE(review): FullLoader can build more than plain data types; if the config is
    # plain YAML, yaml.safe_load is the safer choice — confirm before switching.
    config = yaml.load(config_file, Loader=yaml.FullLoader)

preprocess = config['preprocessing']
training = config['train']
evaluate = config['evaluate']


# Disabled draft (kept for reference): check the column sequence against train.
# NOTE(review): as written it would fail — `preproces` is a typo for `preprocess`
# and `json` is not imported in this notebook.
# column_sequence_path = preproces['unique_values_path']
# with open(column_sequence_path) as json_file:
#     column_sequence = json.load(json_file)

"# check columns with train\ncolumn_sequence_path = preproces['unique_values_path']\nwith open(column_sequence_path) as json_file:\n    column_sequence = json.load(json_file)"

In [3]:
# Lift the pandas display limits so that frames are rendered without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
# Read the dataset to score from the path configured under evaluate.predict_path.
data_test = pd.read_csv(filepath_or_buffer=evaluate['predict_path'])
# Preview the first five rows.
data_test.head(5)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz
0,tour_idynufedne,KOREA,25-44,Alone,0.0,1.0,Leisure and Holidays,Widlife Tourism,Others,Independent,No,No,No,No,No,No,No,7,4,Yes
1,tour_id9r3y5moe,UNITED KINGDOM,45-64,With Children,1.0,1.0,Leisure and Holidays,Conference Tourism,"Travel agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,7,0,Yes
2,tour_idf6itml6g,ITALY,25-44,With Spouse,1.0,1.0,Leisure and Holidays,Beach Tourism,"Travel agent, tour operator",Package Tour,Yes,Yes,No,Yes,No,No,No,0,6,Yes
3,tour_id99u4znru,KENYA,25-44,Alone,0.0,1.0,Other,Beach Tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,3,4,No
4,tour_idj4i9urbx,ZAMBIA,25-44,Alone,0.0,1.0,Business,Widlife Tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,6,0,No


# Preprocessing

In [5]:
def lower_case(data: pd.DataFrame, columns: list):
    '''
    Convert the string values of the given columns to lower case.
    The frame is modified in place; nothing is returned.
    :param data: dataset
    :param columns: names of the columns whose values are lower-cased
    '''
    for column_name in columns:
        lowered = data[column_name].str.lower()
        data[column_name] = lowered

def correct_mistake(data: pd.DataFrame, map_dict_mistake: dict):
    '''
    Replace values in the dataset according to a replacement dictionary.
    :param data: dataset (left untouched; a new frame is returned)
    :param map_dict_mistake: dictionary handed to ``DataFrame.replace``
        (presumably {column: {wrong_value: right_value}} — confirm against the config)
    :return: dataset with the replacements applied
    '''
    corrected = data.replace(to_replace=map_dict_mistake)
    return corrected

def filling_in_gaps(data:pd.DataFrame):
    '''
    Fill missing values in place.
    Numeric columns receive the column median; every other column receives
    the string 'None'. Only columns that actually contain gaps are touched.
    :param data: dataset (modified in place)
    :return: Series with the number of missing values left in each column
    '''
    columns_with_gaps = data.columns[data.isna().any()]

    for col in columns_with_gaps:
        # is_numeric_dtype covers every int/float width; comparing the dtype with
        # 'int' / 'float' only matches the platform default width, which sent
        # e.g. float32 columns down the string branch.
        if pd.api.types.is_numeric_dtype(data[col]):
            fill_value = data[col].median()
        else:
            fill_value = 'None'
        # Assign back instead of a chained inplace fillna: the chained form acts on
        # a temporary and does not update the frame under pandas Copy-on-Write.
        data[col] = data[col].fillna(fill_value)

    return data.isna().sum()

def func_map_values(data:pd.DataFrame, names_cols: dict):
    '''
    Map values in the dataset to new values.
    :param data: dataset (left untouched; a new frame is returned)
    :param names_cols: dictionary handed to ``DataFrame.replace``
    :return: dataset with the mapped values
    '''
    mapped = data.replace(to_replace=names_cols)
    return mapped

def symbol_correct(data:pd.DataFrame, names_cols: list):
    '''
    Normalise separators in string values: every space becomes "_" and every
    comma is removed. The frame is modified in place; nothing is returned.
    :param data: dataset
    :param names_cols: names of the columns whose values contain spaces and commas
    '''
    for col in names_cols:
        # Assign the result back to the frame: a chained
        # ``data[col].replace(..., inplace=True)`` works on a temporary Series and
        # leaves the frame unchanged under pandas Copy-on-Write.
        data[col] = (
            data[col]
            .replace(' ', '_', regex=True)
            .replace(',', '', regex=True)
        )
        
def correct_types(data: pd.DataFrame, change_type_columns: dict):
    """
    Cast columns to the requested data types.
    :param data: dataset (left untouched; a new frame is returned)
    :param change_type_columns: dictionary of column name -> target dtype
    :return: dataset with the converted columns; a failed cast raises
    """
    converted = data.astype(dtype=change_type_columns, errors="raise")
    return converted

In [6]:
def pipeline(data:pd.DataFrame, **kwargs):
    '''
    Run all preprocessing steps on the dataset and return the processed copy.
    The frame passed in is not modified.
    :param data: dataset
    :param kwargs: preprocessing settings; the keys read here are
        'drop_column', 'lower_columns', 'correct_values', 'map_func_for_columns',
        'columns_symbol_correct' and 'change_type_columns'
    :return: preprocessed dataset
    '''
    # Drop unused columns. A non-inplace drop yields a new frame, so the caller's
    # frame stays intact and the in-place helpers below work on our own copy.
    data = data.drop(columns=kwargs['drop_column'])

    # Bring string values to a single case
    lower_case(data=data, columns=kwargs['lower_columns'])

    # Fix misspelled values
    data = correct_mistake(data=data, map_dict_mistake=kwargs['correct_values'])

    # Fill missing values
    filling_in_gaps(data=data)

    # Apply the configured value mapping
    data = func_map_values(data=data, names_cols=kwargs['map_func_for_columns'])

    # Replace spaces with "_" and strip commas in the listed columns
    symbol_correct(data=data, names_cols=kwargs['columns_symbol_correct'])

    # Cast columns to the configured dtypes
    assert isinstance(
        kwargs["change_type_columns"], dict
    ), "Подайте тип данных в формате dict"
    data = correct_types(data=data, change_type_columns=kwargs['change_type_columns'])

    return data

In [7]:
# Preprocess the loaded frame with the settings from the 'preprocessing' config section.
# NOTE(review): the result is bound to the same name, so running this cell twice
# feeds the already-processed frame back into pipeline.
data_test = pipeline(data_test, **preprocess)
# Preview the first five processed rows.
data_test.head(5)

Unnamed: 0,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz
0,korea,25_to_44,Alone,0.0,1.0,Leisure_and_Holidays,Wildlife_Tourism,Others,Independent,0,0,0,0,0,0,0,7,4,1
1,united_kingdom,45_to_64,With_Children,1.0,1.0,Leisure_and_Holidays,Conference_Tourism,Travel_agent_tour_operator,Package_Tour,1,1,1,1,1,1,1,7,0,1
2,italy,25_to_44,With_Spouse,1.0,1.0,Leisure_and_Holidays,Beach_Tourism,Travel_agent_tour_operator,Package_Tour,1,1,0,1,0,0,0,0,6,1
3,kenya,25_to_44,Alone,0.0,1.0,Other,Beach_Tourism,Radio_TV_Web,Independent,0,0,0,0,0,0,0,3,4,0
4,zambia,25_to_44,Alone,0.0,1.0,Business,Wildlife_Tourism,Radio_TV_Web,Independent,0,0,0,0,0,0,0,6,0,0


# Evaluate

In [8]:
# Load the trained model saved at the path configured under train.model_path.
# NOTE(review): joblib.load unpickles the file — only load model files you trust.
saved_model = joblib.load(training['model_path'])
# Predict classes for the test dataset. A 'predict' column left over from an earlier
# run of this cell is excluded, so the model always sees the feature columns only.
data_test['predict'] = saved_model.predict(
    data_test.drop(columns=['predict'], errors='ignore')
)

In [9]:
# Class label -> integer code.
# NOTE(review): presumably mirrors the label encoding used at training time — confirm.
mapping = dict(
    High_Cost=0,
    Higher_Cost=1,
    Highest_Cost=2,
    Low_Cost=3,
    Lower_Cost=4,
    Normal_Cost=5,
)
# Inverse lookup: integer code -> class label.
map_reverse = {code: label for label, code in mapping.items()}

In [10]:
# Translate the values in 'predict' to class labels using the map_reverse lookup.
# NOTE(review): Series.map yields NaN for values that are not keys of the dict, so
# running this cell a second time (on already-translated labels) would blank the
# column — confirm the intended run order.
data_test['predict'] = data_test['predict'].map(map_reverse)
# Preview up to the first 20 rows together with the decoded prediction.
data_test.head(20)

Unnamed: 0,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz,predict
0,korea,25_to_44,Alone,0.0,1.0,Leisure_and_Holidays,Wildlife_Tourism,Others,Independent,0,0,0,0,0,0,0,7,4,1,Normal_Cost
1,united_kingdom,45_to_64,With_Children,1.0,1.0,Leisure_and_Holidays,Conference_Tourism,Travel_agent_tour_operator,Package_Tour,1,1,1,1,1,1,1,7,0,1,Higher_Cost
2,italy,25_to_44,With_Spouse,1.0,1.0,Leisure_and_Holidays,Beach_Tourism,Travel_agent_tour_operator,Package_Tour,1,1,0,1,0,0,0,0,6,1,High_Cost
3,kenya,25_to_44,Alone,0.0,1.0,Other,Beach_Tourism,Radio_TV_Web,Independent,0,0,0,0,0,0,0,3,4,0,Normal_Cost
4,zambia,25_to_44,Alone,0.0,1.0,Business,Wildlife_Tourism,Radio_TV_Web,Independent,0,0,0,0,0,0,0,6,0,0,Normal_Cost
5,uganda,45_to_64,Alone,0.0,1.0,Visiting_Friends_and_Relatives,Beach_Tourism,Friends_relatives,Independent,0,0,0,0,0,0,0,23,0,0,Lower_Cost
6,italy,45_to_64,With_Spouse_and_Children,1.0,2.0,Leisure_and_Holidays,Beach_Tourism,Travel_agent_tour_operator,Package_Tour,1,1,1,1,0,1,1,0,14,1,Higher_Cost
7,zambia,25_to_44,Alone,0.0,1.0,Business,Beach_Tourism,Friends_relatives,Independent,0,0,0,0,0,0,0,6,0,0,Normal_Cost
8,united_kingdom,45_to_64,With_Spouse,1.0,1.0,Leisure_and_Holidays,Wildlife_Tourism,Friends_relatives,Package_Tour,0,1,1,1,1,0,0,4,0,1,Higher_Cost
9,the_netherlands,25_to_44,Alone,0.0,1.0,Business,Wildlife_Tourism,Friends_relatives,Independent,0,0,0,0,0,0,0,2,0,0,Lower_Cost
