# Tanzania Tourism Prediction - Prognozy dotyczące turystyki w Tanzanii
<h2>Autorzy:</h2><br>
<ul>
<li>Piotr Janiszek 247678</li>
<li>Kacper Białek 247629</li>
<li>Franciszek Pawlak 247756</li>
<li>Michał Korblit 242427</li>
</ul>

In [None]:
import warnings

# Silence every Python warning for the rest of the notebook run.
warnings.filterwarnings("ignore")

# Switch read by the data-preparation cell: when True, 20% of the rows are
# held out there with train_test_split as a final validation set.
is_test_iteration = False


<h3>Imports</h3>

In [None]:
import numpy as np
import optuna
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter



<h3>Data loading</h3>


In [None]:
# Survey responses and the country -> region lookup table.
df_train = pd.read_csv('data/Train.csv')
df_regions = pd.read_csv('data/regions.csv')

# Misspelled or non-standard country names found in the survey, mapped to the
# spelling that should match the (upper-cased) names in the lookup table.
corrections = {
    'SWIZERLAND': 'SWITZERLAND',
    'MALT': 'MALTA',
    'BURGARIA': 'BULGARIA',
    'DRC': 'CONGO (DEMOCRATIC REPUBLIC OF THE)',
    'KOREA': 'SOUTH KOREA',
    'SWAZILAND': 'ESWATINI',
    'UKRAIN': 'UKRAINE',
    'TRINIDAD TOBACCO': 'TRINIDAD AND TOBAGO',
    'COMORO': 'COMOROS',
    'COSTARICA': 'COSTA RICA',
    'PHILIPINES': 'PHILIPPINES',
    'IVORY COAST': "CÔTE D'IVOIRE",
    'DJIBOUT': 'DJIBOUTI',
    'MORROCO': 'MOROCCO',
    'UNITED STATES OF AMERICA': 'UNITED STATES',
    'UAE': 'UNITED ARAB EMIRATES',
    'SCOTLAND': 'UNITED KINGDOM',
    'CAPE VERDE': 'CABO VERDE',
}
df_train['country'] = df_train['country'].replace(corrections)

# Keep only the join key and the sub-region (exposed as `region`), and
# upper-case the key so it can be compared with df_train['country'].
df_regions = (
    df_regions[['name', 'sub-region']]
    .rename(columns={'sub-region': 'region'})
)
df_regions['name'] = df_regions['name'].str.upper()

# Left join: countries without a match keep a NaN region. The helper key
# column from the lookup table is dropped afterwards.
df_train = (
    pd.merge(df_train, df_regions, how='left', left_on='country', right_on='name')
    .drop(columns=['name'])
)

print(df_train.columns)
print(df_regions.columns)


<h3>Data Cleaning</h3>

In [None]:
# Number of rows with at least one NaN, before and after filling the
# free-text `most_impressing` answer; both are printed in the summary below.
count = df_train.isna().any(axis=1).sum()
no_comment_rows = df_train['most_impressing'].isna()
df_train.loc[no_comment_rows, 'most_impressing'] = 'No comments'
count2 = df_train.isna().any(axis=1).sum()

# Cost per person, computed only where both head counts are known and the
# party is not empty (avoids division by zero).
people_total = df_train['total_male'] + df_train['total_female']
mask_valid = (
    df_train['total_male'].notna()
    & df_train['total_female'].notna()
    & (people_total != 0)
)
df_train.loc[mask_valid, 'total_cost_per_person'] = (
    df_train.loc[mask_valid, 'total_cost'] / people_total[mask_valid]
)
mean = df_train.loc[mask_valid, 'total_cost_per_person'].mean()
print("Średnia bez wierszy z zerową sumą:", mean)


# Walk over the rows that contain a NaN. When exactly one of the two head
# counts is missing, estimate it from the part of total_cost that is not
# explained by the known count (at the average cost per person).
incomplete_rows = df_train[df_train.isna().any(axis=1)]
for idx, rec in incomplete_rows.iterrows():
    male_missing = pd.isna(rec['total_male'])
    female_missing = pd.isna(rec['total_female'])
    if male_missing == female_missing:
        # Both known or both missing: nothing to infer in this pass.
        continue

    if male_missing:
        missing_col, known_col = 'total_male', 'total_female'
    else:
        missing_col, known_col = 'total_female', 'total_male'

    unexplained_cost = rec['total_cost'] - (rec[known_col] * mean)
    estimate = round(unexplained_cost / mean, 0)
    # A negative estimate means the known count already "overspends"; clamp to 0.
    if estimate < 0:
        estimate = 0
    df_train.loc[idx, missing_col] = estimate

# Second pass over a fresh snapshot: a party of exactly one person with no
# `travel_with` answer is recorded as travelling alone.
incomplete_rows = df_train[df_train.isna().any(axis=1)]
for idx, rec in incomplete_rows.iterrows():
    if not pd.isna(rec['travel_with']):
        continue
    if (rec['total_male'] + rec['total_female']) == 1:
        df_train.loc[idx, 'travel_with'] = 'Alone'


In [None]:
# Random forest imputation
# Recompute the per-person cost and its mean, now that the first cleaning
# pass has filled some of the head counts.
people_total = df_train['total_male'] + df_train['total_female']
mask_valid = (
    df_train['total_male'].notna()
    & df_train['total_female'].notna()
    & (people_total != 0)
)
df_train.loc[mask_valid, 'total_cost_per_person'] = (
    df_train.loc[mask_valid, 'total_cost'] / people_total[mask_valid]
)
mean = df_train.loc[mask_valid, 'total_cost_per_person'].mean()

# Full predictor set (includes the two head counts).
features = [
    'region', 'age_group', 'total_female', 'total_male',
    'purpose', 'main_activity', 'info_source', 'tour_arrangement',
    'package_transport_int', 'package_accomodation', 'package_food',
    'package_transport_tz', 'package_sightseeing', 'package_guided_tour',
    'package_insurance', 'night_mainland', 'night_zanzibar',
    'payment_mode', 'first_trip_tz', 'most_impressing', 'total_cost',
]

# Predictors that are one-hot encoded.
features_cat = [
    'region', 'age_group', 'purpose', 'main_activity', 'info_source',
    'tour_arrangement', 'package_transport_int', 'package_accomodation',
    'package_food', 'package_transport_tz', 'package_sightseeing',
    'package_guided_tour', 'package_insurance', 'payment_mode',
    'first_trip_tz', 'most_impressing',
]

# Model 1 - travel_with classifier restricted to two categories.
allowed_categories = ['Friends/Relatives', 'Children']
# Train only on rows whose travel_with is one of the allowed categories.
df_train_imp = df_train[df_train['travel_with'].isin(allowed_categories)].copy()
X_train = pd.get_dummies(df_train_imp[features], columns=features_cat)
y_train = df_train_imp['travel_with']
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Model 2 - same classifier over four categories.
allowed_categories2 = ['Children', 'Friends/Relatives', 'Spouse', 'Spouse and Children']
df_train_imp2 = df_train[df_train['travel_with'].isin(allowed_categories2)].copy()
X_train2 = pd.get_dummies(df_train_imp2[features], columns=features_cat)
y_train2 = df_train_imp2['travel_with']
rf2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf2.fit(X_train2, y_train2)

# Model 3 predictors: the full set without the two head counts (same order),
# with the same categorical columns.
features3 = [name for name in features if name not in ('total_female', 'total_male')]
features_cat3 = list(features_cat)

def designate_sex(row):
    """Return the sex of a solo traveller, or None for any other record.

    A record qualifies when `travel_with` is 'Alone' and the two head counts
    add up to exactly one. 'Male' is returned when `total_male` is 1,
    otherwise 'Female'.
    """
    if row['travel_with'] != 'Alone':
        return None
    if (row['total_male'] + row['total_female']) != 1:
        return None
    if row['total_male'] == 1:
        return 'Male'
    return 'Female'

# Label solo travellers with the sex implied by their head counts; every
# other row gets None and is left out of the training set.
df_train['gender'] = df_train.apply(designate_sex, axis=1)
has_gender = df_train['gender'].notna()
df_model = df_train[has_gender].copy()

# Predictors and target for the solo-traveller sex classifier (Model 3).
X_train3 = df_model[features3]
y_train3 = df_model['gender']

# Categorical columns are one-hot encoded (categories unseen at fit time are
# ignored at predict time); all remaining columns are passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), features_cat3)],
    remainder='passthrough',
)

classifier_step = RandomForestClassifier(n_estimators=100, random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier_step),
])

model_pipeline.fit(X_train3, y_train3)

# Model 4 - sex of the children in 'Spouse and Children' parties.
# Uses the same predictor lists as Model 3, copied under their own names.
features4 = list(features3)
features_cat4 = list(features_cat3)

# Head count of each sex minus one, floored at 0 (presumably the "minus one"
# accounts for the adult of that sex - confirm with the authors).
df_train['male_children'] = df_train['total_male'].apply(lambda n: max(n - 1, 0))
df_train['female_children'] = df_train['total_female'].apply(lambda n: max(n - 1, 0))

both_sexes_present = (df_train['total_male'] > 0) & (df_train['total_female'] > 0)
df_filtered = df_train[both_sexes_present].copy()
df_children_model = df_filtered[df_filtered['travel_with'] == 'Spouse and Children'].copy()

# Feature matrix and two-column target (male children, female children).
X_train_child = pd.get_dummies(df_children_model[features4], columns=features_cat4)
y_train_child = df_children_model[['male_children', 'female_children']]

multioutput_rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
multioutput_rf.fit(X_train_child, y_train_child)

# Model 5 - male/female split for parties travelling with 'Children'.
features5 = list(features3)
features_cat5 = list(features_cat3)

more_than_one_person = (df_train['total_male'] + df_train['total_female'] > 1)
df_filtered = df_train[more_than_one_person].copy()
df_children_only_model = df_filtered[df_filtered['travel_with'] == 'Children'].copy()

X_train_child_only = pd.get_dummies(df_children_only_model[features5], columns=features_cat5)
y_train_child_only = df_children_only_model[['total_male', 'total_female']]

multioutput_rf_children_only = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
multioutput_rf_children_only.fit(X_train_child_only, y_train_child_only)

# Model 6 - male/female split for parties travelling with 'Friends/Relatives'.
features6 = list(features3)
features_cat6 = list(features_cat3)

more_than_one_person = (df_train['total_male'] + df_train['total_female'] > 1)
df_filtered = df_train[more_than_one_person].copy()
df_friend_model = df_filtered[df_filtered['travel_with'] == 'Friends/Relatives'].copy()

X_train_friends = pd.get_dummies(df_friend_model[features6], columns=features_cat6)
y_train_friends = df_friend_model[['total_male', 'total_female']]

multioutput_rf_friends = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
multioutput_rf_friends.fit(X_train_friends, y_train_friends)

# Model 7 - travel_with classifier over three categories, using the full
# predictor set (head counts included).
features7 = list(features)
features_cat7 = list(features_cat)

allowed_categories7 = ['Friends/Relatives', 'Children', 'Spouse and Children']
# Train only on rows whose travel_with is one of the allowed categories.
df_train_imp7 = df_train[df_train['travel_with'].isin(allowed_categories7)].copy()
X_train7 = pd.get_dummies(df_train_imp7[features7], columns=features_cat7)
y_train7 = df_train_imp7['travel_with']
rf7 = RandomForestClassifier(n_estimators=100, random_state=42)
rf7.fit(X_train7, y_train7)

# Model 8 - male/female split for "phantom" records (neither head counts nor
# travel_with known). Predictors exclude the head counts.
features8 = list(features3)
features_cat8 = list(features_cat3)

phantom_categories = ['Friends/Relatives', 'Spouse and Children', 'Children']
df_filtered_phantom = df_train[(df_train['total_male'] + df_train['total_female'] > 1)].copy()
df_phantom_model = df_filtered_phantom[
    df_filtered_phantom['travel_with'].isin(phantom_categories)
].copy()

X_train_phantom = pd.get_dummies(df_phantom_model[features8], columns=features_cat8)
y_train_phantom = df_phantom_model[['total_male', 'total_female']]

multioutput_rf_phantom = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
multioutput_rf_phantom.fit(X_train_phantom, y_train_phantom)

# travel_with classifier for phantom records (same predictors as Model 8).
allowed_categories8 = ['Children', 'Friends/Relatives', 'Spouse and Children']
df_train_imp8 = df_train[df_train['travel_with'].isin(allowed_categories8)].copy()
X_train8 = pd.get_dummies(df_train_imp8[features8], columns=features_cat8)
y_train8 = df_train_imp8['travel_with']
rf8 = RandomForestClassifier(n_estimators=100, random_state=42)
rf8.fit(X_train8, y_train8)
# Imputation
def _row_to_frame(row, feature_names, categorical_names):
    """Turn one record into a single-row DataFrame with usable dtypes.

    `Series.to_frame().T` produces an all-object frame, so every feature that
    is not listed as categorical is cast back to float.
    """
    frame = row[feature_names].to_frame().T
    numeric_names = [name for name in feature_names if name not in categorical_names]
    return frame.astype({name: float for name in numeric_names})


def _encode_row(row, feature_names, categorical_names, train_columns):
    """One-hot encode one record so its columns match a training matrix.

    `pd.get_dummies` must be given a DataFrame here: for a Series it ignores
    `columns` and encodes the values themselves, which after `reindex` leaves
    an all-zero input and therefore a constant prediction.
    """
    encoded = pd.get_dummies(
        _row_to_frame(row, feature_names, categorical_names),
        columns=categorical_names,
    )
    encoded.columns = encoded.columns.astype(str)
    # Dummies absent from this row become 0; dummies the model never saw are dropped.
    return encoded.reindex(columns=train_columns, fill_value=0).astype(float)


def _split_by_sex(model, encoded_row, persons):
    """Split `persons` into (male, female) using the model's predicted proportions."""
    pred = model.predict(encoded_row)
    pred_male, pred_female = pred[0, 0], pred[0, 1]
    pred_sum = pred_male + pred_female
    # Guard against a zero prediction for both sexes: fall back to an even split.
    ratio_male = 0.5 if pred_sum == 0 else pred_male / pred_sum
    male_count = round(persons * ratio_male)
    return male_count, persons - male_count


def _zero_or_missing(value):
    """True when a head count is 0 or NaN."""
    return bool(pd.isna(value) or value == 0)


for index, row in df_train[df_train.isna().any(axis=1)].iterrows():
    travel_with_missing = pd.isna(row['travel_with'])
    # NaN when either count is missing, which makes the comparisons below False.
    party_size = row['total_male'] + row['total_female']
    # Written with explicit `or`: the former `x == 0 | pd.isna(x)` parsed as
    # `x == (0 | pd.isna(x))` and never matched a missing count.
    no_males = _zero_or_missing(row['total_male'])
    no_females = _zero_or_missing(row['total_female'])

    if travel_with_missing and party_size > 1 and (no_males or no_females):
        # Single-sex party of more than one person: estimate travel_with with Model 1.
        dummy_df = _encode_row(row, features, features_cat, X_train.columns)
        predicted_value = rf.predict(dummy_df)

        df_train.loc[index, 'travel_with'] = predicted_value[0]

    elif travel_with_missing and party_size > 1:
        # Mixed party of more than one person: estimate travel_with with Model 2.
        dummy_df = _encode_row(row, features, features_cat, X_train2.columns)
        predicted_value = rf2.predict(dummy_df)

        print(predicted_value[0])
        df_train.loc[index, 'travel_with'] = predicted_value[0]

    elif (not travel_with_missing) and no_males and no_females:
        # travel_with is known but nobody is counted: estimate the party size
        # from the cost, then the male/female split from the matching model.
        print("Brakuje liczby osob")
        persons = round(row['total_cost'] / mean, 0)

        if row['travel_with'] == 'Alone':
            print('predykcja płci')

            input_df = _row_to_frame(row, features3, features_cat3)
            predicted_value = model_pipeline.predict(input_df)

            if predicted_value[0] == 'Male':
                df_train.loc[index, 'total_male'] = 1
                df_train.loc[index, 'total_female'] = 0
            else:
                df_train.loc[index, 'total_male'] = 0
                df_train.loc[index, 'total_female'] = 1

        elif row['travel_with'] == 'Spouse':
            df_train.loc[index, 'total_male'] = 1
            df_train.loc[index, 'total_female'] = 1

        elif row['travel_with'] == 'Spouse and Children':
            # Two adults are fixed; the party has at least one child.
            df_train.loc[index, 'total_male'] = 1
            df_train.loc[index, 'total_female'] = 1
            if persons < 3:
                persons = 3
            print(f"Predykcja płci dziecka, przy {persons} wszystkich osobach")
            persons = persons - 2

            dummy_df = _encode_row(row, features4, features_cat4, X_train_child.columns)
            male_children_final, female_children_final = _split_by_sex(multioutput_rf, dummy_df, persons)

            # Add the predicted children on top of the two adults.
            df_train.loc[index, 'total_male'] += male_children_final
            df_train.loc[index, 'total_female'] += female_children_final

            print(f"Predykcja: {male_children_final} chłopców oraz {female_children_final} dziewczyn, przy {persons} żądanych osobach")

        elif row['travel_with'] == 'Children':
            print("Predykcja płci dziecka")

            if persons < 2:
                persons = 2

            dummy_df = _encode_row(row, features5, features_cat5, X_train_child_only.columns)
            male_final, female_final = _split_by_sex(multioutput_rf_children_only, dummy_df, persons)

            df_train.loc[index, 'total_male'] = male_final
            df_train.loc[index, 'total_female'] = female_final

            print(f"Predykcja: {male_final} chłopców oraz {female_final} dziewczyn, przy {persons} żądanych osobach")

        elif row['travel_with'] == 'Friends/Relatives':
            print("Predykcja podziału")

            if persons < 2:
                persons = 2

            dummy_df = _encode_row(row, features6, features_cat6, X_train_friends.columns)
            male_final, female_final = _split_by_sex(multioutput_rf_friends, dummy_df, persons)

            df_train.loc[index, 'total_male'] = male_final
            df_train.loc[index, 'total_female'] = female_final

            print(f"Predykcja: {male_final} mężczyzn oraz {female_final} kobiet, przy {persons} żądanych osobach")
        else:
            # Unexpected travel_with value: report the record and leave it unchanged.
            print(f"{row['travel_with']}"
              f"\n{row['total_male']}"
              f"\n{row['total_female']}")

    elif travel_with_missing and party_size == 0:
        # "Phantom" record: no travel_with and both counts are 0.
        # NOTE(review): records where the counts are NaN (not 0) still fall
        # through to the report branch below, as before - confirm that is intended.
        persons = round(row['total_cost'] / mean, 0)

        if persons <= 1:
            print('predykcja płci')
            df_train.loc[index, 'travel_with'] = 'Alone'

            input_df = _row_to_frame(row, features3, features_cat3)
            predicted_value = model_pipeline.predict(input_df)

            if predicted_value[0] == 'Male':
                df_train.loc[index, 'total_male'] = 1
                df_train.loc[index, 'total_female'] = 0
            else:
                df_train.loc[index, 'total_male'] = 0
                df_train.loc[index, 'total_female'] = 1

        else:
            print('predykcja płci i rozłożenia oraz kategorii travel_with')

            # Male/female split for the estimated party size.
            print(f"Predykcja: {persons} osob")

            dummy_df = _encode_row(row, features8, features_cat8, X_train_phantom.columns)
            male_final, female_final = _split_by_sex(multioutput_rf_phantom, dummy_df, persons)

            df_train.loc[index, 'total_male'] = male_final
            df_train.loc[index, 'total_female'] = female_final

            print(f"Predykcja: {male_final} mężczyzn oraz {female_final} kobiet, przy {persons} żądanych osobach")

            # travel_with category; rf8 does not use the head counts, so the
            # local row copy does not need to be updated first.
            dummy_df = _encode_row(row, features8, features_cat8, X_train8.columns)
            predicted_value = rf8.predict(dummy_df)

            df_train.loc[index, 'travel_with'] = predicted_value[0]

    elif travel_with_missing:
        # Nothing could be inferred: report the record and leave it unchanged.
        print(f"{row['travel_with']}"
              f"\n{row['total_male']}"
              f"\n{row['total_female']}")


# Refresh the per-person cost now that head counts have been imputed.
people_total = df_train['total_male'] + df_train['total_female']
mask_valid = (
    df_train['total_male'].notna()
    & df_train['total_female'].notna()
    & (people_total != 0)
)
df_train.loc[mask_valid, 'total_cost_per_person'] = (
    df_train.loc[mask_valid, 'total_cost'] / people_total[mask_valid]
)

# Rows that still contain at least one NaN after imputation.
count3 = df_train.isna().any(axis=1).sum()

print("Początkowa liczba wierszy z brakującymi wartościami:", count)
print("Liczba wierszy z brakującymi wartościami po uzupełnieniu braków w kolumnie most_impressing:", count2)
print("Końcowa liczba wierszy z brakującymi wartościami:", count3)

# Per-column report of the remaining gaps (the former no-op loop over the
# incomplete rows was removed: it iterated the frame without doing anything).
for column, empty_count in df_train.isna().sum().items():
    if empty_count > 0:
        print(f"Column '{column}' has {empty_count} empty fields (NaN).")

# Countries for which the region lookup found no match.
unique_countries_nan = df_train[df_train['region'].isna()]['country'].unique()
print("Państwa bez regionu: ", unique_countries_nan)


# These two columns were only needed by the imputation models above.
df_train = df_train.drop(columns=["most_impressing", "gender"])
df_train["total_people"] = df_train["total_male"] + df_train["total_female"]


Ustawienia walidacji krzyżowej

In [None]:
import random

# One seed shared by Python's `random`, NumPy and TensorFlow.
seed = 42
for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    apply_seed(seed)


# Shuffled K-fold splitter used for cross-validation.
n_splits = 5
kf = KFold(shuffle=True, n_splits=n_splits, random_state=seed)


Tablice do przechowywania meta-features

In [None]:
# Zero-initialised out-of-fold prediction buffers, one per base model, plus a
# three-column matrix for the stacked meta-features.
# NOTE(review): sized from the full df_train; when is_test_iteration is True
# the data-preparation cell shrinks X with train_test_split, so these may be
# longer than X - confirm against the code that fills them.
n_train_rows = len(df_train)
oof_preds_xgb = np.zeros(n_train_rows)
oof_preds_cat = np.zeros(n_train_rows)
oof_preds_lgb = np.zeros(n_train_rows)
meta_features = np.zeros((n_train_rows, 3))


Przygotowanie danych

In [None]:
print(df_train.columns)
# Convert every `object` column to `category` and mark it with a `cat_`
# prefix so the categorical columns can be picked out by name below.
object_columns = list(df_train.select_dtypes(include=['object']).columns)
for col in object_columns:
    df_train[col] = df_train[col].astype('category')
df_train = df_train.rename(columns={col: "cat_" + col for col in object_columns})

print(df_train.columns)
# Model inputs: drop the target, the identifier, the country (replaced by
# region) and the helper columns derived during imputation.
X = df_train.drop(columns=["total_cost", "cat_ID", 'total_cost_per_person', 'male_children', 'female_children', 'total_people', "cat_country"])
y = df_train["total_cost"]
print(X.columns)
cat_features = [col for col in X.columns if col.startswith('cat_')]

# Optional hold-out split for a final validation run.
if is_test_iteration:
    X, X_valid_end, y, y_valid_end = train_test_split(X, y, test_size=0.2, random_state=42)
    # Reset both indexes together: resetting only y left X and y with
    # different index labels after the shuffle.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

print(f"Liczba wierszy w zbiorze danych treningowych: {len(X)}")
print(f"Liczba wierszy w zbiorze wartości: {len(y)}")


Dostrajanie XGBoost

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: mean cross-validated MAE (lower is better).

    Samples one hyper-parameter set from `trial`, fits an `XGBRegressor` on the
    notebook-level `X` / `y` with a seeded, shuffled 5-fold split and returns
    the mean absolute error averaged over the folds.
    """
    # Search space plus fixed settings.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': seed,
        # XGBoost's logging level is `verbosity`; `verbose` is not a model
        # parameter and only produced an "unused parameter" message per fit.
        'verbosity': 0,
        'enable_categorical': True
    }

    model = XGBRegressor(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)

    # sklearn reports negated MAE so that higher is better.
    scores = cross_val_score(
        model,
        X,
        y,
        cv=kf,
        scoring='neg_mean_absolute_error'
    )

    # Flip the sign back to a plain MAE.
    mae = -scores.mean()

    return mae


# Create and run the XGBoost study. The sampler is seeded explicitly: Optuna's
# default sampler is not affected by the global NumPy/random seeds, so without
# this the search would differ on every run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=150, show_progress_bar=True)

print("Najlepsza wartość MAE:", study.best_value)
print("Najlepsze parametry:", study.best_params)


Dostrajanie CatBoost

In [None]:
# CatBoost hyper-parameter tuning
def objective_catboost(trial):
    """Optuna objective for CatBoost: mean cross-validated MAE (lower is better).

    Samples one hyper-parameter set from `trial`, evaluates a
    `CatBoostRegressor` on the notebook-level `X` / `y` with a seeded, shuffled
    5-fold split and returns the MAE averaged over the folds.
    """
    # Hyper-parameters explored by the study.
    tuned = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Settings held constant across trials.
    # NOTE(review): task_type is hard-coded to 'GPU' - confirm a GPU is
    # available wherever this notebook is run.
    fixed = {
        'random_state': seed,
        'verbose': False,
        'cat_features': cat_features,
        'leaf_estimation_iterations': 1,
        'boosting_type': 'Ordered',
        'task_type': 'GPU',
    }
    regressor = CatBoostRegressor(**tuned, **fixed)

    folds = KFold(n_splits=5, shuffle=True, random_state=seed)
    fold_scores = cross_val_score(
        regressor, X, y, cv=folds, scoring='neg_mean_absolute_error'
    )
    # sklearn reports negated MAE; flip the sign back.
    return -fold_scores.mean()


# Create and run the CatBoost study. The sampler is seeded explicitly: Optuna's
# default sampler is not affected by the global NumPy/random seeds, so without
# this the search would differ on every run.
study_catboost = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study_catboost.optimize(objective_catboost, n_trials=5, show_progress_bar=True)

print("Najlepsza wartość MAE dla CatBoost:", study_catboost.best_value)
print("Najlepsze parametry dla CatBoost:", study_catboost.best_params)


Dostrajanie LightGBM

In [None]:
# LightGBM hyper-parameter tuning
def objective_lightgbm(trial):
    """Optuna objective for LightGBM: mean cross-validated MAE (lower is better).

    Samples one hyper-parameter set from `trial`, evaluates an `LGBMRegressor`
    on the notebook-level `X` / `y` with a seeded, shuffled 5-fold split and
    returns the MAE averaged over the folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only applies `subsample` when the bagging frequency is
        # non-zero (the default is 0), so it is tuned as well. Suggesting it
        # through the trial also puts it into `best_params` for the final model.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': seed
    }
    model = LGBMRegressor(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    # sklearn reports negated MAE so that higher is better.
    scores = cross_val_score(
        model, X, y, cv=kf, scoring='neg_mean_absolute_error'
    )
    mae = -scores.mean()
    return mae


# Create the Optuna study for LightGBM and run the hyperparameter search.
# The sampler is seeded with the notebook-wide `seed` so the suggested trials -
# and therefore `study_lightgbm.best_params`, which a later cell uses to build
# the final LightGBM model - are reproducible on a fresh kernel.
study_lightgbm = optuna.create_study(
    direction='minimize',  # the objective returns a MAE, lower is better
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study_lightgbm.optimize(objective_lightgbm, n_trials=250, show_progress_bar=True)

print("Najlepsza wartość MAE dla LightGBM:", study_lightgbm.best_value)
print("Najlepsze parametry dla LightGBM:", study_lightgbm.best_params)


Definicje modeli bazowych i ich parametrów

In [None]:

# Hand-written configurations, kept next to the tuned ones for manual runs.
xgb_params_manual = {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'random_state': seed,
              'enable_categorical': True, 'verbose': 1, 'use_label_encoder': False, 'eval_metric': 'mae'}
cat_params_manual = {'n_estimators': 500, 'learning_rate': 0.05, 'random_state': seed, 'verbose': 1}
lgb_params_manual = {'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 6, 'random_state': seed, 'verbose': 1}

# Tuned configurations. `best_params` contains only the values sampled through
# `trial.suggest_*`, so the constant settings used by the objectives have to be
# added back by hand. `random_state` in particular was fixed to `seed` during
# tuning; without it the final models would be trained with a different
# (unseeded) configuration than the one that was scored.
# NOTE(review): the CatBoost objective also tunes with task_type='GPU', which is
# not carried over here - confirm whether the final model should use it too.
xgb_params = {**study.best_params, 'random_state': seed, 'enable_categorical': True, 'verbose': 1}
cat_params = {**study_catboost.best_params, 'random_state': seed, 'verbose': 1, 'leaf_estimation_iterations': 1, 'boosting_type': 'Ordered' }
lgb_params = {**study_lightgbm.best_params, 'random_state': seed, 'verbose': 1}


Inicjalizacja modeli

In [None]:
# Unfitted base learners built from the tuned configurations.
xgb_model = xgb.XGBRegressor(**xgb_params)
cat_model = CatBoostRegressor(**cat_params)
lgb_model = LGBMRegressor(**lgb_params)

# (short name, estimator) pairs; the short names double as `oof_preds` keys.
models = [('xgb', xgb_model), ('cat', cat_model), ('lgb', lgb_model)]

# One zero-initialised out-of-fold prediction vector per base learner,
# with one slot for every row of X.
oof_preds = {short_name: np.zeros(len(X)) for short_name, _ in models}


In [None]:
# Per-fold MAE report lines (already formatted as text), printed further down.
mae_scores = []

# Base learners in the order they are trained inside every fold:
# (key in `oof_preds`, label used in the report, estimator, extra fit kwargs).
base_learners = [
    ('cat', 'CatBoost', cat_model, {'cat_features': cat_features}),
    ('xgb', 'XGBoost', xgb_model, {}),
    ('lgb', 'LightGBM', lgb_model, {}),
]

# NOTE(review): `kf` is not created in this cell; it has to be defined by an
# earlier cell - confirm it exists when running on a fresh kernel.
for train_idx, val_idx in kf.split(X):
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]

    for key, label, learner, fit_kwargs in base_learners:
        # Fit on the training part of the fold, predict the held-out part and
        # store those predictions at the held-out row positions.
        learner.fit(X_train, y_train, **fit_kwargs)
        fold_preds = learner.predict(X_valid)
        oof_preds[key][val_idx] = fold_preds
        mae_scores.append(f"{label} fold MAE: {mean_absolute_error(y_valid, fold_preds):.4f}")

# Meta-model inputs: one column of out-of-fold predictions per base learner,
# in the order XGBoost, CatBoost, LightGBM.
meta_features = np.column_stack([oof_preds[key] for key in ('xgb', 'cat', 'lgb')])

# Re-create every base learner and fit it on the complete training data.
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(X, y, cat_features=cat_features)

xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X, y)

lgb_model = LGBMRegressor(**lgb_params)
lgb_model.fit(X, y)

for score in mae_scores:
    print(f"Scores for learning meta model: {score}")

if is_test_iteration:
    # Score the fully trained base learners on the hold-out split.
    for label, fitted in (('CatBoost', cat_model), ('XGBoost', xgb_model), ('LightGBM', lgb_model)):
        holdout_preds = fitted.predict(X_valid_end)
        print(f"{label} fold MAE full model: {mean_absolute_error(y_valid_end, holdout_preds):.4f}")

def build_meta_model(input_dim, n_neurons1, n_neurons2, dropout_rate, learning_rate):
    """Build and compile the feed-forward meta-model used for stacking.

    Architecture: Dense(relu) -> Dropout -> Dense(relu) -> Dense(1, linear).

    Args:
        input_dim: Number of input features.
        n_neurons1: Units in the first hidden layer.
        n_neurons2: Units in the second hidden layer.
        dropout_rate: Dropout rate applied after the first hidden layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled Keras `Sequential` model with MAE as its loss.
    """
    network = Sequential([
        Dense(n_neurons1, input_dim=input_dim, activation='relu'),
        Dropout(dropout_rate),
        Dense(n_neurons2, activation='relu'),
        Dense(1, activation='linear'),  # single linear output for regression
    ])
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mae',
    )
    return network


def objective(trial):
    """Optuna objective for the meta-model: mean 5-fold MAE on `meta_features`.

    NOTE: the XGBoost tuning cell also optimises a function called `objective`;
    defining this one rebinds that name for every cell executed afterwards.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation MAE averaged over the folds (lower is better).
    """
    # Hyperparameter search space
    n_neurons1 = trial.suggest_int("n_neurons1", 32, 128)
    n_neurons2 = trial.suggest_int("n_neurons2", 16, 64)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    # `suggest_loguniform` is deprecated; `suggest_float(..., log=True)` is the
    # equivalent call and matches the other objectives in this notebook.
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    epochs = 100
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []

    for train_idx, val_idx in kf.split(meta_features):
        X_train, X_val = meta_features[train_idx], meta_features[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = build_meta_model(meta_features.shape[1], n_neurons1, n_neurons2, dropout_rate, learning_rate)
        # No validation data is passed to fit(), so early stopping watches the
        # training loss.
        early_stop = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

        model.fit(X_train, y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=0,
                  callbacks=[early_stop])

        # The model is compiled with loss='mae' and no extra metrics, so
        # evaluate() returns the MAE on the held-out fold.
        mae = model.evaluate(X_val, y_val, verbose=0)
        mae_scores.append(mae)
    print(f"MAE: {np.mean(mae_scores)}")
    # Return the mean error across the folds
    return np.mean(mae_scores)


# Run the hyperparameter search for the meta-model.
# NOTE: this rebinds `study`, which the base-model parameter cell reads for the
# XGBoost settings - re-running that cell after this one would pick up the
# meta-model parameters instead.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Najlepsze hiperparametry: ", study.best_params)

# Train the final meta-model with the best hyperparameters on all
# out-of-fold predictions.
best_params = study.best_params
meta_model = build_meta_model(
    input_dim=meta_features.shape[1],
    n_neurons1=best_params["n_neurons1"],
    n_neurons2=best_params["n_neurons2"],
    dropout_rate=best_params["dropout_rate"],
    learning_rate=best_params["learning_rate"],
)
early_stop = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
meta_model.fit(
    meta_features,
    y,
    epochs=100,
    batch_size=best_params["batch_size"],
    verbose=1,
    callbacks=[early_stop],
)


In [None]:
# Predykcje
def space_formatter(x, pos):
    """Format a tick value as an integer with spaces as thousands separators.

    The (value, position) signature is the one `FuncFormatter` calls with;
    `pos` is not used.

    Args:
        x: Tick value; truncated to an integer with `int()`.
        pos: Tick position (ignored).

    Returns:
        The formatted string, e.g. 1234567 -> "1 234 567".
    """
    grouped = format(int(x), ',')
    return grouped.replace(',', ' ')

predictions_combined = []

if is_test_iteration:
    # Base-model predictions on the hold-out data
    preds_xgb = xgb_model.predict(X_valid_end)
    preds_cat = cat_model.predict(X_valid_end)
    preds_lgb = lgb_model.predict(X_valid_end)

    # Stack the predictions column-wise in the same order (XGBoost, CatBoost,
    # LightGBM) that was used to build the meta-model's training features.
    meta_features_new = np.column_stack((preds_xgb, preds_cat, preds_lgb))

    # Final prediction of the meta-model
    predictions = meta_model.predict(meta_features_new)

    mae = mean_absolute_error(y_valid_end, predictions)
    print("Średni błąd bezwzględny:", mae)

    plt.figure(figsize=(20, 10))
    plt.hist(predictions, bins=20, alpha=0.7, color='blue', label='Predykcje')
    # The histogram shows the predicted values themselves, not errors, so the
    # axis is labelled accordingly (it previously read "Wartość błędu").
    plt.xlabel("Wartość predykcji")
    plt.ylabel("Liczebność")
    plt.title("Histogram predykcji")
    plt.ticklabel_format(style='plain', axis='x')

    # Show x tick labels as integers with spaces as thousands separators.
    ax = plt.gca()
    ax.xaxis.set_major_formatter(FuncFormatter(space_formatter))

    plt.legend()
    plt.show()


In [None]:
if is_test_iteration:
    # Flat copies of both series, used to draw the connecting segments.
    y_valid_array = y_valid_end.to_numpy().flatten()
    predictions_array = np.array(predictions).flatten()
    x = np.arange(len(y_valid_end))

    plt.figure(figsize=(10, 6))
    # Actual values (blue) and estimated values (red) for every sample
    plt.scatter(x, y_valid_end, color='blue', label='Wartość rzeczywista')
    plt.scatter(x, predictions, color='red', label='Wartość estymowana')

    # A thin vertical segment joins each actual value with its estimate
    for i in range(len(x)):
        position = x[i]
        plt.plot(
            [position, position],
            [y_valid_array[i], predictions_array[i]],
            color='gray',
            linewidth=0.5,
        )

    plt.xlabel('Indeks')
    plt.ylabel('Wartość')
    plt.title('Porównanie wartości rzeczywistych i estymowanych')
    plt.legend()

    # Show y tick labels as integers with spaces as thousands separators.
    ax = plt.gca()
    ax.yaxis.set_major_formatter(FuncFormatter(space_formatter))

    plt.show()


In [None]:
if not is_test_iteration:
    # Load the challenge test set.
    valid_preds = pd.read_csv('data/Test.csv')

    # Misspelled / alternative country names -> the spelling they are joined on.
    corrections = {
        'SWIZERLAND': 'SWITZERLAND',
        'UNITED STATES OF AMERICA': 'UNITED STATES',
        'COMORO': 'COMOROS',
        'MALT': 'MALTA',
        'UAE': 'UNITED ARAB EMIRATES',
        'UKRAIN': 'UKRAINE',
        'DRC': 'CONGO (DEMOCRATIC REPUBLIC OF THE)',
        'SWAZILAND': 'ESWATINI',
        'COSTARICA': 'COSTA RICA',
        'SCOTLAND': 'UNITED KINGDOM',
        'PHILIPINES': 'PHILIPPINES',
        'BOSNIA': 'BOSNIA AND HERZEGOVINA',
        'CAPE VERDE': 'CABO VERDE',
        'MORROCO': 'MOROCCO',
        'SOMALI': 'SOMALIA',
        'KOREA': 'SOUTH KOREA',
        'SAUD ARABIA': 'SAUDI ARABIA',
    }

    valid_preds['country'] = valid_preds['country'].replace(corrections)

    # Attach the region data by country name; the duplicated key column is dropped.
    valid_preds = pd.merge(valid_preds, df_regions, how='left', left_on='country', right_on='name')
    valid_preds = valid_preds.drop(columns=['name'])
    print(valid_preds.columns)

    def _report_missing(frame):
        """Print the NaN count of every column of `frame` that has missing values."""
        for column in frame.columns:
            empty_count = frame[column].isna().sum()

            if empty_count > 0:
                print(f"Column '{column}' has {empty_count} empty fields (NaN).")

    def _log_imputation(row, imputed):
        """Print the original travel_with / male / female values and the imputed value."""
        print(f"\nrow"
              f"\n{row['travel_with']}"
              f"\n{row['total_male']}"
              f"\n{row['total_female']}"
              f"\n{imputed}")

    # Check before cleansing
    _report_missing(valid_preds)

    valid_preds.loc[valid_preds['most_impressing'].isna(), 'most_impressing'] = 'No comments'

    # Features used to impute `travel_with`
    features_tw = ['region', 'age_group', 'total_female',
           'total_male', 'purpose', 'main_activity', 'info_source',
           'tour_arrangement', 'package_transport_int', 'package_accomodation',
           'package_food', 'package_transport_tz', 'package_sightseeing',
           'package_guided_tour', 'package_insurance', 'night_mainland',
           'night_zanzibar', 'payment_mode', 'first_trip_tz', 'most_impressing']

    # The subset of `features_tw` that is one-hot encoded
    features_cat_tw = ['region', 'age_group', 'purpose', 'main_activity', 'info_source', 'tour_arrangement',
                    'package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz',
                    'package_sightseeing', 'package_guided_tour', 'package_insurance',
                    'payment_mode', 'first_trip_tz', 'most_impressing']


    # Training data for the imputation model: complete rows only, restricted to
    # the allowed `travel_with` categories.
    valid_preds_no_nan = valid_preds.dropna()

    allowed_categories = ['Children', 'Friends/Relatives', 'Spouse', 'Spouse and Children']
    valid_preds_imp = valid_preds_no_nan[valid_preds_no_nan['travel_with'].isin(allowed_categories)].copy()
    X_train_tw = pd.get_dummies(valid_preds_imp[features_tw], columns=features_cat_tw)
    y_train_tw = valid_preds_imp['travel_with']

    # Train the imputation model
    rf_tw = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_tw.fit(X_train_tw, y_train_tw)

    def _predict_travel_with(row):
        """Predict `travel_with` for one row with the fitted `rf_tw` classifier.

        `row[features_tw]` is a Series. Passing a Series to `pd.get_dummies`
        ignores `columns=` and encodes the Series *values* as one row per
        feature, so the classifier used to receive rows that were (almost)
        all zeros. The row is therefore turned into a one-row DataFrame first,
        which gives the same column layout as `X_train_tw`.
        """
        one_row = row[features_tw].to_frame().T.infer_objects()
        encoded = pd.get_dummies(one_row, columns=features_cat_tw)
        encoded.columns = encoded.columns.astype(str)
        # Align with the training columns; dummies not present in this row are 0.
        encoded = encoded.reindex(columns=X_train_tw.columns, fill_value=0)
        # The classifier was fitted on complete rows only, so neutralise any
        # remaining missing value before predicting.
        encoded = encoded.fillna(0)
        return rf_tw.predict(encoded)[0]

    print("Cleansing test dataset")
    # `row` is a snapshot taken before any fix below is written to `valid_preds`.
    for index, row in valid_preds[valid_preds.isna().any(axis=1)].iterrows():
        male = row['total_male']
        female = row['total_female']
        male_missing = pd.isna(male)
        female_missing = pd.isna(female)
        companion_missing = pd.isna(row['travel_with'])
        counts_known = not male_missing and not female_missing
        party_size = male + female if counts_known else None

        # Missing `travel_with`, exactly one traveller: travelling alone.
        if companion_missing and counts_known and party_size == 1:
            valid_preds.loc[index, 'travel_with'] = 'Alone'
        # Missing `travel_with`, several travellers: ask the classifier.
        if companion_missing and counts_known and party_size > 1:
            predicted_value = _predict_travel_with(row)
            _log_imputation(row, predicted_value)
            valid_preds.loc[index, 'travel_with'] = predicted_value

        if row['travel_with'] == 'Alone' and (male_missing or female_missing):
            # The both-missing case has to be tested first. The original chain
            # tested `total_female` twice, so neither this branch nor the
            # `total_male` one could ever run.
            if male_missing and female_missing:
                # Gentleman
                valid_preds.loc[index, 'total_male'] = 0
                valid_preds.loc[index, 'total_female'] = 1
            elif female_missing:
                valid_preds.loc[index, 'total_female'] = 0
                _log_imputation(row, 0)
            else:
                valid_preds.loc[index, 'total_male'] = 0
                _log_imputation(row, 0)

        elif female_missing and male > 0:
            # Not alone: a single man is paired with one woman, several men with none.
            imputed = 0 if male > 1 else 1
            valid_preds.loc[index, 'total_female'] = imputed
            # Log the value actually written (the message used to always say 0).
            _log_imputation(row, imputed)

        elif male_missing and female > 0:
            # Mirror case: a single woman is paired with one man, several women with none.
            imputed = 0 if female > 1 else 1
            valid_preds.loc[index, 'total_male'] = imputed
            _log_imputation(row, imputed)

        # "Ghost" row: both counts are present but add up to zero travellers.
        if counts_known and party_size == 0:

            # Gentleman
            valid_preds.loc[index, 'total_female'] = 1

            if companion_missing:
                valid_preds.loc[index, 'travel_with'] = _predict_travel_with(row)

            print(f"\nrow widmo"
                  f"\n{row['travel_with']}"
                  f"\n{row['total_male']}"
                  f"\n{row['total_female']}")

    # Check after cleansing: list rows that still miss one of the three fields
    for index, row in valid_preds[valid_preds.isna().any(axis=1)].iterrows():
        if pd.isna(row['travel_with']) | pd.isna(row['total_male']) | pd.isna(row['total_female']):
            print(f"\nrow"
                  f"\n{row['travel_with']}"
                  f"\n{row['total_male']}"
                  f"\n{row['total_female']}")


    _report_missing(valid_preds)


    # Countries that found no match in `df_regions`
    unique_countries_nan = valid_preds[valid_preds['region'].isna()]['country'].unique()
    print("\nPaństwa bez regionu: ", unique_countries_nan)

    # Convert every text column except the identifier to `category` and prefix
    # its name with "cat_".
    for col in valid_preds.select_dtypes(include=['object']).columns:
        if col != "ID":
            valid_preds[col] = valid_preds[col].astype('category')
            new_col = "cat_" + str(col)
            valid_preds.rename(columns={col: new_col}, inplace=True)
    print(valid_preds.columns)
    valid_preds.drop(columns=['cat_most_impressing', 'cat_country'], inplace=True)
    print(valid_preds.columns)


Challenge predictions

In [None]:
if not is_test_iteration:
    # Separate the identifier from the feature columns.
    id_valid = valid_preds['ID']
    X_valid_preds = valid_preds.drop(columns='ID')

    # Base-model predictions.
    # NOTE(review): the categorical columns here were built from the test file
    # alone; presumably their categories should match the training data's -
    # verify, since models that rely on category codes are sensitive to this.
    preds_xgb = xgb_model.predict(X_valid_preds)
    preds_cat = cat_model.predict(X_valid_preds)
    preds_lgb = lgb_model.predict(X_valid_preds)

    # Stack the base predictions column-wise as input for the meta-model.
    meta_features_new = np.column_stack((preds_xgb, preds_cat, preds_lgb))

    # Final prediction
    predictions = meta_model.predict(meta_features_new)

    # Flatten an (n, 1) result to a 1-D vector.
    is_single_column = predictions.ndim > 1 and predictions.shape[1] == 1
    if is_single_column:
        predictions = predictions.ravel()

    # Submission table: one predicted total cost per ID.
    results = pd.DataFrame({
        'ID': id_valid,
        'total_cost': predictions
    })

    results.to_csv('data/submission.csv', index=False)