# Tanzania Tourism Prediction - Prognozy dotyczące turystyki w Tanzanii
<h2>Autorzy:</h2><br>
<ul>
<li>Piotr Janiszek 247678</li>
<li>Kacper Białek 247629</li>
<li>Franciszek Pawlak 247756</li>
<li>Michał Korblit 242427</li>
</ul>

<h3>Imports</h3>

In [None]:
import math

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor


<h3>Data loading</h3>


In [None]:
# Read the raw train and test tables from the local data directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/Train.csv', 'data/Test.csv')
)

# Quick schema check: list the columns available in the training table.
print(df_train.columns)

<h3>Data Cleaning</h3>

In [None]:
# Number of rows with at least one missing value before any cleaning.
count = df_train.isna().any(axis=1).sum()

# A missing free-text answer is replaced with an explicit placeholder.
df_train['most_impressing'] = df_train['most_impressing'].fillna('No comments')
count2 = df_train.isna().any(axis=1).sum()

# Cost per person is only defined where both head-counts are known and the
# party is not empty (avoids dividing by zero).
party_size = df_train['total_male'] + df_train['total_female']
mask_valid = (
    df_train['total_male'].notna()
    & df_train['total_female'].notna()
    & (party_size != 0)
)
df_train.loc[mask_valid, 'total_cost_per_person'] = (
    df_train.loc[mask_valid, 'total_cost'] / party_size[mask_valid]
)

# Mean cost per person over the valid rows; used below to estimate head-counts.
mean = df_train.loc[mask_valid, 'total_cost_per_person'].mean()
print("Średnia bez wierszy z zerową sumą:", mean)


# Rows where exactly one of the two head-counts is missing: estimate the
# missing one from the trip cost and the mean cost per person.
for missing_col, known_col in (('total_male', 'total_female'),
                               ('total_female', 'total_male')):
    to_fill = df_train[missing_col].isna() & df_train[known_col].notna()

    # Cost left once the known travellers are paid for, converted to people.
    cost_left = (
        df_train.loc[to_fill, 'total_cost']
        - df_train.loc[to_fill, known_col] * mean
    )
    estimated_people = (cost_left / mean).round(0)

    # A negative estimate is not a valid head-count, so floor it at zero.
    df_train.loc[to_fill, missing_col] = estimated_people.clip(lower=0)

# A missing travel_with is resolved to 'Alone' when the head-counts show that
# exactly one person was travelling.
travels_solo = (df_train['total_male'] + df_train['total_female']) == 1
df_train.loc[df_train['travel_with'].isna() & travels_solo, 'travel_with'] = 'Alone'


In [None]:
# Random forest imputation
# Recompute the cost per person and its mean over rows where both head-counts
# are present and the party is not empty.
headcount_total = df_train['total_male'] + df_train['total_female']
mask_valid = (
    df_train['total_male'].notna()
    & df_train['total_female'].notna()
    & (headcount_total != 0)
)
df_train.loc[mask_valid, 'total_cost_per_person'] = (
    df_train.loc[mask_valid, 'total_cost'] / headcount_total[mask_valid]
)
mean = df_train.loc[mask_valid, 'total_cost_per_person'].mean()

# Input columns for the travel_with classifiers. The order is significant: it
# determines the column layout of the encoded feature matrix.
features = [
    'country',
    'age_group',
    'total_female',
    'total_male',
    'purpose',
    'main_activity',
    'info_source',
    'tour_arrangement',
    'package_transport_int',
    'package_accomodation',
    'package_food',
    'package_transport_tz',
    'package_sightseeing',
    'package_guided_tour',
    'package_insurance',
    'night_mainland',
    'night_zanzibar',
    'payment_mode',
    'first_trip_tz',
    'most_impressing',
    'total_cost',
]

# Every feature except the numeric ones is categorical and gets one-hot encoded.
features_cat = [
    name for name in features
    if name not in ('total_female', 'total_male', 'night_mainland',
                    'night_zanzibar', 'total_cost')
]

# Model 1: travel_with classifier limited to two companion categories.
allowed_categories = ['Friends/Relatives', 'Children']

# Keep only the training rows whose travel_with is one of the allowed categories.
in_allowed = df_train['travel_with'].isin(allowed_categories)
df_train_imp = df_train.loc[in_allowed].copy()

# One-hot encode the categorical features; the numeric ones pass through unchanged.
X_train = pd.get_dummies(df_train_imp[features], columns=features_cat)
y_train = df_train_imp['travel_with']

# Fit the classifier (fit() returns the estimator itself).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Model 2: travel_with classifier over the four multi-person companion categories.
allowed_categories2 = ['Children', 'Friends/Relatives', 'Spouse', 'Spouse and Children']

# Keep only the training rows whose travel_with is one of Model 2's categories.
# The filter must use allowed_categories2; filtering on Model 1's list would
# make rf2 a copy of rf that can never predict 'Spouse' or 'Spouse and Children'.
df_train_imp2 = df_train[df_train['travel_with'].isin(allowed_categories2)].copy()

# One-hot encode the categorical features; the numeric ones pass through unchanged.
X_train2 = pd.get_dummies(df_train_imp2[features], columns=features_cat)
y_train2 = df_train_imp2['travel_with']

# Train the classifier.
rf2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf2.fit(X_train2, y_train2)

# Model 3 / Model 4 inputs: the same columns as the travel_with models but
# without the head-counts, which are what these models help to reconstruct.
features3 = [
    'country',
    'age_group',
    'purpose',
    'main_activity',
    'info_source',
    'tour_arrangement',
    'package_transport_int',
    'package_accomodation',
    'package_food',
    'package_transport_tz',
    'package_sightseeing',
    'package_guided_tour',
    'package_insurance',
    'night_mainland',
    'night_zanzibar',
    'payment_mode',
    'first_trip_tz',
    'most_impressing',
    'total_cost',
]

# Categorical subset of features3 (everything except the numeric columns).
features_cat3 = [
    name for name in features3
    if name not in ('night_mainland', 'night_zanzibar', 'total_cost')
]

def designate_sex(row):
    """Label the sex of a traveller on a one-person 'Alone' trip.

    Args:
        row: Mapping-like record with 'travel_with', 'total_male' and
            'total_female' entries.

    Returns:
        'Male' when the single traveller is counted in total_male, 'Female'
        otherwise; None when the row is not an 'Alone' trip of exactly one
        person (including when a head-count is NaN).
    """
    is_solo_trip = (
        row['travel_with'] == 'Alone'
        and (row['total_male'] + row['total_female']) == 1
    )
    if not is_solo_trip:
        return None

    # Exactly one traveller: if it is not the male count, it must be the female one.
    if row['total_male'] == 1:
        return 'Male'
    return 'Female'

# Label solo travellers with their sex; every other row gets None.
df_train['gender'] = df_train.apply(designate_sex, axis=1)
df_model = df_train.loc[df_train['gender'].notna()].copy()

# Features and target for the sex classifier (Model 3).
X_train3, y_train3 = df_model[features3], df_model['gender']

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at predict time); the remaining columns are passed through unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), features_cat3)],
    remainder='passthrough',
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

model_pipeline.fit(X_train3, y_train3)

# Model 4: predicting how many children of each sex travel with a couple.
# Children per sex = head-count of that sex minus one, floored at zero.
df_train['male_children'] = (df_train['total_male'] - 1).clip(lower=0)
df_train['female_children'] = (df_train['total_female'] - 1).clip(lower=0)

# Train only on 'Spouse and Children' trips where both sexes are present.
both_sexes_present = (df_train['total_male'] > 0) & (df_train['total_female'] > 0)
df_filtered = df_train[both_sexes_present].copy()
df_children_model = df_filtered[df_filtered['travel_with'] == 'Spouse and Children'].copy()

# Feature matrix with one-hot encoded categorical columns.
X_train_child = pd.get_dummies(df_children_model[features3], columns=features_cat3)

# Two targets: number of male children and number of female children.
y_train_child = df_children_model[['male_children', 'female_children']]

# One random-forest regressor per target column.
multioutput_rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
multioutput_rf.fit(X_train_child, y_train_child)

# Predykcja na nowych danych (przykład)
#predicted_children = multioutput_rf.predict(X_train_child)

# Wyświetlenie przykładowych predykcji:
#df_children_model['predicted_male_children'] = predicted_children[:, 0]
#df_children_model['predicted_female_children'] = predicted_children[:, 1]
#print(df_children_model[['male_children', 'female_children', 'predicted_male_children', 'predicted_female_children']].head())

# Imputation of the remaining gaps in travel_with / total_male / total_female.
def _encode_row_for(row, feature_names, categorical_names, training_columns):
    """One-hot encode a single row so it lines up with a training matrix.

    The row is first turned into a one-row DataFrame; calling
    ``pd.get_dummies`` directly on the row Series would encode the *values* of
    the Series and ignore ``columns``, yielding an all-zero frame after
    reindexing.

    Args:
        row: Series holding one record of ``df_train``.
        feature_names: Columns to take from the row, in training order.
        categorical_names: Subset of ``feature_names`` to one-hot encode.
        training_columns: Columns of the matrix the model was fitted on;
            dummy columns missing for this row are filled with 0.

    Returns:
        A one-row float DataFrame with exactly ``training_columns``.
    """
    single_row = row[feature_names].to_frame().T
    encoded = pd.get_dummies(single_row, columns=categorical_names)
    encoded.columns = encoded.columns.astype(str)
    encoded = encoded.reindex(columns=training_columns, fill_value=0)
    # to_frame().T leaves the numeric features as object dtype; cast everything.
    return encoded.astype(float)


for index, row in df_train[df_train.isna().any(axis=1)].iterrows():
    companion_missing = pd.isna(row['travel_with'])
    males, females = row['total_male'], row['total_female']
    group_total = males + females  # NaN when either head-count is missing
    # "No travellers of this sex" covers both an explicit 0 and a missing count.
    no_males = pd.isna(males) or males == 0
    no_females = pd.isna(females) or females == 0

    if companion_missing and group_total > 1 and (males == 0 or females == 0):
        # Several people of a single sex: predict the companion type with Model 1.
        encoded = _encode_row_for(row, features, features_cat, X_train.columns)
        df_train.loc[index, 'travel_with'] = rf.predict(encoded)[0]

    elif companion_missing and group_total > 1:
        # Mixed group: predict the companion type with Model 2.
        encoded = _encode_row_for(row, features, features_cat, X_train2.columns)
        df_train.loc[index, 'travel_with'] = rf2.predict(encoded)[0]

    elif not companion_missing and no_males and no_females:
        # Companion type is known but nobody is counted: rebuild the head-counts.
        print("Brakuje liczby osob")
        persons = round(row['total_cost'] / mean, 0)
        companion = row['travel_with']

        if companion == 'Alone':
            print('predykcja płci')
            # The pipeline does its own encoding, so it takes the raw one-row frame.
            input_df = row[features3].to_frame().T
            predicted_value = model_pipeline.predict(input_df)
            if predicted_value[0] == 'Male':
                df_train.loc[index, 'total_male'] = 1
                df_train.loc[index, 'total_female'] = 0
            else:
                df_train.loc[index, 'total_male'] = 0
                df_train.loc[index, 'total_female'] = 1

        elif companion == 'Spouse':
            df_train.loc[index, 'total_male'] = 1
            df_train.loc[index, 'total_female'] = 1

        elif companion == 'Spouse and Children':
            # Two adults plus at least one child; also covers an unknown cost,
            # where the estimate is NaN and could not be rounded below.
            if pd.isna(persons) or persons < 3:
                persons = 3
            print(f"Predykcja płci dziecka, przy {persons} wszystkich osobach")
            persons = persons - 2  # children only from here on

            # Raw predictions of the number of children of each sex.
            encoded = _encode_row_for(row, features3, features_cat3, X_train_child.columns)
            pred_male, pred_female = multioutput_rf.predict(encoded)[0]

            # Rescale the predictions to the known number of children.
            pred_sum = pred_male + pred_female
            ratio_male = 0.5 if pred_sum == 0 else pred_male / pred_sum  # guard against 0/0

            male_children_final = round(persons * ratio_male)
            female_children_final = persons - male_children_final

            # One adult of each sex plus the predicted children.
            df_train.loc[index, 'total_male'] = 1 + male_children_final
            df_train.loc[index, 'total_female'] = 1 + female_children_final

            print(f"Predykcja: {male_children_final} chłopców oraz {female_children_final} dziewczyn, przy {persons} żądanych osobach")

        elif companion == 'Children':
            # Not implemented yet: only reports that a prediction is needed.
            print("Predykcja płci dziecka")
        elif companion == 'Friends/Relatives':
            # Not implemented yet: only reports that a prediction is needed.
            print("Predykcja podziału")
        else:
            print(f"{row['travel_with']}"
                  f"\n{row['total_male']}"
                  f"\n{row['total_female']}")

    elif companion_missing and group_total == 0:
        # Nothing is known about the party: estimate its size from the cost.
        persons = round(row['total_cost'] / mean, 0)

        if persons <= 1:
            df_train.loc[index, 'travel_with'] = 'Alone'
        else:
            # Not implemented yet: only reports that a prediction is needed.
            print('predykcja płci i rozłożenia')

    elif companion_missing:
        # Unhandled combination: dump the fields for manual inspection.
        print(f"{row['travel_with']}"
              f"\n{row['total_male']}"
              f"\n{row['total_female']}")


# Refresh the cost per person for every row that now has both head-counts
# and a non-empty party.
people_total = df_train['total_male'] + df_train['total_female']
mask_valid = (
    df_train['total_male'].notna()
    & df_train['total_female'].notna()
    & (people_total != 0)
)
df_train.loc[mask_valid, 'total_cost_per_person'] = (
    df_train.loc[mask_valid, 'total_cost'] / people_total[mask_valid]
)

# Rows still containing at least one missing value after imputation.
count3 = df_train.isna().any(axis=1).sum()

# Report how the number of incomplete rows changed across the cleaning stages.
print("Początkowa liczba wierszy z brakującymi wartościami:", count)
print("Liczba wierszy z brakującymi wartościami po uzupełnieniu braków w kolumnie most_impressing:", count2)
print("Końcowa liczba wierszy z brakującymi wartościami:", count3)

# List every column that still has missing values, with its NaN count.
for column, empty_count in df_train.isna().sum().items():
    if empty_count > 0:
        print(f"Column '{column}' has {empty_count} empty fields (NaN).")

# Drop the free-text column and the temporary gender label, then expose the
# party size as a single feature (NaN when either head-count is still missing).
df_train.drop(columns=["most_impressing", "gender"], inplace=True)
df_train["total_people"] = df_train["total_male"] + df_train["total_female"]
