In [3]:
import plotly.express as px
import pandas as pd
import numpy as np

from src_clean.dataloader.dataloader import DataLoader
from src_clean.preprocessor.prepro_new import PreprocessorTwo

In [5]:
# Fetch the train and test frames from the project DataLoader; the plotting
# cells shown below only read `train_df`.
train_df, test_df = DataLoader.load_train_test_dfs()

## Outlier Boxplots

In [3]:
import plotly.graph_objects as go

# Feature groups for the outlier plots. Only `create_boxplot_features` is
# consumed in this cell; the other two lists are not referenced here.
FEATURES_TO_DISPLAY = ['price_usd', 'gross_bookings_usd']

create_boxplot_features = [
    'price_usd',
    'orig_destination_distance',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    'prop_log_historical_price',
    'comp1_rate_percent_diff',
    'comp2_rate_percent_diff',
    'comp3_rate_percent_diff',
    'comp4_rate_percent_diff',
    'comp5_rate_percent_diff',
    'comp6_rate_percent_diff',
    'comp7_rate_percent_diff',
    'comp8_rate_percent_diff',
]

extra_features = [
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    'prop_review_score',
    'prop_location_score1',
    'prop_location_score2',
    'prop_log_historical_price',
    'srch_query_affinity_score',
]

# One box trace per feature, built from the non-null values of that column.
# The values are passed as `x`, so the shared value axis is the x axis.
fig = go.Figure(
    data=[
        go.Box(x=train_df[column].dropna(), name=column)
        for column in create_boxplot_features
    ]
)
# NOTE(review): a log axis cannot place non-positive values -- confirm that
# zeros/negatives in these features are not needed in the plot.
fig.update_xaxes(type="log")
fig.update_layout(font={'size': 16})

fig.show(renderer="browser")

## Missing Data

In [7]:
# Share of missing entries per feature, restricted to the columns that contain
# at least one null value, sorted ascending for plotting.
train_df_with_nan_values = train_df.loc[:, train_df.isnull().any()]
# The denominator is the actual row count of the frame rather than a
# hard-coded number, so the shares stay correct if the loaded data changes.
missing_data_series = (train_df_with_nan_values.isna().sum() / len(train_df)).sort_values()

# Two-column frame for plotly express: the share and the feature it belongs to.
missing_data_plot_df = pd.DataFrame({
    'missing_data': missing_data_series.values,
    'feature': missing_data_series.index,
})

fig = px.bar(missing_data_plot_df, x='feature', y='missing_data')
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
# Shown twice on purpose as in the original cell: once through the "browser"
# renderer and once through the default renderer.
fig.show(renderer="browser")
fig.show()

In [8]:
from numpy.ma.core import transpose
def _missing_value_percentages(df: pd.DataFrame) -> pd.Series:
    """Return the percentage of null entries per column, sorted ascending.

    Columns without any null entry are left out. Comparing with ``> 0``
    (instead of ``!= 0``) also discards NaN results, which occur when ``df``
    has no rows.
    """
    missing_pct = df.isnull().mean() * 100
    return missing_pct[missing_pct > 0].sort_values()


def create_missingvalues(df: pd.DataFrame) -> None:
    """Draw a bar chart of the percentage of missing values per feature.

    Args:
        df: Frame whose columns are inspected for null entries.

    Returns:
        None. The figure is displayed as a side effect, once through the
        "browser" renderer and once through the default renderer.
    """
    missing_pct = _missing_value_percentages(df)

    # Values are scaled to percent so they match the "(%)" axis label.
    fig = px.bar(y=missing_pct.values, x=missing_pct.index,
                 labels={'y': 'Missing values (%)', 'x': 'Features'}, template='plotly')
    fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                      marker_line_width=1.5, opacity=0.6)
    fig.update_layout(font={'size': 16})
    fig.show(renderer="browser")
    # NOTE(review): `scale` is forwarded as a renderer option; presumably it
    # only matters for static-image renderers -- confirm it is still wanted.
    fig.show(scale=2)


# Plot the per-feature missing-value shares of the training frame.
create_missingvalues(train_df)

In [13]:
import calendar

def extract_booking_travel_date(df: pd.DataFrame) -> pd.DataFrame:
    """Build a frame with the search date, the travel date and a click/booking score.

    The input frame is left untouched; all derived columns live only in the
    returned frame, so re-running the calling cell does not change `df`.

    Args:
        df: Frame providing the columns ``date_time``, ``srch_booking_window``,
            ``booking_bool`` and ``click_bool``.

    Returns:
        A new DataFrame, indexed like ``df``, with the columns
        ``date_time`` (parsed with ``pd.to_datetime``), ``travel_time``
        (``date_time`` plus ``srch_booking_window`` days) and ``booking``
        (``booking_bool + click_bool``).
    """
    search_time = pd.to_datetime(df['date_time'], format='%Y-%m-%d')
    # The booking window is interpreted as a number of days after the search.
    travel_time = search_time + pd.to_timedelta(df['srch_booking_window'], unit='D')
    return pd.DataFrame({
        'date_time': search_time,
        'travel_time': travel_time,
        'booking': df['booking_bool'] + df['click_bool'],
    })


# Search date, travel date and click+booking score for every training row.
df_month = extract_booking_travel_date(train_df)

# Number of rows for each (travel date, score) combination.
data = (
    df_month
    .groupby(['travel_time', 'booking'])['booking']
    .count()
    .reset_index(name='counts')
)
# Abbreviated month name, used to colour the bars.
data['month'] = data['travel_time'].dt.month.map(lambda month_number: calendar.month_abbr[month_number])
# Every date is rewritten with the year 2013 so that all travel dates land on
# one shared calendar-year axis.
data["travel_time"] = data["travel_time"].dt.strftime('2013-%m-%d')

# Only rows whose score is above 1 (both flags set) are drawn.
fig = px.bar(
    data[data['booking'] > 1],
    x="travel_time",
    y="counts",
    color='month',
    labels={'counts': 'Bookings', 'travel_time': 'Travel Month'},
)
# NOTE(review): an earlier draft tried to pin the legend order through
# `category_orders`; if that is revived, pass the month names as a list.
fig.update_traces(opacity=1)
fig.update_xaxes(dtick="M1", tickformat='%b', position=1)
fig.update_layout(font={'size': 16})

fig.show(renderer="browser")
fig.show(scale=2)