In [None]:
import pandas as pd
import jupyter_black
import black
import gc
from tqdm import tqdm
from pandas_profiling import ProfileReport
from datetime import datetime
import numpy as np

# Run a full garbage-collection pass before the heavy data loading starts.
gc.collect()

# Formatting options handed to jupyter_black for this notebook session.
black_settings = dict(
    lab=False,
    line_length=100,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)
jupyter_black.load(**black_settings)

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [None]:
# Load the pickled product tables and discard the final two rows in one step.
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
# NOTE(review): why the last two rows are dropped is not visible here - confirm and document.
df = pd.read_pickle("../tmp/product_tables.pickle")[:-2]

In [None]:
# Keep only the columns that are populated in at least `threshold` of the rows

threshold = 0.5

# dropna(thresh=...) takes the minimum number of non-null values a column must have to survive.
min_non_null = int(threshold * len(df))
df = df.dropna(axis="columns", thresh=min_non_null)

In [None]:
# Drop the columns that play no part in the rest of the analysis

unused_columns = [
    # inclusion / exclusion tables
    "pdt_incexcl_LOCALECODE",
    "pdt_inclexcl_TYPE",
    "pdt_incexcl_INCLUSIONEXCLUSIONCLASSTYPE",
    "pdt_inclexcl_ENG_INCLUSIONEXCLUSIONCLASSTYPE",
    "pdt_incexcl_TYPE",
    "pdt_inclexcl_ENG_TYPE",
    # itinerary table
    "pdt_itinerary_POIID",
    "pdt_itinerary_POIORDER",
    "pdt_itinerary_POITYPEID",
    "pdt_itinerary_POITYPE",
    "pdt_itinerary_ISPASSBYPOI",
    "pdt_itinerary_ADMISSIONINCLUSIONTYPE",
    "pdt_itinerary_ADMISSIONINCLUSIONTYPEDESCRIPTION",
    # tour grade codes
    "pdt_product_detail_TOURGRADECODE",
    "pdt_tourgrades_TOURGRADECODE",
    # remaining itinerary / destination fields
    "pdt_itinerary_POIDURATIONINMIN",
    "pdt_itinerary_TALOCATIONID",
    "pdt_inclexcl_ENG_VIDESTINATIONCITY",
    "pdt_itinerary_VIDESTINATIONCITY",
    # product-level table
    "pdt_product_level_SUPPLIERCODE",
    "pdt_product_level_SUPPLIERID",
    "pdt_product_level_CONFIRMATIONTYPE",
    "pdt_product_level_BOOKINGCUTOFFFIXEDHOURS",
    "pdt_product_level_HOURSINADVANCE",
    "pdt_product_level_HOURSINADVANCEREFERENCEPOINT",
    "pdt_product_level_HASFIXEDSTARTTIME",
    "pdt_product_level_PROFESSIONALVIDEOCOUNTLIVE",
    "pdt_product_level_ISSAPIPRODUCT",
    "pdt_product_level_SUPPLIERUPLOADEDPHOTOCOUNTS",
    "pdt_product_level_PROFESSIONALPHOTOCOUNTS",
    "pdt_product_level_RESTECH",
    "pdt_product_level_ISHOTELPICKUPAVAILABLE",
    "pdt_product_level_SUPPLIERCANCANCELONBADWEATHER",
    "pdt_product_level_TOTALAVGBUBBLERATING",
    "pdt_product_level_VIDESTINATIONCITY",
    "pdt_product_level_VIREVIEWCOUNT",
    "pdt_incexcl_VIDESTINATIONCITY",
    "pdt_product_level_VISUBCATEGORY",
    "pdt_product_level_CANCELLATIONPOLICY",
    "pdt_product_level_TAREVIEWCOUNT",
    "pdt_product_level_STATUS",
    "pdt_product_level_FIRSTPUBLISHEDDATE",
]

# A missing column raises KeyError, exactly as the individual `del` statements did.
df = df.drop(columns=unused_columns)

In [None]:
# Move the free-text columns into their own dataframe and persist it

descriptive_fields = [
    "pdt_incexcl_CONTENT",
    "pdt_inclexcl_INCLUSIONEXCLUSIONTEXT",
    "pdt_inclexcl_ENG_CONTENT",
    "pdt_itinerary_PRODUCTNAME",
    "pdt_itinerary_TAPOINAME",
    "pdt_product_detail_PRODUCTTITLE",
    "pdt_product_detail_PRODUCTDESCRIPTION",
    "pdt_product_detail_VIATOROVERVIEW",
    "pdt_product_detail_TOURGRADETITLE",
    "pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS",
    "pdt_product_detail_TOURGRADEDESCRIPTION",
    "pdt_product_level_PRODUCTNAME",
    "pdt_tourgrades_TITLE",
]

# PRODUCTCODE is kept alongside the text columns in the descriptive frame.
df_text = df[["PRODUCTCODE"] + descriptive_fields].copy()
df_text.to_pickle("../tmp/product_tables_descriptive.pickle")

# The working frame continues without the descriptive columns.
df = df.drop(columns=descriptive_fields)

In [None]:
# Flatten list-like cells: explode the frame on every column, one column at a time

# Snapshot the column labels up front; `df` is rebound on every iteration.
columns_to_explode = list(df.columns)

for column_name in tqdm(columns_to_explode):
    df = df.explode(column_name)

In [None]:
# Parse the first-published date into pandas datetimes.
# This column is dropped earlier in the notebook together with the other unused
# fields, so an unconditional lookup raises KeyError on Restart & Run All.
# Convert it only when it is actually present in the frame.
published_date_col = "pdt_product_level_FIRSTPUBLISHEDDATE"

if published_date_col in df.columns:
    df[published_date_col] = pd.to_datetime(df[published_date_col])

In [None]:
# Fill every missing entry of `df` with np.nan, modifying the frame in place.
# NOTE(review): presumably meant to normalise None to NaN in the object columns
# produced by explode - confirm this is the intent.
df.fillna(np.nan, inplace=True)

In [None]:
# Average the two rating columns row by row into a single mean rating.
# np.mean(a, b) does not do this: its second positional argument is `axis`,
# so passing the second Series there fails instead of averaging the two columns.
rating_columns = ["pdt_product_level_TAAVGRATING", "pdt_product_level_VIAVGRATING"]

# explode() leaves the exploded columns with object dtype, so coerce to numeric
# first; values that cannot be parsed become NaN.
ratings = df[rating_columns].apply(pd.to_numeric, errors="coerce")

# mean(axis=1) skips NaN by default: a row with only one rating gets that rating,
# and a row with neither rating gets NaN.
df["pdt_product_level_MEANRATING"] = ratings.mean(axis=1)

In [None]:
# Analyse tabular product data

# Build a pandas_profiling ProfileReport over the preprocessed frame; the result
# is stored in `profile` and displayed by the next cell.
profile = ProfileReport(df)

In [None]:
# Bare expression as the last line of the cell: the notebook displays the report.
profile