In [1]:
import pandas as pd
import jupyter_black
import black
import gc
from tqdm import tqdm
from pandas_profiling import ProfileReport
from handlers import *
import numpy as np

# Force a full garbage-collection pass before any data is loaded.
gc.collect()

# Turn on automatic black formatting of the notebook's cells: 100-character lines,
# Python 3.10 target syntax. verbosity="DEBUG" is what produces the DEBUG config line
# in this cell's output.
jupyter_black.load(
    lab=False,
    line_length=100,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)

# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

  from pandas_profiling import ProfileReport
DEBUG:jupyter_black:config: {'line_length': 100, 'target_versions': {<TargetVersion.PY310: 10>}}


<IPython.core.display.Javascript object>

In [2]:
# Load the product tables from the pickled DataFrame on disk.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle("../tmp/product_tables.pickle")
# Drop the last two rows.
# NOTE(review): the reason is not visible in this notebook — presumably trailing
# non-product rows; confirm.
df = df[:-2]

In [3]:
# Remove columns in which more than half of the elements are empty.
# `thresh` is the minimum number of non-null values a column needs in order to be kept.
# The value is rounded up (rather than truncated with int()) so that, for an odd number
# of rows, a column that is more than half empty is not kept by mistake
# (e.g. 5 rows -> thresh 3, not 2).

threshold = 0.5
df = df.dropna(thresh=int(np.ceil(threshold * len(df))), axis=1)

In [5]:
# Create a separate dataframe with the descriptive (free-text) content

descriptive_fields = [
    "pdt_incexcl_CONTENT",
    "pdt_inclexcl_INCLUSIONEXCLUSIONTEXT",
    "pdt_inclexcl_ENG_CONTENT",
    "pdt_itinerary_PRODUCTNAME",
    "pdt_itinerary_TAPOINAME",
    "pdt_product_detail_PRODUCTTITLE",
    "pdt_product_detail_PRODUCTDESCRIPTION",
    "pdt_product_detail_VIATOROVERVIEW",
    "pdt_product_detail_TOURGRADETITLE",
    "pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS",
    "pdt_product_detail_TOURGRADEDESCRIPTION",
    "pdt_product_level_PRODUCTNAME",
    "pdt_tourgrades_TITLE",
]

# Select the needed columns first and copy only those, so the wide frame is never
# duplicated in full. PRODUCTCODE is kept as the key shared with the tabular frame.
df_text = df[["PRODUCTCODE"] + descriptive_fields].copy()

# Remove all descriptive columns from the tabular frame in a single call.
df = df.drop(columns=descriptive_fields)

# Persist the text columns to disk.
df_text.to_pickle("../tmp/product_tables_descriptive.pickle")

In [6]:
# Explode df: expand list-like cells into one row per element, one column at a time.
# Rows produced from the same original row keep that row's index label, so the index
# may no longer be unique afterwards.

for col in tqdm(df.columns):

    df = df.explode(col)

100%|██████████| 15/15 [00:00<00:00, 26.25it/s]


In [7]:
# Fill missing values with np.nan so later numeric checks see a single missing marker.
# The result is reassigned instead of using inplace=True, which avoids mutating the
# frame in place.
df = df.fillna(np.nan)

  df.fillna(np.nan, inplace=True)


In [8]:
# Remove products without pdt_product_detail_VIDESTINATIONCITY
# (rows whose destination city is missing are dropped; all other rows are kept).

df = df.dropna(subset=['pdt_product_detail_VIDESTINATIONCITY'])

In [9]:
# Compute mean of pdt_product_level_MINFLEXIBLEDURATION and pdt_product_level_MAXFLEXIBLEDURATION

def fill_fixed_duration(row):
    """Return a product's fixed duration, falling back to the flexible-duration midpoint.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``pdt_product_level_FIXEDDURATION``,
            ``pdt_product_level_MINFLEXIBLEDURATION`` and
            ``pdt_product_level_MAXFLEXIBLEDURATION``.

    Returns:
        The fixed duration when it is present; otherwise the mean of the minimum and
        maximum flexible durations.
    """
    fixed_duration = row["pdt_product_level_FIXEDDURATION"]
    # pd.isna also recognises None and pd.NA, on which np.isnan raises a TypeError.
    if pd.isna(fixed_duration):
        return (
            row["pdt_product_level_MINFLEXIBLEDURATION"]
            + row["pdt_product_level_MAXFLEXIBLEDURATION"]
        ) / 2
    return fixed_duration

# Apply the fallback row by row (axis=1) and overwrite the fixed-duration column with the result.
df['pdt_product_level_FIXEDDURATION'] = df.apply(fill_fixed_duration, axis=1)

In [10]:
# Remove both flexible-duration bound columns from the tabular frame.
df = df.drop(
    columns=[
        "pdt_product_level_MINFLEXIBLEDURATION",
        "pdt_product_level_MAXFLEXIBLEDURATION",
    ]
)

In [11]:
# Outlier detection, then creation of bin intervals, for the float and int columns

numerical_cols = [
    "pdt_product_level_RETAILPRICEFROMUSD",
    "pdt_product_level_FIXEDDURATION",
    "pdt_product_level_STOPSCOUNT",
    "pdt_product_level_STOPSTOTALDURATION",
    "pdt_product_level_TOTALREVIEWCOUNT",
    "pdt_product_level_TOTALAVGRATING",
]

for numerical_col in numerical_cols:
    # Each column is handled completely (outliers first, bins second) before the next one,
    # and both helpers return the frame that the following step works on.
    df = detect_and_treat_outliers(df, numerical_col, method="zscore", threshold=3)
    df = auto_bin_intervals(df, numerical_col)

outliers for pdt_product_level_RETAILPRICEFROMUSD:
[3737.5876325480003, 3737.5876325480003, 4173.0, 3644.53405615915, 5314.94228094522, 3199.2338059000003, 4351.079326271811, 3309.552213, 6069.38, 5022.631597652351, 7773.12321899168, 5979.323855864261, 7892.70837228808, 6400.0, 15444.576994000001, 7225.85566505, 44127.36284, 7005.21885085, 4716.111903525, 4136.940266250001, 4136.940266250001, 4357.57708045, 4357.57708045, 30000.0, 3695.6666378500004, 3198.130621829, 3860.041064429, 3695.6666378500004, 3695.6666378500004, 3308.449028929, 4411.633099929, 4742.588321229, 3308.449028929, 3861.1442485000002, 3861.1442485000002, 20000.0, 10000.0, 3995.0, 5800.0, 13500.0, 24100.0, 26100.0, 25000.0, 8700.0, 20100.0, 10300.0, 17300.0, 3350.0, 22400.0, 12800.0, 11100.0, 15100.0, 9600.0, 13600.0, 14500.0, 16100.0, 9800.0, 16300.0, 8600.0, 10600.0, 26600.0, 21400.0, 15300.0, 5500.0]
pdt_product_level_RETAILPRICEFROMUSD 16
outliers for pdt_product_level_FIXEDDURATION:
[525600.0, 3000000.0, 3000000.

In [14]:
# Only keep the English columns of df_text (plus the PRODUCTCODE key)

english_text_columns = [
    "PRODUCTCODE",
    "pdt_inclexcl_ENG_CONTENT",
    "pdt_itinerary_PRODUCTNAME",
    "pdt_itinerary_TAPOINAME",
    "pdt_product_detail_PRODUCTTITLE",
    "pdt_product_detail_PRODUCTDESCRIPTION",
    "pdt_product_detail_TOURGRADETITLE",
    "pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS",
    "pdt_product_detail_TOURGRADEDESCRIPTION",
]
df_text = df_text[english_text_columns]

In [17]:
# Explode df_text: expand list-like cells into one row per element, one column at a time,
# printing the column name and the resulting frame shape after each step.
# NOTE(review): exploding several columns in sequence multiplies the rows of each product;
# the printed shapes grow from 21,296 rows after the first column to 1,876,506 rows after
# the last — confirm that this cross-product of text values is intended.

for col in tqdm(df_text.columns):

    print(col)
    df_text = df_text.explode(col)
    print(df_text.shape)

 33%|███▎      | 3/9 [00:00<00:00, 24.74it/s]

PRODUCTCODE
(21296, 9)
pdt_inclexcl_ENG_CONTENT
(59786, 9)
pdt_itinerary_PRODUCTNAME
(59786, 9)
pdt_itinerary_TAPOINAME
(237448, 9)
pdt_product_detail_PRODUCTTITLE
(237448, 9)
pdt_product_detail_PRODUCTDESCRIPTION


 67%|██████▋   | 6/9 [00:00<00:00, 11.63it/s]

(237448, 9)
pdt_product_detail_TOURGRADETITLE
(488521, 9)
pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS


 89%|████████▉ | 8/9 [00:00<00:00,  7.13it/s]

(488521, 9)
pdt_product_detail_TOURGRADEDESCRIPTION


100%|██████████| 9/9 [00:01<00:00,  4.72it/s]

(1876506, 9)





In [29]:
pdt_product_detail_PRODUCTDESCRIPTION, pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS, pdt_product_detail_TOURGRADEDESCRIPTION

Unnamed: 0,PRODUCTCODE,pdt_inclexcl_ENG_CONTENT,pdt_itinerary_PRODUCTNAME,pdt_itinerary_TAPOINAME,pdt_product_detail_PRODUCTTITLE,pdt_product_detail_PRODUCTDESCRIPTION,pdt_product_detail_TOURGRADETITLE,pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS,pdt_product_detail_TOURGRADEDESCRIPTION
0,100123P1,,,,Pyrenees-Barcelona Bike Tour,Our adventure starts off in La Cerdanya valley...,Summer 2019,,"You can choose between two dates.- Sun, 9 Jun ..."
1,100123P2,,,,Pyrenees Bike Tour,"In La Cerdanya, one of the largest valleys in ...",Summer 2019,,"In La Cerdanya, one of the largest valleys in ..."
2,100123P3,,,,Pyrenees-Costa Brava Bike Tour,We have planned this trip for those who love c...,Pyrenees-Costa Brava Bike Tour,,We have planned this trip for those who love c...
3,100123P4,,,,Trekking Tour Pyrenees - La Cerdanya,"In La Cerdanya, one of the largest valleys in ...",Trekking Tour Pyrenees - La Cerdanya,,"In La Cerdanya, one of the largest valleys in ..."
4,100123P5,,,,Hiking Tour Pyrenees - La Cerdanya,"In La Cerdanya, one of the largest valleys in ...",Hiking Tour Pyrenees - La Cerdanya,,"In La Cerdanya, one of the largest valleys in ..."
...,...,...,...,...,...,...,...,...,...
21294,9974P1,The Ice Bar Experience Admission Ticket,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
21294,9974P1,One Drink,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
21294,9974P1,Jacket and gloves,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
21294,9974P1,Extra drinks at the terrace (available to purc...,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n


In [None]:
# Set string features

string_features = [
    "PRODUCTCODE",
    "pdt_product_detail_VIDESTINATIONCITY",
    "pdt_product_level_VICATEGORY",
    "pdt_product_level_VIPRODUCTTYPE",
    "pdt_product_level_PRICINGUNITTYPE",
    "pdt_product_level_RETAILPRICECURRENCY",
]

# Cast each listed column to str.
# NOTE(review): depending on the pandas version, astype(str) may turn missing values into
# the literal string "nan" — confirm that this is acceptable for the profiling step.
for string_feature in string_features:
    df[string_feature] = df[string_feature].astype(str)

In [None]:
# Set boolean features
# NOTE(review): astype(bool) follows truthiness rules, so NaN and any non-empty string
# (including "False" or "N") become True — confirm the column holds only real
# booleans / 0-1 values before relying on the result.

df['pdt_product_level_ISPRIVATETOUR'] = df['pdt_product_level_ISPRIVATETOUR'].astype(bool)

In [None]:
# Analyse tabular product data: build a profiling report over the prepared tabular frame.
# NOTE(review): importing pandas_profiling emits a warning in the first cell's output —
# presumably the package's deprecation notice; confirm and consider migrating.

profile = ProfileReport(df)

In [None]:
# Display the profiling report as this cell's output.
profile

In [24]:
# Inspect the exploded text frame (the rendered output is truncated to the first and last rows).
df_text

Unnamed: 0,PRODUCTCODE,pdt_inclexcl_ENG_CONTENT,pdt_itinerary_PRODUCTNAME,pdt_itinerary_TAPOINAME,pdt_product_detail_PRODUCTTITLE,pdt_product_detail_PRODUCTDESCRIPTION,pdt_product_detail_TOURGRADETITLE,pdt_product_detail_VIATOROVERVIEWHIGHLIGHTS,pdt_product_detail_TOURGRADEDESCRIPTION
0,100123P1,,,,Pyrenees-Barcelona Bike Tour,Our adventure starts off in La Cerdanya valley...,Summer 2019,,"You can choose between two dates.- Sun, 9 Jun ..."
1,100123P2,,,,Pyrenees Bike Tour,"In La Cerdanya, one of the largest valleys in ...",Summer 2019,,"In La Cerdanya, one of the largest valleys in ..."
2,100123P3,,,,Pyrenees-Costa Brava Bike Tour,We have planned this trip for those who love c...,Pyrenees-Costa Brava Bike Tour,,We have planned this trip for those who love c...
3,100123P4,,,,Trekking Tour Pyrenees - La Cerdanya,"In La Cerdanya, one of the largest valleys in ...",Trekking Tour Pyrenees - La Cerdanya,,"In La Cerdanya, one of the largest valleys in ..."
4,100123P5,,,,Hiking Tour Pyrenees - La Cerdanya,"In La Cerdanya, one of the largest valleys in ...",Hiking Tour Pyrenees - La Cerdanya,,"In La Cerdanya, one of the largest valleys in ..."
...,...,...,...,...,...,...,...,...,...
21294,9974P1,The Ice Bar Experience Admission Ticket,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
21294,9974P1,One Drink,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
21294,9974P1,Jacket and gloves,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
21294,9974P1,Extra drinks at the terrace (available to purc...,The Ice Bar Experience at Icebarcelona,Icebarcelona,The Ice Bar Experience at Icebarcelona,Experience the first ice bar at the beach in t...,The Ice Bar Experience,Enjoy a unique experience at the world's first...,Clothing: Jacket and gloves\n1 Drink included\n
