In [1]:
import pandas as pd
import numpy as np
import os
import benchmark


def add_engineered_features(df):
    return train_df


def preprocessing(train_path="", test_path=""):
    """Preprocesses the data after the first engineered features are added
        Args:
            - train_df (DataFrame): dataframe containing the training data
            - test_df (DataFrame): dataframe containing the test data
    """

    to_drop = ["visitor_hist_starrating"
              ,"visitor_hist_adr_usd"
              ,"srch_query_affinity_score"
              ,"gross_bookings_usd"
              ,"click_bool"
              ,"booking_bool"
              ,"position"
              ]

    to_decide = ["prop_country_id"
                ,"visitor_location_country_id"
                ,"srch_destination_id"
                ]

    to_fill = {"prop_review_score": "zero"
              ,"prop_location_score2": "median"
              }

    train_df = load_train()
    test_df = load_test()

    # prior_dict = get_prior_dict(train_df)


    # Drop columns
    train_df = train_df.drop(to_drop, axis=1)
    train_df = train_df.drop(to_decide, axis=1)

    # Fill NaNs
    train_df["prop_review_score"] = train_df["prop_review_score"].fillna(0)
    train_df["prop_location_score2"] = train_df["prop_location_score2"].fillna(train_df["prop_location_score2"].median())

    # Categorize
    train_df = categorize(train_df, "site_id")
    
    train_df = summarize_competitor_information(train_df)
    
    return train_df


def categorize(train_df, column):
    if column == "site_id":
        for i in train_df["site_id"].unique():
            train_df[f"site_{i}"] = train_df["site_id"] == i
        return train_df.drop(["site_id"], axis=1)

In [26]:
def load_train(path=os.path.join("data","training_set_VU_DM.csv")):
    return pd.read_csv(path)

def load_test(path=os.path.join("data", "test_set_VU_DM.csv")):
    return pd.read_csv(path)

In [12]:
def summarize_competitor_information(df):
    """Summarizes competitor information in
       Min, Mean, Max values.
    """

    for c in range(1, 9):

        # Handle nans in competitor information

        # Combine the sign of rate with rate_percent_diff
        df[f"comp{c}_signed_rate_percent_diff"] = df[f"comp{c}_rate"] * df[f"comp{c}_rate_percent_diff"]
        
    diff_names = [f"comp{c}_signed_rate_percent_diff" for c in range(1, 9)]
    
    competitor_diff_df = df[diff_names]

    # Add minimum of signed_rate_percent_diff between competors
    df["comp_diff_min"] = competitor_diff_df.min(axis=1)

    # Add maximum of signed_rate_percent_diff between competors
    df["comp_diff_max"] = competitor_diff_df.max(axis=1)

    # Add mean of signed_rate_percent_diff between competors
    df["comp_diff_mean"] = competitor_diff_df.mean(axis=1)

    # Add number of competitors that present this property
    df["comp_count"] = competitor_diff_df.count(axis=1)
    
    # Comp inv categorization + not boolean but count of number of competitors for the value?
    inv_names = [f"comp{c}_inv" for c in range(1, 9)]
    inv_df = df[inv_names].replace("nan", np.nan)
    inv_allnans = inv_df.isnull().all(1)
    
    min_id = competitor_diff_df.idxmin(axis=1)
    max_id = competitor_diff_df.idxmax(axis=1)
    
    min_list = []
    max_list = []

    for min_, max_ , (_, vals) in zip(min_id, max_id, inv_df.iterrows()):
        if isinstance(min_, str):
            min_list.append(vals[f"{min_[:5]}_inv"])
            max_list.append(vals[f"{max_[:5]}_inv"])
        else:
            
            # Current NAN indicator !!!==========================!!!!!!!!!!! Please fix
            min_list.append(np.nan)
            max_list.append(np.nan)
    
    df["comp_min_inv"] = min_list
    df["comp_max_inv"] = max_list
    df["comp_mean_inv"] = inv_df.mean(axis=1)
    
    comp_names = [f"comp{c}_rate_percent_diff" for c in range(1, 9)]
    comp_rate_names = [f"comp{c}_rate" for c in range(1, 9)]
    df = df.drop(columns=diff_names+comp_names+inv_names+comp_rate_names)
    
    return df
    
#     # Per min, mean en max de relevante bool neerzetten. er komen 3 kolommen voor.
#     pass

    
df = preprocessing()
df

Unnamed: 0,srch_id,date_time,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,...,site_6,site_30,site_3,comp_diff_min,comp_diff_max,comp_diff_mean,comp_count,comp_min_inv,comp_max_inv,comp_mean_inv
0,1,2013-04-04 08:32:15,893,3,3.5,1,2.83,0.0438,4.95,104.77,...,False,False,False,,,,0,,,0.000000
1,1,2013-04-04 08:32:15,10404,4,4.0,1,2.20,0.0149,5.03,170.74,...,False,False,False,,,,0,,,0.333333
2,1,2013-04-04 08:32:15,21315,3,4.5,1,2.20,0.0245,4.92,179.80,...,False,False,False,,,,0,,,0.000000
3,1,2013-04-04 08:32:15,27348,2,4.0,1,2.83,0.0125,4.39,602.77,...,False,False,False,-5.0,-5.0,-5.0,3,0.0,0.0,0.250000
4,1,2013-04-04 08:32:15,29604,4,3.5,1,2.64,0.1241,4.93,143.58,...,False,False,False,,,,0,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,13431,2013-03-25 10:55:36,101543,4,4.5,1,1.10,0.0697,5.28,143.00,...,False,False,False,0.0,11.0,5.5,2,0.0,0.0,0.250000
199996,13431,2013-03-25 10:55:36,101874,3,3.5,1,1.79,0.0697,4.94,105.00,...,False,False,False,,,,0,,,0.000000
199997,13431,2013-03-25 10:55:36,103830,3,4.0,1,1.10,0.0697,5.29,146.00,...,False,False,False,0.0,0.0,0.0,1,0.0,0.0,0.000000
199998,13431,2013-03-25 10:55:36,112479,4,4.0,1,4.08,0.0697,5.62,259.00,...,False,False,False,,,,0,,,0.000000


In [23]:
    def k_fold_segmentation(train_df, k=10):
        folds = [[] for _ in range(k)]

        for s in train_df["srch_id"].unique():
            i = np.random.randint(0, 10)
            folds[i].append(s)
    #     for f in folds:
    #         print(len(f))
        
k_fold_segmentation(df)

815
773
861
801
787
807
827
759
826
813


In [28]:
test = load_test()
train = load_train()
testsites = test["site_id"].unique()
trainsites = train["site_id"].unique()
print(set(testsites), len(trainsites))

34 34


In [30]:
print(set(testsites) & set(trainsites))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}
