In [9]:
import numpy as np
import pandas as pd
import time


def add_nan_column(df, cols):
    start_time = time.time()

    for col in cols:
        df[str(col) + '_is_nan'] = df[col].isna().astype(np.int8)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("add_nan_column runtime: %.2f seconds" % elapsed_time)

    return df


def add_zero_column(df, cols):
    start_time = time.time()

    for col in cols:
        df[str(col) + '_is_zero'] = df[col].eq(0).astype(np.int8)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("add_zero_column runtime: %.2f seconds" % elapsed_time)

    return df


#def calc_price_per_night(df: pd.DataFrame):


# Function to impute the prop_location_score2 values that are NaN based on the prop_location_score2 values of the same prop_id
def impute_prop_loc_score2_using_future_score2(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Retrieve the prop_ids that have a change in prop_location_score2 from NaN to a float over time
    has_change_over_time = df.groupby('prop_id')['prop_location_score2'].apply(lambda x: x.diff().notna().any())
    prop_ids_change = has_change_over_time[has_change_over_time == True].index

    # For each prop_id that has a change in prop_location_score2 from NaN to a float over time, impute the NaN values with the first non-NaN value
    for prop_id in prop_ids_change:
        mask = (df['prop_id'] == prop_id) & (df['prop_location_score2'].isnull())
        non_nan_value = df.loc[(df['prop_id'] == prop_id) & (df['prop_location_score2'].notna()), 'prop_location_score2'].values[0]
        df.loc[mask, 'prop_location_score2'] = non_nan_value

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_prop_loc_score2_using_future_score2 runtime: %.2f seconds" % elapsed_time)

    return df


# Impute remaning NaN values using the mean prop_locatioin_score2 values for each prop_location_score1 value
def impute_prop_location_score2_using_mean_score1(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Create an extra column with rounded 'prop_location_score1'
    df['rounded_score1'] = df['prop_location_score1'].round()

    # Calculate the mean of 'prop_location_score2' for each rounded integer value
    mean_score2_by_rounded = df.groupby('rounded_score1')['prop_location_score2'].mean()

    # Impute missing values of 'prop_location_score2' based on rounded integer values
    df['prop_location_score2'].fillna(df['rounded_score1'].map(mean_score2_by_rounded), inplace=True)

    # Drop the extra column 'rounded_score1' if no longer needed
    df.drop('rounded_score1', axis=1, inplace=True)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_prop_location_score2_using_mean_score1 runtime: %.2f seconds" % elapsed_time)

    return df


def impute_prop_review_score(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Calculate the mean of 'prop_review_score' for each rounded integer value
    mean_review_score_for_starrating = df.groupby('prop_starrating')['prop_review_score'].mean()

    # Impute missing values of 'prop_review_score' based on rounded integer values
    df['prop_review_score'].fillna(df['prop_starrating'].map(mean_review_score_for_starrating), inplace=True)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_prop_review_score runtime: %.2f seconds" % elapsed_time)

    return df


#def impute_prop_starrating(df: pd.DataFrame):

#def impute_query_affinity_score(df: pd.DataFrame):


# Calculate the mean distance per hotel and impute the NaN values with the mean distance
def impute_orig_destination_distance(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Calculate the mean distance per hotel
    mean_distance_per_hotel = df.groupby('prop_id')['orig_destination_distance'].mean()
    
    # Impute the NaN values with the mean distance
    df['orig_destination_distance'].fillna(df['prop_id'].map(mean_distance_per_hotel), inplace=True)

    # BUT there are also hotels that contain only NaN values for orig_destination_distance, 
    # for these we impute it with the mean distance for the country_id
    # Use visitor_location_country_id or country_id?

    # Calculate the mean distance per country
    mean_distance_per_country = df.groupby('visitor_location_country_id')['orig_destination_distance'].mean()

    # Impute the NaN values with the mean distance
    df['orig_destination_distance'].fillna(df['visitor_location_country_id'].map(mean_distance_per_country), inplace=True)
    
    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_orig_destination_distance runtime: %.2f seconds" % elapsed_time)

    return df


def define_target(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    df['target'] = df['click_bool'] +  df['booking_bool'] * 5

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("define_target runtime: %.2f seconds" % elapsed_time)

    return df


def one_hot_encode(df, cols):
    start_time = time.time()

    for col in cols:
        df = pd.get_dummies(df, columns=[col], prefix=col)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("one_hot_encode runtime: %.2f seconds" % elapsed_time)

    return df


def delete_id_columns(df: pd.DataFrame)-> pd.DataFrame:
    start_time = time.time()

    for col in df.columns:
        if col.endswith('_id'):
            df = df.drop(col, axis=1)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("delete_id_columns runtime: %.2f seconds" % elapsed_time)

    return df


def drop_nan_columns(df: pd.DataFrame)-> pd.DataFrame:
    start_time = time.time()

    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(col)
            df = df.drop(col, axis=1)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("drop_nan_columns runtime: %.2f seconds" % elapsed_time)

    return df


def print_columns_containing_string(df):
    matching_columns = [col for col in df.columns if df[col].dtype == 'object']
    if matching_columns:
        print("Columns containing string values:")
        for col in matching_columns:
            print(col)


def remove_column(df, column_name):
    df = df.drop(column_name, axis=1)
    return df


#rescale all columns to [0,1]
def rescaler(df):
    for col in df.columns:
        # except for the id columns
        if not col.endswith('_id'):
            df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df


def main(df : pd.DataFrame) -> pd.DataFrame:
    df = remove_column(df, 'date_time')
    df = add_nan_column(df, ['prop_review_score', 'srch_query_affinity_score'])
    df = add_zero_column(df, ['prop_review_score', 'prop_starrating'])
    #df = impute_prop_loc_score2_using_future_score2(df)
    df = impute_prop_location_score2_using_mean_score1(df)
    df = impute_prop_review_score(df)
    df = impute_orig_destination_distance(df)
    df = define_target(df)
    #df = one_hot_encode(df, ['prop_country_id', 'visitor_location_country_id','prop_id'])
    #df = delete_id_columns(df)
    df = drop_nan_columns(df)
    print(df)
    df = print_columns_containing_string(df)
    df = rescaler(df)
    df.to_csv('data_cleaned.csv', index=False)

df = pd.read_csv('/Users/noahv/Data-Mining-techniques/course_dmt/ass2/datasets/training_set_VU_DM.csv')
main(df)






add_nan_column runtime: 0.07 seconds
add_zero_column runtime: 0.08 seconds
impute_prop_location_score2_using_mean_score1 runtime: 13.34 seconds
impute_prop_review_score runtime: 0.26 seconds
impute_orig_destination_distance runtime: 0.68 seconds
define_target runtime: 0.08 seconds
visitor_hist_starrating
visitor_hist_adr_usd
srch_query_affinity_score
comp1_rate
comp1_inv
comp1_rate_percent_diff
comp2_rate
comp2_inv
comp2_rate_percent_diff
comp3_rate
comp3_inv
comp3_rate_percent_diff
comp4_rate
comp4_inv
comp4_rate_percent_diff
comp5_rate
comp5_inv
comp5_rate_percent_diff
comp6_rate
comp6_inv
comp6_rate_percent_diff
comp7_rate
comp7_inv
comp7_rate_percent_diff
comp8_rate
comp8_inv
comp8_rate_percent_diff
gross_bookings_usd
drop_nan_columns runtime: 29.91 seconds
         srch_id  site_id  visitor_location_country_id  prop_country_id  \
0              1       12                          187              219   
1              1       12                          187              219   
2  

AttributeError: 'NoneType' object has no attribute 'columns'

CLEAN THE TEST DATA

In [None]:
import numpy as np
import pandas as pd
import time


def add_nan_column(df, cols):
    start_time = time.time()

    for col in cols:
        df[str(col) + '_is_nan'] = df[col].isna().astype(np.int8)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("add_nan_column runtime: %.2f seconds" % elapsed_time)

    return df


def add_zero_column(df, cols):
    start_time = time.time()

    for col in cols:
        df[str(col) + '_is_zero'] = df[col].eq(0).astype(np.int8)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("add_zero_column runtime: %.2f seconds" % elapsed_time)

    return df


#def calc_price_per_night(df: pd.DataFrame):


# Function to impute the prop_location_score2 values that are NaN based on the prop_location_score2 values of the same prop_id
def impute_prop_loc_score2_using_future_score2(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Retrieve the prop_ids that have a change in prop_location_score2 from NaN to a float over time
    has_change_over_time = df.groupby('prop_id')['prop_location_score2'].apply(lambda x: x.diff().notna().any())
    prop_ids_change = has_change_over_time[has_change_over_time == True].index

    # For each prop_id that has a change in prop_location_score2 from NaN to a float over time, impute the NaN values with the first non-NaN value
    for prop_id in prop_ids_change:
        mask = (df['prop_id'] == prop_id) & (df['prop_location_score2'].isnull())
        non_nan_value = df.loc[(df['prop_id'] == prop_id) & (df['prop_location_score2'].notna()), 'prop_location_score2'].values[0]
        df.loc[mask, 'prop_location_score2'] = non_nan_value

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_prop_loc_score2_using_future_score2 runtime: %.2f seconds" % elapsed_time)

    return df


# Impute remaning NaN values using the mean prop_locatioin_score2 values for each prop_location_score1 value
def impute_prop_location_score2_using_mean_score1(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Create an extra column with rounded 'prop_location_score1'
    df['rounded_score1'] = df['prop_location_score1'].round()

    # Calculate the mean of 'prop_location_score2' for each rounded integer value
    mean_score2_by_rounded = df.groupby('rounded_score1')['prop_location_score2'].mean()

    # Impute missing values of 'prop_location_score2' based on rounded integer values
    df['prop_location_score2'].fillna(df['rounded_score1'].map(mean_score2_by_rounded), inplace=True)

    # Drop the extra column 'rounded_score1' if no longer needed
    df.drop('rounded_score1', axis=1, inplace=True)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_prop_location_score2_using_mean_score1 runtime: %.2f seconds" % elapsed_time)

    return df


def impute_prop_review_score(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Calculate the mean of 'prop_review_score' for each rounded integer value
    mean_review_score_for_starrating = df.groupby('prop_starrating')['prop_review_score'].mean()

    # Impute missing values of 'prop_review_score' based on rounded integer values
    df['prop_review_score'].fillna(df['prop_starrating'].map(mean_review_score_for_starrating), inplace=True)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_prop_review_score runtime: %.2f seconds" % elapsed_time)

    return df


#def impute_prop_starrating(df: pd.DataFrame):

#def impute_query_affinity_score(df: pd.DataFrame):


# Calculate the mean distance per hotel and impute the NaN values with the mean distance
def impute_orig_destination_distance(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    # Calculate the mean distance per hotel
    mean_distance_per_hotel = df.groupby('prop_id')['orig_destination_distance'].mean()
    
    # Impute the NaN values with the mean distance
    df['orig_destination_distance'].fillna(df['prop_id'].map(mean_distance_per_hotel), inplace=True)

    # BUT there are also hotels that contain only NaN values for orig_destination_distance, 
    # for these we impute it with the mean distance for the country_id
    # Use visitor_location_country_id or country_id?

    # Calculate the mean distance per country
    mean_distance_per_country = df.groupby('visitor_location_country_id')['orig_destination_distance'].mean()

    # Impute the NaN values with the mean distance
    df['orig_destination_distance'].fillna(df['visitor_location_country_id'].map(mean_distance_per_country), inplace=True)
    
    # Print the runtime
    elapsed_time = time.time() - start_time
    print("impute_orig_destination_distance runtime: %.2f seconds" % elapsed_time)

    return df


def define_target(df : pd.DataFrame) -> pd.DataFrame:
    start_time = time.time()

    df['target'] = df['click_bool'] +  df['booking_bool'] * 5

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("define_target runtime: %.2f seconds" % elapsed_time)

    return df


def one_hot_encode(df, cols):
    start_time = time.time()

    for col in cols:
        df = pd.get_dummies(df, columns=[col], prefix=col)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("one_hot_encode runtime: %.2f seconds" % elapsed_time)

    return df


def delete_id_columns(df: pd.DataFrame)-> pd.DataFrame:
    start_time = time.time()

    for col in df.columns:
        if col.endswith('_id'):
            df = df.drop(col, axis=1)

    # Print the runtime
    elapsed_time = time.time() - start_time
    print("delete_id_columns runtime: %.2f seconds" % elapsed_time)

    return df


#rescale all columns to [0,1]
def rescaler(df):
    for col in df.columns:
        # except for the id columns
        if col.endswith('_id') is False:
            df = df.with_columns(
                (pl.col(col) - pl.col(col).min()) / (pl.col(col).max() - pl.col(col).min())
            )
    return df


def main(df : pd.DataFrame) -> pd.DataFrame:
    df = add_nan_column(df, ['prop_review_score', 'srch_query_affinity_score'])
    df = add_zero_column(df, ['prop_review_score', 'prop_starrating'])
    #df = impute_prop_loc_score2_using_future_score2(df)
    df = impute_prop_location_score2_using_mean_score1(df)
    df = impute_prop_review_score(df)
    df = impute_orig_destination_distance(df)
    df = define_target(df)
    df = one_hot_encode(df, ['prop_country_id', 'visitor_location_country_id','prop_id'])
    df = delete_id_columns(df)
    df = rescaler(df)
    df.to_csv('test_data_cleaned.csv', index=False)

df = pd.read_csv('/Users/noahv/Data-Mining-techniques/course_dmt/ass2/datasets/test_set_VU_DM.csv')
main(df)




