In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load the raw training set in chunks of 100,000 rows. For every chunk the
# month of the search timestamp is derived, after which the raw timestamp and
# gross_bookings_usd are dropped; the reduced chunks are concatenated into df.
train_data_path = '../data/raw/training_set_VU_DM.csv'
list_of_chunks = []
for chunk in pd.read_csv(train_data_path, chunksize=100000):
    # Only the month of date_time is kept as a feature.
    chunk["search_month"] = pd.to_datetime(chunk["date_time"]).dt.month
    modified_chunk = chunk.drop(columns=["date_time", "gross_bookings_usd"])
    list_of_chunks.append(modified_chunk)
df = pd.concat(list_of_chunks, ignore_index=True)
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
df.info()





In [None]:
# Build a one-row-per-column overview of df: dtype, null statistics and
# cardinality for every column, plus distribution statistics for numeric
# columns or the most frequent value for non-numeric ones.
print("--- Comprehensive Data Overview ---")

summary_list = []

for col in df.columns:
    column_values = df[col]
    # The null mask is computed once and reused for both the count and the share.
    null_mask = column_values.isnull()

    col_summary = {
        'Feature': col,
        'Data Type': column_values.dtype,
        'Non-Null Count': column_values.count(),
        'Missing Count': null_mask.sum(),
        'Missing (%)': f"{null_mask.mean() * 100:.2f}%",
        'Unique Values': column_values.nunique()
    }

    if pd.api.types.is_numeric_dtype(column_values):
        col_summary['Mean'] = f"{column_values.mean():.2f}"
        col_summary['Median'] = f"{column_values.median():.2f}"
        col_summary['Std Dev'] = f"{column_values.std():.2f}"
        col_summary['Min'] = f"{column_values.min():.2f}"
        col_summary['25%'] = f"{column_values.quantile(0.25):.2f}"
        col_summary['75%'] = f"{column_values.quantile(0.75):.2f}"
        col_summary['Max'] = f"{column_values.max():.2f}"
    else:
        # Top value and its frequency for non-numeric data. mode() and
        # value_counts() each scan the whole column, so each is evaluated once.
        column_mode = column_values.mode()
        column_counts = column_values.value_counts()
        col_summary['Top Value'] = column_mode[0] if not column_mode.empty else 'N/A'
        col_summary['Top Freq'] = column_counts.iloc[0] if not column_counts.empty else 'N/A'

    summary_list.append(col_summary)

# Assemble all per-column records into a single frame in one step.
df_summary = pd.DataFrame(summary_list)

print(df_summary)

In [None]:
# Share of missing entries per column, expressed in percent of all rows.
null_counts = df.isna().sum()
missing_percentages = null_counts / len(df) * 100

# Keep only the columns that actually contain gaps, largest share first.
has_missing = missing_percentages > 0
missing_percentages_sorted = missing_percentages.loc[has_missing].sort_values(ascending=False)
print(missing_percentages_sorted)


In [None]:
# Horizontal bar chart of the missing-value share for every column that has
# at least one missing entry; skipped entirely when nothing is missing.
if not missing_percentages_sorted.empty:
    fig, ax = plt.subplots(figsize=(15, 7))
    missing_percentages_sorted.plot(kind='barh', color='purple', ax=ax)
    # The series is computed from df; the earlier title referred to a
    # "df_processed" frame that does not appear in this notebook.
    ax.set_title('Percentage of Missing Values per Column')
    # With kind='barh' the percentages run along the x-axis and the column
    # names along the y-axis, so the labels are assigned accordingly. The
    # x-tick rotation was dropped because the x ticks are now short numbers.
    ax.set_xlabel('Percentage Missing (%)')
    ax.set_ylabel('Column')
    fig.tight_layout()
    plt.show()

In [None]:
# Quick look at the raw orig_destination_distance column: a sample of values
# followed by a box plot of the whole column.
distance_values = df['orig_destination_distance']
# A fixed random_state makes the printed sample identical on every run, and
# capping n at the column length avoids the ValueError that sample(50) raises
# on frames with fewer than 50 rows.
print(distance_values.sample(n=min(50, len(distance_values)), random_state=42))

fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x=distance_values, ax=ax)
ax.set_title('Distribution of orig_destination_distance')
plt.show()

In [None]:
# Count plot of click_bool with each bar labelled by its share of all rows,
# followed by the normalized value counts as text.
print("--- Distribution of click_bool ---")
fig, ax = plt.subplots(figsize=(3, 6))
sns.countplot(data=df, x='click_bool', hue='click_bool', palette="pastel", legend=False, ax=ax)
ax.set_title('Distribution of click_bool')
ax.set_xlabel('Clicked (0=No, 1=Yes)')
ax.set_ylabel('Count')

# The denominator is the number of rows in the column, not the number of clicks.
total_clicks = len(df['click_bool'])
if total_clicks > 0:
    for bar in ax.patches:
        bar_height = bar.get_height()
        share_label = f'{100 * bar_height / total_clicks:.1f}%'
        bar_centre = bar.get_x() + bar.get_width() / 2
        # Anchor the label at the top centre of the bar.
        ax.annotate(share_label, (bar_centre, bar_height), ha='center', va='bottom')
plt.show()
print(df['click_bool'].value_counts(normalize=True))
print("-" * 50)

In [None]:
# Count plot of booking_bool; every bar carries its percentage of all rows,
# and the normalized value counts are printed afterwards.
print("--- Distribution of booking_bool ---")
plt.figure(figsize=(3, 6))
booking_ax = sns.countplot(
    data=df,
    x='booking_bool',
    hue='booking_bool',
    palette="pastel",
    legend=False,
)
booking_ax.set_title('Distribution of booking_bool')
booking_ax.set_xlabel('Booked (0=No, 1=Yes)')
booking_ax.set_ylabel('Count')

# Row count of the column, used as the denominator for the bar percentages.
total_bookings = len(df['booking_bool'])
if total_bookings > 0:
    for rect in booking_ax.patches:
        rect_top = rect.get_height()
        rect_mid_x = rect.get_x() + rect.get_width() / 2
        booking_ax.annotate(
            f'{100 * rect_top / total_bookings:.1f}%',
            (rect_mid_x, rect_top),
            ha='center',
            va='bottom',
        )
plt.show()
print(df['booking_bool'].value_counts(normalize=True))
print("-" * 50)

In [None]:
# --- Imputing values ---

# prop_review_score: missing entries are replaced by the column mean.
print("Review scores filling")
mean_review_value = df['prop_review_score'].mean()
print(df['prop_review_score'].isna().sum())
# The filled column is assigned back to df. Calling fillna(inplace=True) on
# df[...] operates on an intermediate Series: pandas 2.x warns about it and
# with Copy-on-Write enabled df itself is left unchanged.
df['prop_review_score'] = df['prop_review_score'].fillna(mean_review_value)
print(df['prop_review_score'].isna().sum())

# prop_location_score2: missing entries get the first quartile of their
# prop_country_id group; rows whose group has no quartile fall back to the
# first quartile of the whole column.
print(50*'-')
print("location scores filling")
print(df['prop_location_score2'].head(10))
country_q1_map = df.groupby('prop_country_id')['prop_location_score2'].quantile(0.25)
global_q1_fallback_loc2 = df['prop_location_score2'].quantile(0.25)
print(country_q1_map.head(10))
print(global_q1_fallback_loc2)
nans_before_imputation = df['prop_location_score2'].isnull().sum()
print(f"\nMissing values in 'prop_location_score2' BEFORE imputation: {nans_before_imputation}")

# The null mask is computed once and reused for the lookup and the assignment.
location_score2_missing = df['prop_location_score2'].isnull()
imputation_values_for_nan_rows = df.loc[location_score2_missing, 'prop_country_id'].map(country_q1_map)
df.loc[location_score2_missing, 'prop_location_score2'] = imputation_values_for_nan_rows
# Global fallback for rows that are still missing after the per-country step.
df['prop_location_score2'] = df['prop_location_score2'].fillna(global_q1_fallback_loc2)

nans_after_imputation = df['prop_location_score2'].isnull().sum()
print(f"Missing values in 'prop_location_score2' AFTER imputation: {nans_after_imputation}")
print(df['prop_location_score2'].head(10))
sns.histplot(df['prop_location_score2'], bins=100, kde=True)
plt.show()




In [None]:
# orig_destination_distance: add a log1p-transformed column, flag the rows
# where the raw distance was missing, fill the transformed column's gaps with
# its median, and finally drop the raw column in favour of the log version.
print(50*'-')
# The extra "location scores filling" line printed here was a leftover from
# the previous cell and has been removed.
print("Distance to the property values filling")
df['log_orig_destination_distance'] = np.log1p(df['orig_destination_distance'])
print(df['log_orig_destination_distance'].tail(50))
median_log_distance = df['log_orig_destination_distance'].median()
# The missing-indicator is taken from the raw column before anything is filled.
df['distance_missing'] = df['orig_destination_distance'].isna().astype(int)
# Assign the filled column back to df; fillna(inplace=True) on df[...] acts on
# an intermediate Series and does not update df under Copy-on-Write.
df['log_orig_destination_distance'] = df['log_orig_destination_distance'].fillna(median_log_distance)
print(50*'-')
print(df['log_orig_destination_distance'].isna().sum())
sns.histplot(df['log_orig_destination_distance'], bins=50, kde=True)
plt.show()
df = df.drop(columns=['orig_destination_distance'])

In [None]:
# price_usd clean-up in three steps:
#   1. zero prices are replaced by the 1st percentile of the positive prices,
#   2. values above the 99.5th percentile are capped at that percentile,
#   3. values below the 1st percentile (recomputed after steps 1-2) are raised to it.
zero_price_count = (df['price_usd'] == 0).sum()
print(f"Number of zero prices: {zero_price_count}")
# NaN > 0 evaluates to False, so this comparison alone already excludes
# missing prices; no separate notna() term is needed.
positive_prices = df.loc[df['price_usd'] > 0, 'price_usd']
low_percentile_price = positive_prices.quantile(0.01)  # 1st percentile
print(f"1st percentile of positive prices: {low_percentile_price:.2f}")

# Replace 0s with this low percentile value
df.loc[df['price_usd'] == 0, 'price_usd'] = low_percentile_price
print(f"Replaced {zero_price_count} zero price_usd values with {low_percentile_price:.2f}")

# Replace extreme high outliers with the 99.5th percentile value.
# The messages below previously said "99th percentile" although the quantile
# used is 0.995; they now state the percentile that is actually applied.
upper_cap_limit = df['price_usd'].quantile(0.995)
print(f"99.5th percentile (upper cap limit) for price_usd: {upper_cap_limit:.2f}")
is_above_upper_cap = df['price_usd'] > upper_cap_limit
outliers_above_cap = is_above_upper_cap.sum()
print(f"Number of price_usd values above {upper_cap_limit:.2f}: {outliers_above_cap}")

if outliers_above_cap > 0:
    # Cap the values: any price_usd greater than upper_cap_limit becomes upper_cap_limit
    df.loc[is_above_upper_cap, 'price_usd'] = upper_cap_limit
    print(f"Capped {outliers_above_cap} high outliers to {upper_cap_limit:.2f}")
else:
    print("No high outliers found above the 99.5th percentile to cap.")

# Replace low extreme outliers with the 1st percentile value
lower_cap_limit_low = df['price_usd'].quantile(0.01)
print(f"\n1st percentile (lower cap limit) for price_usd: {lower_cap_limit_low:.2f}")

is_below_lower_cap = df['price_usd'] < lower_cap_limit_low
outliers_below_cap_low = is_below_lower_cap.sum()
print(f"Number of price_usd values below {lower_cap_limit_low:.2f}: {outliers_below_cap_low}")
if outliers_below_cap_low > 0:
    # Cap the values
    df.loc[is_below_lower_cap, 'price_usd'] = lower_cap_limit_low
    print(f"Capped {outliers_below_cap_low} low outliers to {lower_cap_limit_low:.2f}")
else:
    print("No low outliers found below the 1st percentile to cap.")

sns.histplot(df['price_usd'], bins=100, kde=True)
plt.show()




In [None]:
# Visitor ADR history: add a presence flag and a price_diff feature, plot the
# feature for rows that have history, then drop the raw history column.
condition_history_present = df['visitor_hist_adr_usd'].notna()
df['visitor_history_present'] = condition_history_present.astype(int)
print(f"Number of entries with visitor history (ADR): {df['visitor_history_present'].sum()}")

# price_diff is the historical ADR minus the listed price; rows without
# history are set to 0.0 instead of carrying the NaN of the subtraction.
adr_minus_price = df['visitor_hist_adr_usd'] - df['price_usd']
df['price_diff'] = adr_minus_price.where(condition_history_present, 0.0)

print(df['price_diff'].describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]))
# Only rows that actually have history are shown in the histogram.
price_diff_with_history = df.loc[condition_history_present, 'price_diff']
sns.histplot(price_diff_with_history, bins=100, kde=True)
df = df.drop(columns='visitor_hist_adr_usd')



In [None]:
# Visitor star-rating history: add a presence flag and a starrating_diff
# feature (historical star rating minus prop_starrating, 0.0 where there is no
# history), then drop the raw history column.
condition_star_history_present = df['visitor_hist_starrating'].notna()
df['visitor_starrating_hist_present'] = condition_star_history_present.astype(int)
print(df['visitor_starrating_hist_present'].sum())

df['starrating_diff'] = 0.0
df.loc[condition_star_history_present, 'starrating_diff'] = (
    df.loc[condition_star_history_present, 'visitor_hist_starrating']
    - df.loc[condition_star_history_present, 'prop_starrating']
)

print(df['starrating_diff'].describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]))
# Plot only rows that have history, matching how price_diff is plotted: the
# 0.0 placeholder written for rows without history cannot be told apart from
# a genuine zero difference in the histogram.
starrating_diff_with_history = df.loc[condition_star_history_present, 'starrating_diff']
sns.histplot(starrating_diff_with_history, bins=100, kde=True)
plt.show()
df = df.drop(columns=['visitor_hist_starrating'])


In [None]:
# Remove srch_query_affinity_score, then list the remaining null count per column.
df = df.drop(columns='srch_query_affinity_score')
remaining_null_counts = df.isna().sum()
print(remaining_null_counts)

In [None]:
# --- Competitor features ---
# Collapse the eight compN_rate / compN_inv columns into two summary features,
# add total_fee, and drop the raw competitor columns afterwards.
# The column lists are defined once here and reused for the final drop.
comp_rate_cols = [f'comp{i}_rate' for i in range(1, 9)]
comp_inv_cols = [f'comp{i}_inv' for i in range(1, 9)]
comp_diff_cols = [f'comp{i}_rate_percent_diff' for i in range(1, 9)]

# expedia_price_competitiveness:
#   -1 -> at least one compN_rate equals -1 (treated here as "a cheaper alternative exists")
#    1 -> no compN_rate equals -1 and at least one equals +1
#    0 -> everything else, including rows without any competitor rate data
df['expedia_price_competitiveness'] = 0

is_any_comp_cheaper = (df[comp_rate_cols] == -1).any(axis=1)
df.loc[is_any_comp_cheaper, 'expedia_price_competitiveness'] = -1

is_any_comp_better_deal_for_expedia = (df[comp_rate_cols] == 1).any(axis=1)
# Positive only when no cheaper alternative was found for the row.
condition_for_positive_competitiveness = is_any_comp_better_deal_for_expedia & ~is_any_comp_cheaper
df.loc[condition_for_positive_competitiveness, 'expedia_price_competitiveness'] = 1

print("Engineered 'expedia_price_competitiveness'. Values:")
print(df['expedia_price_competitiveness'].value_counts(dropna=False))

# expedia_has_sole_availability_among_known_comps is 1 when:
# - there IS competitor inventory data for the row, AND
# - no compN_inv equals 0 (0 is treated here as "competitor also has availability").
df['expedia_has_sole_availability_among_known_comps'] = 0

has_any_comp_inv_data = df[comp_inv_cols].notna().any(axis=1)
is_any_comp_also_available = (df[comp_inv_cols] == 0).any(axis=1)

condition_sole_availability = has_any_comp_inv_data & (~is_any_comp_also_available)
df.loc[condition_sole_availability, 'expedia_has_sole_availability_among_known_comps'] = 1

print("Engineered 'expedia_has_sole_availability_among_known_comps'. Values:")
print(df['expedia_has_sole_availability_among_known_comps'].value_counts(dropna=False))

# Listed price multiplied by the number of rooms searched for.
df['total_fee'] = df['price_usd']*df['srch_room_count']

# Drop the raw competitor columns. errors='ignore' skips any that are absent.
# The result is assigned back (instead of inplace=True) to match the other
# drops in this notebook.
columns_to_drop = comp_rate_cols + comp_inv_cols + comp_diff_cols
df = df.drop(columns=columns_to_drop, errors='ignore')
print(df.columns.tolist())

print(df.isna().sum())
print(50*'-')
print(df.shape)
print(50*'-')
# info() prints its own report and returns None; wrapping it in print() added
# a stray "None" line.
df.info()




In [None]:
# Correlation analysis in two parts: a heatmap of the pairwise Pearson
# correlations between predictor columns, then for each target a sorted
# listing and bar chart of every predictor's correlation with it.
print("--- Generating Feature-Feature Correlation Matrix Heatmap ---")

numerical_predictor_cols = [
    'prop_starrating', 'prop_review_score', 'prop_brand_bool',
    'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
    'position', 'price_usd',
    'promotion_flag',
    'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
    'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
    'random_bool',
    'search_month',
    'log_orig_destination_distance', 'distance_missing',
    'visitor_history_present',
    'price_diff',
    'visitor_starrating_hist_present',
    'starrating_diff',
    'expedia_price_competitiveness',
    'total_fee',
    'expedia_has_sole_availability_among_known_comps'
]
targets = ['click_bool', 'booking_bool']

# Only columns that are actually present in df take part.
existing_numerical_predictor_cols = [col for col in numerical_predictor_cols if col in df.columns]
existing_targets = [target for target in targets if target in df.columns]

if existing_numerical_predictor_cols:
    # One correlation matrix over predictors and targets serves both parts.
    # Each coefficient is computed pairwise from its own two columns, so
    # slicing this matrix gives the same numbers as a separate corr() call per
    # target, without copying the frame and recomputing every predictor pair.
    full_correlation = df[existing_numerical_predictor_cols + existing_targets].corr()
    correlation_matrix = full_correlation.loc[
        existing_numerical_predictor_cols, existing_numerical_predictor_cols
    ]

    plt.figure(figsize=(22, 20))
    # fmt only affects cell annotations, which are disabled here, so it is not passed.
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=.5, vmin=-1, vmax=1)
    plt.title('Correlation Matrix of Numerical & Flag Predictor Features', fontsize=16)
    plt.xticks(fontsize=10, rotation=90)
    plt.yticks(fontsize=10)
    plt.tight_layout()
    plt.show()
else:
    print("No numerical predictor columns identified for feature-feature correlation matrix.")


# Correlation between features and target values
print("\n--- Generating Feature-Target Correlations ---")

if existing_numerical_predictor_cols:
    for target in targets:
        if target in df.columns:
            # Predictor rows of the target's column; undefined coefficients
            # (NaN, e.g. for a constant column) are left out.
            target_correlations = (
                full_correlation.loc[existing_numerical_predictor_cols, target]
                .sort_values(ascending=False)
                .dropna()
            )

            print(f"\n--- Correlation with {target} ---")
            print(target_correlations)

            # Visualization; the figure is tall so every feature label fits.
            plt.figure(figsize=(8, 12))
            target_correlations.plot(kind='barh', colormap='viridis')
            plt.title(f'Feature Correlation with {target}', fontsize=14)
            plt.xlabel('Pearson Correlation Coefficient', fontsize=12)
            plt.grid(axis='x', linestyle='--')
            plt.tight_layout()
            plt.show()
        else:
            print(f"Target column '{target}' not found in DataFrame.")
else:
    print("No numerical predictor columns identified for target correlation.")

In [None]:
# Export the processed frame to CSV under ../data/modified/.
output_dir = '../data/modified/'
# exist_ok=True creates the directory only when it is missing, replacing the
# separate os.path.exists() check with a single call.
os.makedirs(output_dir, exist_ok=True)

csv_output_path = os.path.join(output_dir, 'df_imputed_improved_train.csv')

try:
    df.to_csv(csv_output_path, index=False)
except Exception as e:
    # Best-effort export: the failure is reported and the notebook keeps running.
    print(f"Error saving DataFrame to CSV: {e}")
else:
    # Explicit confirmation so a successful export is not silent.
    print(f"Saved DataFrame to {csv_output_path}")