In [1]:
import pyarrow.parquet as pq
import pandas as pd
import polars as pl
import datetime
from polars.datatypes import (
    Boolean, UInt8, UInt16, UInt32, UInt64, Utf8,
    Int8, Int16, Int32, Int64,
    Float32, Float64
)
from polars.datatypes import Utf8, Datetime, Date, Time
import xgboost as xgb
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
from geopy.distance import geodesic
from src.utils.utils import evaluate_feature
from src.train_new_base import train_base_model

In [2]:
def analyze_columns(df: pl.DataFrame) -> pl.DataFrame:
    """
    Для каждого столбца считает:
     - null_count: число пропущенных (None/null)
     - nan_count: число NaN (только для float)
     - min, max, mean (все как строки, чтобы избежать смешанных типов)
     - dtype
     - n_unique: число уникальных значений
    """
    numeric_types = {
        UInt8, UInt16, UInt32, UInt64,
        Int8, Int16, Int32, Int64,
        Float32, Float64
    }

    def fmt(x):
        if x is None:
            return None
        # datetime -> ISO
        if isinstance(x, (datetime.date, datetime.datetime)):
            return x.isoformat()
        # остальное просто str()
        return str(x)

    rows = []
    for name in df.columns:
        s = df[name]
        dtype = s.dtype

        null_count = s.null_count()
        nan_count  = int(s.is_nan().sum()) if dtype in {Float32, Float64} else 0

        # min/max
        try:
            mn = s.min()
            mx = s.max()
        except Exception:
            mn = mx = None

        # mean
        mean = s.mean() if dtype in numeric_types else None

        rows.append({
            "column":   name,
            "dtype":    str(dtype),
            "nulls":    null_count,
            "nans":     nan_count,
            "min":      fmt(mn),
            "max":      fmt(mx),
            "mean":     fmt(mean),
            "n_unique": s.n_unique(),
        })

    return pl.DataFrame(rows)


In [3]:
train_raw = pl.read_parquet('data/train.parquet').drop('__index_level_0__').with_columns(pl.col('selected').cast(pl.Int8))
test_raw = pl.read_parquet('data/test.parquet').drop('__index_level_0__').with_columns(pl.lit(2, dtype=pl.Int8).alias("selected"))
data = pl.concat([train_raw, test_raw])
#train_size = train_raw.shape[0]
#test_size = test_raw.shape[0]
#val_size = train_raw.shape[0]*0.1

In [4]:
test_raw = 0
train_raw = 0

In [3]:
rows = (
    pl.scan_parquet("data/train.parquet")        # or scan_parquet/scan_ndjson/…
      .select(pl.len())           # len() == COUNT(*)
      .collect(streaming=True)    # constant-memory execution
      .item()                     # get the scalar
)

  .collect(streaming=True)    # constant-memory execution


In [None]:
res = analyze_columns(data)

In [None]:
cols_use = res.filter(pl.col('nulls') < data.shape[0]*0.95)['column'].to_list()
data = data.select(cols_use)
#cols_use.remove('selected')

In [14]:
data.columns

['Id',
 'bySelf',
 'companyID',
 'corporateTariffCode',
 'frequentFlyer',
 'nationality',
 'isAccess3D',
 'isVip',
 'legs0_arrivalAt',
 'legs0_departureAt',
 'legs0_duration',
 'legs0_segments0_aircraft_code',
 'legs0_segments0_arrivalTo_airport_city_iata',
 'legs0_segments0_arrivalTo_airport_iata',
 'legs0_segments0_baggageAllowance_quantity',
 'legs0_segments0_baggageAllowance_weightMeasurementType',
 'legs0_segments0_cabinClass',
 'legs0_segments0_departureFrom_airport_iata',
 'legs0_segments0_duration',
 'legs0_segments0_flightNumber',
 'legs0_segments0_marketingCarrier_code',
 'legs0_segments0_operatingCarrier_code',
 'legs0_segments0_seatsAvailable',
 'legs0_segments1_aircraft_code',
 'legs0_segments1_arrivalTo_airport_city_iata',
 'legs0_segments1_arrivalTo_airport_iata',
 'legs0_segments1_baggageAllowance_quantity',
 'legs0_segments1_baggageAllowance_weightMeasurementType',
 'legs0_segments1_cabinClass',
 'legs0_segments1_departureFrom_airport_iata',
 'legs0_segments1_duration'

In [17]:
data['bySelf'].value_counts()

bySelf,count
bool,u32
False,3702556
True,21340592


In [4]:
def d2r(deg: str) -> pl.Expr:
    return pl.col(deg) * np.pi / 180

def polars_haversine(s_lat: str, s_lng: str, e_lat: str, e_lng: str) -> pl.Expr:
    R = 6373.0

    s_lat = d2r(s_lat)
    s_lng = d2r(s_lng)
    e_lat = d2r(e_lat)
    e_lng = d2r(e_lng)

    return (2 * R * (
        (e_lat - s_lat).truediv(2).sin().pow(2) + 
        (s_lat.cos() * e_lat.cos()) * (e_lng - s_lng).truediv(2).sin().pow(2))
            .sqrt().arcsin())


def feature_prep(dataset: pl.dataframe):
    drop_columns = ['Id', 'bySelf', 'pricingInfo_passengerCount', 'profileId', 'ranker_id', 'requestDate', '__index_level_0__']
    cat_columns = ['companyID', 'corporateTariffCode', 'frequentFlyer', 'nationality'] + \
    [f'legs{leg}_segments{seg}_aircraft_code' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_arrivalTo_airport_city_iata' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_arrivalTo_airport_iata' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_departureFrom_airport_iata' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_flightNumber' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_cabinClass' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_marketingCarrier_code' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    [f'legs{leg}_segments{seg}_operatingCarrier_code' for leg in [0, 1] for seg in [0, 1, 2, 3]]
    bool_columns = ['isAccess3D', 'isVip'] + \
    [f'legs{leg}_segments{seg}_baggageAllowance_weightMeasurementType' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    ['miniRules0_statusInfos', 'miniRules1_statusInfos'] + ['pricingInfo_isAccessTP'] + ['sex']
    num_columns = [f'legs{leg}_segments{seg}_seatsAvailable' for leg in [0, 1] for seg in [0, 1, 2, 3]] + \
    ['miniRules0_monetaryAmount', 'miniRules1_monetaryAmount'] + ['taxes', 'totalPrice']
    time_columns = [f'legs{leg}_arrivalAt' for leg in [0, 1]] + [f'legs{leg}_departureAt' for leg in [0, 1]]
    divide_columns = ['legs0_segments0_baggageAllowance_quantity', "legs0_segments1_baggageAllowance_quantity"]
    all_columns = set(dataset.columns)
    #all_columns -= set(drop_columns)

    cat_columns = set(cat_columns) & all_columns
    bool_columns = set(bool_columns) & all_columns
    num_columns = set(num_columns) & all_columns
    time_columns = set(time_columns) & all_columns
    divide_columns = set(divide_columns) & all_columns

    # bool features
    bool_features = [pl.col(col_name).fill_null(pl.col(col_name).median()) for col_name in bool_columns]
    drop_columns.extend(bool_columns)
    # num_features
    num_features = [pl.col(col_name).fill_null(pl.col(col_name).median()) for col_name in num_columns]
    drop_columns.extend(num_columns)
    # time features
    time_features = []
    for col in time_columns:
        # 1) парсим в Datetime (microseconds precision)
        dt = pl.col(col).str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S")
        # 2) извлекаем нужные признаки
        time_features.append(dt.dt.weekday().alias(f"{col}_weekday"))
        time_features.append(dt.dt.hour().alias(f"{col}_hour"))

    duration_columns = [f'legs{leg}_duration' for leg in [0, 1]]

    drop_columns.extend(list(time_columns))



    # divide features
    filled = pl.col('legs0_segments0_baggageAllowance_quantity').fill_null(0)
    divide_features = [pl.when(filled > 3)
      .then(filled)
    .when((filled > 0) & (filled <= 3))
      .then(10)
    .otherwise(0)
    .alias(f"{col}_kg"),

    # units:
    #   — если исходное > 3, то 1
    #   — если 0 < исходное <= 3, то само значение
    #   — иначе 0
    pl.when(filled > 3)
      .then(1)
    .when((filled > 0) & (filled <= 3))
      .then(filled)
    .otherwise(0)
    .alias(f"{col}_units"),
    ]
    drop_columns.extend(divide_columns)
    
    # cat featrues
    cat_features = [(pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32) for c in cat_columns]
    drop_columns.extend(cat_columns)
    
    
    return dataset.select(all_columns).with_columns(bool_features + num_features + time_features + divide_features + cat_features)

In [7]:
data = data.with_columns(pl.col('companyID').cast(pl.Int32),
                  pl.col('corporateTariffCode').cast(pl.Int16),
                  pl.col('nationality').cast(pl.Int8),
                  pl.col('legs0_segments0_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs0_segments0_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs0_segments0_cabinClass').cast(pl.Int8),
                  pl.col('legs0_segments0_seatsAvailable').cast(pl.Int8),
                  pl.col('legs0_segments1_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs0_segments1_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs0_segments1_cabinClass').cast(pl.Int8),
                  pl.col('legs0_segments1_seatsAvailable').cast(pl.Int8),
                  pl.col('legs0_segments2_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs0_segments2_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs0_segments2_cabinClass').cast(pl.Int8),
                  pl.col('legs0_segments2_seatsAvailable').cast(pl.Int8),
                  pl.col('legs0_segments3_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs0_segments3_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs0_segments3_cabinClass').cast(pl.Int8),
                  pl.col('legs0_segments3_seatsAvailable').cast(pl.Int8),

                  pl.col('legs1_segments0_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs1_segments0_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs1_segments0_cabinClass').cast(pl.Int8),
                  pl.col('legs1_segments0_seatsAvailable').cast(pl.Int8),
                  pl.col('legs1_segments1_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs1_segments1_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs1_segments1_cabinClass').cast(pl.Int8),
                  pl.col('legs1_segments1_seatsAvailable').cast(pl.Int8),
                  pl.col('legs1_segments2_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs1_segments2_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs1_segments2_cabinClass').cast(pl.Int8),
                  pl.col('legs1_segments2_seatsAvailable').cast(pl.Int8),
                  pl.col('legs1_segments3_baggageAllowance_quantity').cast(pl.Int8),
                  pl.col('legs1_segments3_baggageAllowance_weightMeasurementType').cast(pl.Boolean),
                  pl.col('legs1_segments3_cabinClass').cast(pl.Int8),
                  pl.col('legs1_segments3_seatsAvailable').cast(pl.Int8),

                  pl.col('miniRules0_monetaryAmount').cast(pl.Int32),
                  pl.col('miniRules1_monetaryAmount').cast(pl.Int32),
                  pl.col('miniRules0_percentage').cast(pl.Int8),
                  pl.col('miniRules1_percentage').cast(pl.Int8),
                  pl.col('miniRules0_statusInfos').cast(pl.Boolean),
                  pl.col('miniRules1_statusInfos').cast(pl.Boolean),
                  pl.col('pricingInfo_isAccessTP').cast(pl.Boolean),
                  pl.col('pricingInfo_passengerCount').cast(pl.Boolean),
                  pl.col('profileId').cast(pl.Int32),
                  pl.col('taxes').cast(pl.Float32),
                  )

In [8]:
data.write_parquet('data/full_data_with_trimmed_sizes.parquet')

In [5]:
data = pl.read_parquet('data/full_data_with_trimmed_sizes.parquet')

In [6]:
airports_data = pl.read_csv('data/airports.csv')
# 2) extract the departure/arrival codes
data = data.with_columns([pl.col("searchRoute").str.extract(r"^([^/]{3})([^/]{3})", 1).alias("airportcode1"),
                          pl.col("searchRoute").str.extract(r"^([^/]{3})([^/]{3})", 2).alias("airportcode2"),
                            ])

# 3) prepare two lookups with lat/lon already aliased
lookup1 = airports_data.select([
    pl.col("IATA"),
    pl.col("GeoPointLat").alias("GeoPointLat_search_airport1"),
    pl.col("GeoPointLong").alias("GeoPointLong_search_airport1"),
])
lookup2 = airports_data.select([
    pl.col("IATA"),
    pl.col("GeoPointLat").alias("GeoPointLat_search_airport2"),
    pl.col("GeoPointLong").alias("GeoPointLong_search_airport2"),
])
lookup3 = (airports_data.filter(pl.col("GeoPointLat") != 0).select([pl.col("City_IATA"),
                                                                    pl.col("GeoPointLat").alias("GeoPointLat_search_city_IATA_1"),
                                                                    pl.col("GeoPointLong").alias("GeoPointLong_search_city_IATA_1"),
                                                                    ]).unique(subset="City_IATA")
)
lookup4 = (airports_data.filter(pl.col("GeoPointLat") != 0).select([pl.col("City_IATA"),
                                                                    pl.col("GeoPointLat").alias("GeoPointLat_search_city_IATA_2"),
                                                                    pl.col("GeoPointLong").alias("GeoPointLong_search_city_IATA_2"),
                                                                    ]).unique(subset="City_IATA")
)

In [7]:
data = (
    data
    .join(lookup1, left_on="airportcode1", right_on="IATA", how="left")
    .join(lookup2, left_on="airportcode2", right_on="IATA", how="left")
    .join(lookup3, left_on="airportcode1", right_on="City_IATA", how="left")
    .join(lookup4, left_on="airportcode2", right_on="City_IATA", how="left")
)

data = data.with_columns([
                # if lat_air1 is null, take lat_city1
                pl.coalesce(
                [pl.col("GeoPointLat_search_airport1"), pl.col("GeoPointLat_search_city_IATA_1")]
                ).alias("GeoPointLat_search_dep"),
                pl.coalesce(
                [pl.col("GeoPointLong_search_airport1"), pl.col("GeoPointLong_search_city_IATA_1")]
                ).alias("GeoPointLong_search_dep"),
                pl.coalesce(
                [pl.col("GeoPointLat_search_airport2"), pl.col("GeoPointLat_search_city_IATA_2")]
                ).alias("GeoPointLat_search_dest"),
                pl.coalesce(
                [pl.col("GeoPointLong_search_airport2"), pl.col("GeoPointLong_search_city_IATA_2")]
                ).alias("GeoPointLong_search_dest"),
            ])

In [8]:
data = data.with_columns(data.select(polars_haversine("GeoPointLat_search_dep", "GeoPointLong_search_dep", "GeoPointLat_search_dest", "GeoPointLong_search_dest"))['literal'].alias('route_distance_km')).with_columns(pl.col('route_distance_km').fill_null(pl.median('route_distance_km')).alias('route_distance_km'))


In [9]:
total_rows = data.height

hour_sin_map = {h: np.sin(2 * np.pi * h / 24) for h in range(24)}
hour_cos_map = {h: np.cos(2 * np.pi * h / 24) for h in range(24)}
wday_sin_map = {d: np.sin(2 * np.pi * d / 7) for d in range(7)}
wday_cos_map = {d: np.cos(2 * np.pi * d / 7) for d in range(7)}

data = data.with_columns([
    # 1) raw count per company
    pl.len().over("companyID").alias("company_count"),
    # 2) relative frequency = count(company_id) / total_rows
    (pl.len().over("companyID") / pl.lit(total_rows)).alias("company_freq"),
    pl.col("corporateTariffCode")
        .fill_null(-1)
        .cast(pl.Utf8)
        .cast(pl.Categorical)     
        .alias("tariff_code_filled"),
    pl.col("frequentFlyer").str.count_matches("/").fill_null(0).alias('frequentFlyer_n_programs'),
    pl.col('nationality').cast(pl.Utf8).cast(pl.Categorical).alias("nationality_cat"),
    pl.col("ranker_id").n_unique().over("profileId").alias("unique_ranker_count"),
    pl.col("requestDate").dt.hour().replace_strict(hour_sin_map, default=0).alias("hour_sin"),
    pl.col("requestDate").dt.hour().replace_strict(hour_cos_map, default=0).alias("hour_cos"),
    pl.col("requestDate").dt.weekday().replace_strict(wday_sin_map, default=0).alias("wday_sin"),
    pl.col("requestDate").dt.weekday().replace_strict(wday_cos_map, default=0).alias("wday_cos"),
    pl.col("searchRoute").str.count_matches("/").fill_null(0).alias('twoway_route'),

    ])
data = data.with_columns(data.select([(pl.col('airportcode1') + pl.col('airportcode2')).alias('search_oneway'), (pl.col('totalPrice')*(pl.col('twoway_route')*(-0.5) + 1)).alias('oneway_price')]))

lookup_prices = data.select('twoway_route', 'search_oneway', 'oneway_price').group_by("search_oneway").agg((pl.col("oneway_price")).mean().alias("avg_oneway_price"))
data = data.join(lookup_prices, on='search_oneway', how='left')
for i in [0, 1]:
    data = data.with_columns([
        pl.when(
            pl.col(f"legs{i}_duration").str.contains(r"\.")
        )
        .then(
            pl.col(f"legs{i}_duration").str.split(".").list.get(-1)
        )
        .otherwise(
            pl.col(f"legs{i}_duration").str.split(".").list.get(0)
        )
        .str.strptime(pl.Time, "%H:%M:%S")
        .alias("duration_time"),
        pl.col(f"legs{i}_duration")
        .str.extract(r"^(\d+)\.", 1) 
        .cast(pl.Int32)                  # строка → целое
        .fill_null(0)
        .alias("duration_days")
    ]).with_columns([(pl.col("duration_time")
        .cast(pl.Int64)    # nanoseconds since midnight
        / 1e9              # → seconds
        / 3600             # → hours
        ).alias("time_in_hours")]).with_columns([
        ((pl.col("duration_days") * 24) + pl.col("time_in_hours"))
        .alias(f"total_duration_hours_leg{i}")])



time_cols = ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]

period_exprs = []
for col in time_cols:
    dt   = pl.col(col).str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S")
    hour = dt.dt.hour()

    period = (
        pl.when(dt.is_null()).then(pl.lit("no_return"))
          .when(hour < 6)   .then(pl.lit("night"))
          .when(hour < 12)  .then(pl.lit("morning"))
          .when(hour < 18)  .then(pl.lit("day"))
          .otherwise(pl.lit("evening"))
          .cast(pl.Categorical)
          .alias(f"{col}_period")
    )
    period_exprs.append(period)

# добавляем в DataFrame
data = data.with_columns(period_exprs)
for col in time_cols:
    data = data.with_columns(pl.col(col).str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S").alias(col))

In [10]:
aircraft_codes = ['legs0_segments0_aircraft_code', 'legs0_segments1_aircraft_code', 'legs0_segments2_aircraft_code', 'legs0_segments3_aircraft_code',
                  'legs1_segments0_aircraft_code', 'legs1_segments1_aircraft_code', 'legs1_segments2_aircraft_code', 'legs1_segments3_aircraft_code']

data = data.with_columns([(pl.len().over(aircraft_code) / pl.lit(total_rows)).alias(f"{aircraft_code}_freq") for aircraft_code in aircraft_codes])
data = data.with_columns([
    # если тип измерения = 1 (килограммы), то кладём quantity в *_kg, иначе 0
    pl.when(pl.col(f"legs{leg}_segments{seg}_baggageAllowance_weightMeasurementType") == 1)
      .then(pl.col(f"legs{leg}_segments{seg}_baggageAllowance_quantity"))
      .otherwise(pl.col(f'legs{leg}_segments{seg}_baggageAllowance_quantity').filter(pl.col(f'legs{leg}_segments{seg}_baggageAllowance_quantity') > 3).mean()*pl.col(f"legs{leg}_segments{seg}_baggageAllowance_quantity"))
      .alias(f"legs{leg}_segments{seg}_baggageAllowance_quantity_kg") for leg in [0, 1] for seg in [0, 1, 2, 3]
])
data = data.with_columns([
    # если тип измерения = 1 (килограммы), то кладём quantity в *_kg, иначе 0
    pl.when(pl.col(f"legs{leg}_segments{seg}_baggageAllowance_weightMeasurementType") == 0)
      .then(pl.col(f"legs{leg}_segments{seg}_baggageAllowance_quantity"))
      .otherwise(pl.col(f"legs{leg}_segments{seg}_baggageAllowance_quantity") / pl.col(f'legs{leg}_segments{seg}_baggageAllowance_quantity').filter(pl.col(f'legs{leg}_segments{seg}_baggageAllowance_quantity') > 3).mean())
      .alias(f"legs{leg}_segments{seg}_baggageAllowance_quantity_units") for leg in [0, 1] for seg in [0, 1, 2, 3]
])
data = data.with_columns([
        # cast to Polars Categorical
        pl.col(f"legs{leg}_segments{seg}_cabinClass")
          .cast(pl.Utf8)           # first make it integer
          .cast(pl.Categorical)     # then categorical
          .alias(f"legs{leg}_segments{seg}_cabinClass_cat") for leg in [0, 1] for seg in [0, 1, 2, 3]
    ])
change_exprs = []
for leg in [0, 1]:
    # build a 0/1 expr for each transition i→i+1
    transitions = []
    for i in range(3):  # segments 0→1, 1→2, 2→3
        arr = f"legs{leg}_segments{i}_arrivalTo_airport_iata"
        dep = f"legs{leg}_segments{i+1}_departureFrom_airport_iata"
        transitions.append(
            # True if they differ → cast to 1, else 0
            (pl.col(arr) != pl.col(dep))
            .fill_null(False)    # treat any Null as “no change”
            .cast(pl.UInt8)
        )
    # sum the three 0/1 flags
    change_exprs.append(
        sum(transitions)
        .alias(f"legs{leg}_airport_changes_count")
    )
data = data.with_columns(change_exprs)

In [11]:
legs          = [0, 1]
max_segments  = 4                 # segments 0 … 3
indicator_typ = pl.UInt8          # 0/1 integers

exprs = []

for leg in legs:
    # build per-segment “is present?” flags → UInt8
    flags = [
        pl.col(f"legs{leg}_segments{i}_departureFrom_airport_iata")
          .is_not_null()
          .cast(indicator_typ)
        for i in range(max_segments)
    ]

    # horizontal sum of the 4 flags
    seg_cnt = pl.sum_horizontal(*flags).alias(f"legs{leg}_num_segments")
    exprs.append(seg_cnt)

# add per-leg counts, then total
data = (
    data.with_columns(exprs)
      .with_columns(
          (pl.col("legs0_num_segments") + pl.col("legs1_num_segments"))
          .alias("total_segments_count")
      )
)


In [12]:
data.columns

['Id',
 'bySelf',
 'companyID',
 'corporateTariffCode',
 'frequentFlyer',
 'nationality',
 'isAccess3D',
 'isVip',
 'legs0_arrivalAt',
 'legs0_departureAt',
 'legs0_duration',
 'legs0_segments0_aircraft_code',
 'legs0_segments0_arrivalTo_airport_city_iata',
 'legs0_segments0_arrivalTo_airport_iata',
 'legs0_segments0_baggageAllowance_quantity',
 'legs0_segments0_baggageAllowance_weightMeasurementType',
 'legs0_segments0_cabinClass',
 'legs0_segments0_departureFrom_airport_iata',
 'legs0_segments0_duration',
 'legs0_segments0_flightNumber',
 'legs0_segments0_marketingCarrier_code',
 'legs0_segments0_operatingCarrier_code',
 'legs0_segments0_seatsAvailable',
 'legs0_segments1_aircraft_code',
 'legs0_segments1_arrivalTo_airport_city_iata',
 'legs0_segments1_arrivalTo_airport_iata',
 'legs0_segments1_baggageAllowance_quantity',
 'legs0_segments1_baggageAllowance_weightMeasurementType',
 'legs0_segments1_cabinClass',
 'legs0_segments1_departureFrom_airport_iata',
 'legs0_segments1_duration'

In [13]:
for i in [0, 1]:
    for j in [0,1,2,3]:
        data = data.with_columns([
            pl.when(
                pl.col(f"legs{i}_segments{j}_duration").str.contains(r"\.")
            )
            .then(
                pl.col(f"legs{i}_segments{j}_duration").str.split(".").list.get(-1)
            )
            .otherwise(
                pl.col(f"legs{i}_segments{j}_duration").str.split(".").list.get(0)
            )
            .str.strptime(pl.Time, "%H:%M:%S")
            .alias(f"duration_time_legs{i}_segments{j}"),
            pl.col(f"legs{i}_segments{j}_duration")
            .str.extract(r"^(\d+)\.", 1) 
            .cast(pl.Int32)                  # строка → целое
            .fill_null(0)
            .alias(f"duration_days_legs{i}_segments{j}")
        ]).with_columns([(pl.col(f"duration_time_legs{i}_segments{j}")
            .cast(pl.Int64)    # nanoseconds since midnight
            / 1e9              # → seconds
            / 3600             # → hours
            ).alias(f"time_in_hours_legs{i}_segments{j}")]).with_columns([
            ((pl.col(f"duration_days_legs{i}_segments{j}") * 24) + pl.col(f"time_in_hours_legs{i}_segments{j}")).fill_null(0)
            .alias(f"total_duration_hours_leg{i}_segment{j}")])

In [14]:
data = data.with_columns([(pl.col('total_duration_hours_leg0') - (pl.col('total_duration_hours_leg0_segment0') + 
                  pl.col('total_duration_hours_leg0_segment1') + 
                  pl.col('total_duration_hours_leg0_segment2') + 
                  pl.col('total_duration_hours_leg0_segment2'))).cast(pl.Float32).alias('layover_hours_leg0'),

                  (pl.col('total_duration_hours_leg1') - (pl.col('total_duration_hours_leg1_segment0') + 
                  pl.col('total_duration_hours_leg1_segment1') + 
                  pl.col('total_duration_hours_leg1_segment2') + 
                  pl.col('total_duration_hours_leg1_segment2'))).cast(pl.Float32).alias('layover_hours_leg1')])
data = data.with_columns((pl.col('legs0_departureAt') - pl.col('requestDate')).dt.total_days().alias('days_before_flight_leg0'))

In [15]:
data = data.with_columns([(pl.col('legs0_departureAt') + (pl.col('total_duration_hours_leg0_segment0')*3.6e6).cast(pl.Duration('ms'))).alias('leg0_seg0_arrivalAt'),
                   (pl.col('legs1_departureAt') + (pl.col('total_duration_hours_leg1_segment0')*3.6e6).cast(pl.Duration('ms'))).alias('leg1_seg0_arrivalAt')])
data = data.with_columns([pl.when((pl.col('leg0_seg0_arrivalAt').dt.hour() < 6) & (pl.col('leg0_seg0_arrivalAt').dt.hour() > 0) & (pl.col('duration_time_legs0_segments1').is_not_null())).then(1).otherwise(0).alias('night_layover_leg0'),
                   pl.when((pl.col('leg1_seg0_arrivalAt').dt.hour() < 6) & (pl.col('leg1_seg0_arrivalAt').dt.hour() > 0) & (pl.col('duration_time_legs1_segments1').is_not_null())).then(1).otherwise(0).alias('night_layover_leg1'),])

In [16]:
a, b, c, d = (pl.col("legs0_segments0_marketingCarrier_code"),
              pl.col("legs0_segments1_marketingCarrier_code"),
              pl.col("legs0_segments2_marketingCarrier_code"),
              pl.col("legs0_segments3_marketingCarrier_code"))

data = data.with_columns([
    (
        # все 6 попарных проверок «равны ИЛИ хотя бы один null»
        (a.is_null() | b.is_null() | (a == b)) &
        (a.is_null() | c.is_null() | (a == c)) &
        (a.is_null() | d.is_null() | (a == d)) &
        (b.is_null() | c.is_null() | (b == c)) &
        (b.is_null() | d.is_null() | (b == d)) &
        (c.is_null() | d.is_null() | (c == d))
    )
    .cast(pl.UInt8)                 # True → 1, False → 0
    .alias("same_operator_carrier_leg0")
])
a, b, c, d = (pl.col("legs1_segments0_marketingCarrier_code"),
              pl.col("legs1_segments1_marketingCarrier_code"),
              pl.col("legs1_segments2_marketingCarrier_code"),
              pl.col("legs1_segments3_marketingCarrier_code"))
data = data.with_columns([
    (
        # все 6 попарных проверок «равны ИЛИ хотя бы один null»
        (a.is_null() | b.is_null() | (a == b)) &
        (a.is_null() | c.is_null() | (a == c)) &
        (a.is_null() | d.is_null() | (a == d)) &
        (b.is_null() | c.is_null() | (b == c)) &
        (b.is_null() | d.is_null() | (b == d)) &
        (c.is_null() | d.is_null() | (c == d))
    )
    .cast(pl.UInt8)                 # True → 1, False → 0
    .alias("same_operator_carrier_leg1")
])

In [17]:
data = data.with_columns([pl.when(pl.col('legs0_segments0_marketingCarrier_code').is_in(pl.col('frequentFlyer')) & (pl.col('same_operator_carrier_leg0') == 1)).then(1).otherwise(0).alias('ticket_is_in_FFprogramms_leg0'),
                   pl.when(pl.col('legs1_segments0_marketingCarrier_code').is_in(pl.col('frequentFlyer')) & (pl.col('same_operator_carrier_leg1') == 1)).then(1).otherwise(0).alias('ticket_is_in_FFprogramms_leg1')])
data = data.with_columns([
    pl.col("totalPrice")
      .rank("dense", descending=False)   # 1 = самая дешёвая цена
      .over("ranker_id")                 # ранжируем в пределах каждой группы
      .alias("totalPrice_rank")
])
data = data.with_columns([
    (pl.col("total_duration_hours_leg0") + pl.col("total_duration_hours_leg1").fill_null(0))
      .rank("dense", descending=False)   # 1 = самая дешёвая цена
      .over("ranker_id")                 # ранжируем в пределах каждой группы
      .alias("totalTime_hours_ranked")
])
kg_cols    = [f"legs{leg}_segments{seg}_baggageAllowance_quantity_kg"
              for leg in [0, 1] for seg in [0, 1, 2, 3]]

units_cols = [f"legs{leg}_segments{seg}_baggageAllowance_quantity_units"
              for leg in [0, 1] for seg in [0, 1, 2, 3]]

# ─── 2. функция «все одинаковы (игнорируя null)» без сбора списка в каждой строке
def equal_flag(cols: list[str], flag_name: str) -> pl.Expr:
    not_null_cnt = pl.sum_horizontal(*[
        pl.col(c).is_not_null().cast(pl.UInt8) for c in cols
    ])                              # сколько ненулевых значений в строке

    same_value = (
        pl.min_horizontal(*cols) ==         # min == max
        pl.max_horizontal(*cols)
    )

    # итог: 1, если (a) есть хотя бы одно ненулевое
    #       и (b) min == max (т.е. все ненулевые одинаковы)
    return (
        (not_null_cnt > 0) & same_value
    ).cast(pl.UInt8).alias(flag_name)

data = data.with_columns([
    equal_flag(kg_cols,    "baggage_kg_equal_flag"),
    equal_flag(units_cols, "baggage_units_equal_flag"),
])
data = data.with_columns([
    # len() в оконной функции → число строк в текущем окне
    pl.len()
      .over("ranker_id")          # окно = все строки того же ranker_id
      .alias("tickets_in_session")
])
rem_cols = [
    f"legs{leg}_segments{seg}_seatsAvailable"
    for leg in [0, 1]
    for seg in [0, 1, 2, 3]
]

# 2. среднее по валидным (не-null) сегментам
data = data.with_columns([
    pl.mean_horizontal(*rem_cols)               # null игнорируются
      .alias("remainingTickets_avg")
])

# 3. ранжируем по среднему (больше билетов → ранг 1)
data = data.with_columns([
    pl.col("remainingTickets_avg")
      .rank(method="dense", descending=True)    # 1, 2, 3, …
      .cast(pl.UInt32)
      .alias("remainingTickets_rank")
])
data = data.with_columns(
          pl.col("ranker_id")
            .n_unique()            # distinct searches
            .over("profileId")     # window partition = user
            .alias("user_search_freq")
      )

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  data = data.with_columns([pl.when(pl.col('legs0_segments0_marketingCarrier_code').is_in(pl.col('frequentFlyer')) & (pl.col('same_operator_carrier_leg0') == 1)).then(1).otherwise(0).alias('ticket_is_in_FFprogramms_leg0'),


In [18]:
legs = range(2)
segs = range(4)

pairs = [
    (
        f"legs{leg}_segments{seg}_marketingCarrier_code",
        f"legs{leg}_segments{seg}_operatingCarrier_code",
        f"match_l{leg}s{seg}",   # 1/0/Null  – совпали коды?
        f"valid_l{leg}s{seg}",   # 1/0        – оба кода не Null?
    )
    for leg in legs
    for seg in segs
]

# ------------ выражения для каждой пары ----------------------------------
match_exprs, valid_exprs = [], []
for m_col, o_col, match_tmp, valid_tmp in pairs:
    match_exprs.append(
        (pl.when(pl.col(m_col).is_not_null() & pl.col(o_col).is_not_null())
             .then((pl.col(m_col) == pl.col(o_col)).cast(pl.Int8))
             .otherwise(None)
         ).alias(match_tmp)
    )
    valid_exprs.append(
        (pl.col(m_col).is_not_null() & pl.col(o_col).is_not_null())
          .cast(pl.Int8)
          .alias(valid_tmp)
    )
# ------------ списки колонок-выражений для горизонтальных сумм ----------
match_cols  = [pl.col(c) for _, _, c, _ in pairs]   # pl.col("match_l0s0"), ...
valid_cols  = [pl.col(v) for _, _, _, v in pairs]   # pl.col("valid_l0s0"), ...

data = data.with_columns(match_exprs + valid_exprs).with_columns([
        pl.sum_horizontal(match_cols).alias("match_sum"),
        pl.sum_horizontal(valid_cols).alias("valid_sum"),
    ]).with_columns(
        pl.when(pl.col("valid_sum") > 0)
        .then((pl.col("match_sum") / pl.col("valid_sum")).cast(pl.Float32))
        .otherwise(None)
        .alias("operator_marketer_match_rate")
    ).drop(
        [c for _, _, c, _ in pairs] +
        [v for _, _, _, v in pairs] +
        ["match_sum", "valid_sum"]
    )
data = data.with_columns(
          pl.min('totalPrice')
            .over("ranker_id")
            .alias("min_price_in_search")
      ).with_columns(
          (
              pl.col('totalPrice') <= pl.col("min_price_in_search") * 1.20
          ).cast(pl.Int8)
           .alias("within_20pct_of_min")           # 1 = да, 0 = нет
      ).drop("min_price_in_search")
data = data.with_columns([pl.col('legs0_departureAt').dt.weekday().replace_strict(wday_sin_map, default=0).alias("leg0_depday_sin"),
                          pl.col('legs0_departureAt').dt.weekday().replace_strict(wday_cos_map, default=0).alias("leg0_depday_cos"),
                          pl.col('legs1_departureAt').dt.weekday().replace_strict(wday_sin_map, default=0).alias("leg1_depday_sin"),
                          pl.col('legs1_departureAt').dt.weekday().replace_strict(wday_cos_map, default=0).alias("leg1_depday_cos"),])
data = data.with_columns([pl.col('legs0_arrivalAt').dt.weekday().replace_strict(wday_sin_map, default=0).alias("leg0_arrday_sin"),
                        pl.col('legs0_arrivalAt').dt.weekday().replace_strict(wday_cos_map, default=0).alias("leg0_arrday_cos"),
                        pl.col('legs1_arrivalAt').dt.weekday().replace_strict(wday_sin_map, default=0).alias("leg1_arrday_sin"),
                        pl.col('legs1_arrivalAt').dt.weekday().replace_strict(wday_cos_map, default=0).alias("leg1_arrday_cos")])
data = data.with_columns(
          (
              pl.col("total_duration_hours_leg0") +
              pl.col("total_duration_hours_leg1").fill_null(0)
          ).alias("flight_hours")
      ).with_columns([
          pl.min('totalPrice').over("ranker_id").alias("min_price_in_search"),
          pl.min("flight_hours").over("ranker_id").alias("min_time_in_search"),
      ]).with_columns(
          (
              (
                  pl.col('totalPrice')   / pl.col("min_price_in_search") +
                  pl.col("flight_hours") / pl.col("min_time_in_search")
              ) / 2
          ).alias("opt_ticket_score")
      ).drop(["min_price_in_search", "min_time_in_search", "flight_hours"])

In [19]:
seg_exprs = []
for leg in (0, 1):
    seg_cols = [f"legs{leg}_segments{s}_duration" for s in range(4) if f"legs{leg}_segments{s}_duration" in data.columns]
    if seg_cols:
        seg_exprs.append(
            pl.sum_horizontal(pl.col(c).is_not_null() for c in seg_cols)
                .cast(pl.Int32).alias(f"n_segments_leg{leg}")
        )
    else:
        seg_exprs.append(pl.lit(0).cast(pl.Int32).alias(f"n_segments_leg{leg}"))
data = data.with_columns(seg_exprs)
data = data.with_columns((pl.col("legs1_segments0_departureFrom_airport_iata").is_null()).cast(pl.Int32).alias("is_one_way"))
data = data.with_columns([
    (pl.col("n_segments_leg0") + pl.col("n_segments_leg1")).alias("total_segments"),
    (pl.col("n_segments_leg0") == 1).cast(pl.Int32).alias("is_direct_leg0"),
    pl.when(pl.col("is_one_way") == 1).then(0)
        .otherwise((pl.col("n_segments_leg1") == 1).cast(pl.Int32)).alias("is_direct_leg1"),
])

data = data.with_columns([(pl.col("totalPrice") / (pl.col("taxes") + 1)).alias("price_per_tax"),
                   (pl.col("taxes") / (pl.col("totalPrice") + 1)).alias("tax_rate"),
                   pl.col("totalPrice").log1p().alias("log_price"),
                    pl.when(pl.col("total_duration_hours_leg1").fill_null(0) > 0)
                    .then(pl.col("total_duration_hours_leg0") / (pl.col("total_duration_hours_leg1") + 1))
                    .otherwise(1.0).alias("duration_ratio"),
                    pl.col("corporateTariffCode").is_not_null().cast(pl.Int32).alias("has_corporate_tariff"),
                    (pl.col("pricingInfo_isAccessTP") == 1).cast(pl.Int32).alias("has_access_tp"),
                    (pl.col("miniRules0_monetaryAmount").fill_null(0) + 
                    pl.col("miniRules1_monetaryAmount").fill_null(0)).alias("total_fees"),
                    pl.col("searchRoute").is_in(["MOWLED/LEDMOW", "LEDMOW/MOWLED", "MOWLED", "LEDMOW", "MOWAER/AERMOW"])
                    .cast(pl.Int32).alias("is_popular_route"),
                    pl.mean_horizontal(["legs0_segments0_cabinClass", "legs1_segments0_cabinClass"]).alias("avg_cabin_class"),
                    (pl.col("legs0_segments0_cabinClass").fill_null(0) - 
                    pl.col("legs1_segments0_cabinClass").fill_null(0)).alias("cabin_class_diff"),
                    (pl.col("frequentFlyer").fill_null("").str.count_matches("/") + 
                     (pl.col("frequentFlyer").fill_null("") != "").cast(pl.Int32)).alias("n_ff_programs"),
                    ])
data = data.with_columns([(pl.col("is_direct_leg0") & pl.col("is_direct_leg1")).cast(pl.Int32).alias("both_direct"),
                    ((pl.col("isVip") == 1) | (pl.col("n_ff_programs") > 0)).cast(pl.Int32).alias("is_vip_freq"),
                    (pl.col("total_fees") > 0).cast(pl.Int32).alias("has_fees"),
                    (pl.col("total_fees") / (pl.col("totalPrice") + 1)).alias("fee_rate"),])
price_exprs = [
    (pl.col("totalPrice").rank("average").over("ranker_id") / 
     pl.col("totalPrice").count().over("ranker_id")).alias("price_pct_rank"),
    (pl.col("totalPrice") == pl.col("totalPrice").min().over("ranker_id")).cast(pl.Int32).alias("is_cheapest"),
    ((pl.col("totalPrice") - pl.col("totalPrice").median().over("ranker_id")) / 
     (pl.col("totalPrice").std().over("ranker_id") + 1)).alias("price_from_median"),
    (pl.col("legs0_num_segments") == pl.col("legs0_num_segments").min().over("ranker_id")).cast(pl.Int32).alias("is_min_segments"),
]
data = data.with_columns(price_exprs)

direct_cheapest = (
    data.filter(pl.col("is_direct_leg0") == 1)
    .group_by("ranker_id")
    .agg(pl.col("totalPrice").min().alias("min_direct"))
)

In [21]:
data = data.join(direct_cheapest, on="ranker_id", how="left").with_columns(
    ((pl.col("is_direct_leg0") == 1) & 
     (pl.col("totalPrice") == pl.col("min_direct"))).cast(pl.Int32).fill_null(0).alias("is_direct_cheapest")
).drop("min_direct")

In [40]:
data['legs0_segments0_departureFrom_airport_iata'].value_counts(), data['legs0_segments0_departureFrom_airport_iata'].is_null().sum()

(shape: (433, 2)
 ┌─────────────────────────────────┬────────┐
 │ legs0_segments0_departureFrom_… ┆ count  │
 │ ---                             ┆ ---    │
 │ str                             ┆ u32    │
 ╞═════════════════════════════════╪════════╡
 │ TOF                             ┆ 42825  │
 │ NER                             ┆ 13426  │
 │ PKV                             ┆ 180    │
 │ OGZ                             ┆ 2211   │
 │ LHW                             ┆ 306    │
 │ …                               ┆ …      │
 │ CZX                             ┆ 38     │
 │ RUH                             ┆ 1456   │
 │ OVB                             ┆ 487490 │
 │ DOH                             ┆ 221    │
 │ ZAQ                             ┆ 146    │
 └─────────────────────────────────┴────────┘,
 2)

In [25]:
data.write_parquet("data/data_with_features.parquet")

In [4]:
data = pl.read_parquet('data/data_with_features.parquet')

In [5]:
feature_cols = ['company_count', 'company_freq', 'totalPrice', 'route_distance_km', 'frequentFlyer_n_programs', 'unique_ranker_count', 'hour_sin', 
                'hour_cos', 'wday_sin', 'wday_cos', 'twoway_route', 'avg_oneway_price', 'total_duration_hours_leg0', 'total_duration_hours_leg1', 'legs0_segments0_aircraft_code_freq',
                'legs1_segments0_aircraft_code_freq', 'frequentFlyer', 'isVip', 'miniRules0_monetaryAmount', 'miniRules0_percentage',
                'miniRules1_monetaryAmount', 'miniRules1_percentage', 'sex', 'total_segments_count', 'legs0_num_segments', 'legs1_num_segments', 'tariff_code_filled', 'nationality_cat', 'miniRules0_statusInfos',
                'miniRules1_statusInfos', 'pricingInfo_isAccessTP', 'legs0_departureAt_period', 'legs0_arrivalAt_period', 'legs1_departureAt_period', 'legs1_arrivalAt_period',
                'legs0_segments0_baggageAllowance_quantity_kg', 'legs1_segments0_baggageAllowance_quantity_kg', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat']
FEATURE_COLUMNS_V1 = [
    # 1-2. Пересадки
    "layover_hours_leg0",
    "layover_hours_leg1",

    # 3. Сколько часов до вылёта
    "days_before_flight_leg0",

    # 4 + 6. Смена аэропорта внутри одной ноги
    "legs0_airport_changes_count",
    "legs1_airport_changes_count",

    # 5. Ночная пересадка
    "night_layover_leg0",
    "night_layover_leg1",

    # 7 + 11. Одна ли авиакомпания-оператор
    "same_operator_carrier_leg0",
    "same_operator_carrier_leg1",

    # 8. Участие билета в программе лояльности
    "ticket_is_in_FFprogramms_leg0",
    "ticket_is_in_FFprogramms_leg1",

    # 9. Ранг по цене
    "totalPrice_rank",

    # 10. Ранг по общему времени перелёта
    "totalTime_hours_ranked",

    # 12. Согласованность багажа
    "baggage_kg_equal_flag",
    "baggage_units_equal_flag",

    # 13. Сколько билетов показано в поисковой сессии
    "tickets_in_session",

    # 14. Метрики оставшихся билетов
    "remainingTickets_avg",
    "remainingTickets_rank",

    # 15. Частота покупок пользователя
    "user_search_freq",

    # 16. Доля совпадений оператор-продавец
    "operator_marketer_match_rate",

    # 17. Билет ≤ 20 % дороже минимального
    "within_20pct_of_min",

    # 18. День недели вылета (синус/косинус)
    "leg0_depday_sin", "leg1_depday_sin",
    "leg0_depday_cos", "leg1_depday_cos",

    # 19. День недели прилёта (синус/косинус)
    "leg0_arrday_sin", "leg1_arrday_sin",
    "leg0_arrday_cos", "leg1_arrday_cos",

    # 20. Комплексная «оптимальность» билета
    "opt_ticket_score",
]
FEATURE_COLUMNS_v2 = [
    # Cчётчики сегментов и направления
    "n_segments_leg0",
    "n_segments_leg1",
    "is_one_way",
    "total_segments",
    "is_direct_leg0",
    "is_direct_leg1",
    "both_direct",

    # Стоимость, налоги, сборы
    "price_per_tax",
    "tax_rate",
    "log_price",
    "total_fees",
    "has_fees",
    "fee_rate",

    # Продолжительности
    "duration_ratio",

    # Тарифы и привилегии
    "has_corporate_tariff",
    "has_access_tp",
    "n_ff_programs",
    "is_vip_freq",

    # Класс обслуживания
    "avg_cabin_class",
    "cabin_class_diff",

    # Популярные маршруты
    "is_popular_route",

    # Относительные метрики цены и сегментов внутри поиска
    "price_pct_rank",
    "is_cheapest",
    "price_from_median",
    "is_min_segments",
    "is_direct_cheapest",
]
cat_feat_add = [
    'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata',
    'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata',
    'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code',
    'legs0_segments0_flightNumber',
    'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code',
    'legs0_segments1_flightNumber',
    # Leg 1 segments 0-1
    'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata',
    'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_flightNumber',
    'legs1_segments1_aircraft_code', 'legs1_segments1_arrivalTo_airport_city_iata',
    'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_flightNumber',
]

In [None]:
data.select(feature_cols + FEATURE_COLUMNS_V1 + FEATURE_COLUMNS_v2 + cat_feat_add +['selected', 'ranker_id']).with_columns(pl.col('company_freq').cast(pl.Float32),
                                       pl.col('totalPrice').cast(pl.Float32),
                                       pl.col('route_distance_km').cast(pl.Float32),
                                       pl.col('frequentFlyer_n_programs').cast(pl.UInt8),
                                       pl.col('unique_ranker_count').cast(pl.UInt8),
                                       pl.col('hour_sin').cast(pl.Float32),
                                       pl.col('hour_cos').cast(pl.Float32),
                                       pl.col('wday_sin').cast(pl.Float32),
                                       pl.col('wday_cos').cast(pl.Float32),
                                       pl.col('twoway_route').cast(pl.UInt8),
                                       pl.col('avg_oneway_price').cast(pl.Float32),
                                       pl.col('total_duration_hours_leg0').cast(pl.Float32),
                                       pl.col('total_duration_hours_leg1').cast(pl.Float32),
                                       pl.col('legs0_segments0_aircraft_code_freq').cast(pl.Float32),
                                       pl.col('legs1_segments0_aircraft_code_freq').cast(pl.Float32),
                                       pl.col('miniRules0_monetaryAmount').cast(pl.Float32),
                                       pl.col('miniRules0_percentage').cast(pl.Float32),
                                       pl.col('miniRules1_monetaryAmount').cast(pl.Float32),
                                       pl.col('miniRules1_percentage').cast(pl.Float32),
                                       pl.col('legs0_segments0_baggageAllowance_quantity_kg').cast(pl.Float32),
                                       pl.col('legs1_segments0_baggageAllowance_quantity_kg').cast(pl.Float32)).write_parquet("data/clean_data_cut.parquet")

In [6]:
INTEGER_DTYPES = {
    pl.Int8, pl.Int16, pl.Int32, pl.Int64,
    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
}
FLOAT_DTYPES = {pl.Float32, pl.Float64}


def _pick_smallest_int_dtype(col_min: int, col_max: int) -> pl.DataType:
    """Return narrowest signed/unsigned integer dtype that holds [min, max]."""
    if col_min >= 0:                       # можно без знака
        if col_max <= 255:
            return pl.UInt8
        if col_max <= 65535:
            return pl.UInt16
        if col_max <= 4294967295:
            return pl.UInt32
        return pl.UInt64
    # signed диапазон
    if -128 <= col_min and col_max <= 127:
        return pl.Int8
    if -32768 <= col_min and col_max <= 32767:
        return pl.Int16
    if -2147483648 <= col_min and col_max <= 2147483647:
        return pl.Int32
    return pl.Int64


def downcast_numeric(df: pl.DataFrame, cast_floats: bool = True) -> pl.DataFrame:
    """Down-cast int / float columns к меньшему dtype, не теряя данных."""
    int_cols = [c for c in df.columns if df[c].dtype in INTEGER_DTYPES]
    float_cols = [c for c in df.columns if df[c].dtype in FLOAT_DTYPES]

    # 1-проходная статистика для целочисленных
    stats = (
        df.select(
            [pl.col(c).min().alias(f"{c}__min") for c in int_cols] +
            [pl.col(c).max().alias(f"{c}__max") for c in int_cols]
        ).to_dicts()[0]
        if int_cols else {}
    )

    cast_exprs = []

    # целочисленные
    for c in int_cols:
        new_dt = _pick_smallest_int_dtype(stats[f"{c}__min"], stats[f"{c}__max"])
        if new_dt != df[c].dtype:
            cast_exprs.append(pl.col(c).cast(new_dt))

    # вещественные
    if cast_floats:
        for c in float_cols:
            if df[c].dtype == pl.Float64:
                cast_exprs.append(pl.col(c).cast(pl.Float32))

    return df.with_columns(cast_exprs) if cast_exprs else df
subset = data.select(feature_cols + FEATURE_COLUMNS_V1 + FEATURE_COLUMNS_v2 + cat_feat_add + 
                     ["selected", "ranker_id"])
df_opt = downcast_numeric(subset)

In [16]:
data = pl.read_parquet("data/clean_data_cut.parquet")

In [7]:
df_opt.write_parquet("data/clean_data_cut.parquet")

In [17]:
feature_cols = ['company_count', 'company_freq', 'totalPrice', 'route_distance_km', 'frequentFlyer_n_programs', 'unique_ranker_count', 'hour_sin', 
                'hour_cos', 'wday_sin', 'wday_cos', 'twoway_route', 'avg_oneway_price', 'total_duration_hours_leg0', 'total_duration_hours_leg1', 'legs0_segments0_aircraft_code_freq',
                'legs1_segments0_aircraft_code_freq', 'legs0_airport_changes_count', 'legs1_airport_changes_count', 'frequentFlyer', 'isVip', 'miniRules0_monetaryAmount', 'miniRules0_percentage',
                'miniRules1_monetaryAmount', 'miniRules1_percentage', 'sex', 'total_segments_count', 'legs0_num_segments', 'legs1_num_segments', 'tariff_code_filled', 'nationality_cat', 'miniRules0_statusInfos_cat',
                'miniRules1_statusInfos_cat', 'pricingInfo_isAccessTP_cat', 'legs0_departureAt_period', 'legs0_arrivalAt_period', 'legs1_departureAt_period', 'legs1_arrivalAt_period',
                'legs0_segments0_baggageAllowance_quantity_kg', 'legs1_segments0_baggageAllowance_quantity_kg', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat']

cat_features_final = ['tariff_code_filled', 'nationality_cat', 'miniRules0_statusInfos_cat', 'miniRules1_statusInfos_cat', 'pricingInfo_isAccessTP_cat', 'legs0_departureAt_period', 'legs0_arrivalAt_period',
                      'legs1_departureAt_period', 'legs1_arrivalAt_period', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat', 'frequentFlyer']
#feature_cols = ['totalPrice']
#cat_features_final = []
#X = data.select(feature_cols)
#y = data.select("selected")            # Polars DataFrame with 1 col
#groups = data.select("ranker_id")      # Polars DataFrame with 1 col

In [6]:
n2 = rows

In [7]:

# 2) Encode your categoricals
data_xgb = X.with_columns([
    (pl.col(c).rank("dense") - 1)
      .fill_null(-1)
      .cast(pl.Int32)
      .alias(c)
    for c in cat_features_final
])

# 3) Carve out your held-out test set (optional)
n2 = rows
X_trval = data_xgb[:n2]
y_trval = y[:n2]
g_trval = groups[:n2]

X_test  = data_xgb[n2:]
y_test  = y[n2:]
g_test  = groups[n2:]

def get_group_sizes(ranker_ids: np.ndarray) -> np.ndarray:
    # unique in order of first appearance + their counts
    uniq, idx, counts = np.unique(ranker_ids, return_index=True, return_counts=True)
    return counts[np.argsort(idx)]


dtrain = xgb.DMatrix(
    X_trval,
    label=y_trval,
    group=get_group_sizes(g_trval),
    feature_names=data_xgb.columns
)


In [8]:

# 5. Set params to use pairwise ranking + ndcg@3 + histogram tree builder
params = {
    "objective":   "rank:pairwise",
    "eval_metric": "ndcg@3",
    "tree_method": "hist",      # 🔥 much faster
    "seed":        42,
    "n_jobs":      -1,
}

In [9]:
cv_std = np.std(pd.read_csv('model/cv_results.csv')['val-top@3'].values)
cv_std

np.float64(0.002217702949901177)

In [10]:
data['route_distance_km'].std()

1464.87890625

In [12]:
booster = xgb.Booster()
booster.load_model('model/base.json')

In [16]:
dval_base = xgb.DMatrix(data[:n2].select(['totalPrice', 'legs0_num_segments', 'company_count']).to_numpy(), 
                        feature_names=['totalPrice', 'legs0_num_segments', 'company_count'], 
                        group=get_group_sizes(g_trval))


array([ 0.45293006, -1.73744   , -1.7689474 , ..., -0.793912  ,
       -0.7519032 , -0.75896746], shape=(18145372,), dtype=float32)

In [37]:
booster.predict(dval_base)
residuals = data[:n2].select(['selected']).to_numpy().ravel().astype(np.float32) - booster.predict(dval_base)
feature_vals = data[:n2].select(['route_distance_km']).to_numpy().ravel()

In [38]:
residuals, feature_vals

(array([0.5470699 , 1.73744   , 1.7689474 , ..., 0.793912  , 0.7519032 ,
        0.75896746], shape=(18145372,), dtype=float32),
 array([1163.6287, 1163.6287, 1163.6287, ..., 1454.2382, 1454.2382,
        1454.2382], shape=(18145372,), dtype=float32))

In [39]:
corr_mat  = np.corrcoef(residuals, feature_vals)
print("σ(res):", np.nanstd(residuals),
      "σ(feat):", np.nanstd(feature_vals))
print("ρ =", corr_mat[0,1])

σ(res): 0.897735 σ(feat): 1432.1514
ρ = nan


In [41]:
if np.nanstd(residuals)==0 or np.nanstd(feature_vals)==0:
    corr = 0.0
else:
    corr = np.corrcoef(residuals, feature_vals)[0,1]

In [48]:
max(feature_vals)

np.float32(14524.84)

In [54]:
np.isfinite(feature_vals).sum()

np.int64(18145346)

In [49]:
print("residuals NaN/inf:", np.any(~np.isfinite(residuals)))
print("feature   NaN/inf:", np.any(~np.isfinite(feature_vals)))

# 2) Make absolutely sure they’re the same length and correspond row‐for‐row
print("shapes:", residuals.shape, feature_vals.shape)

# 3) Mask out any bad entries
mask = np.isfinite(residuals) & np.isfinite(feature_vals)
res_clean  = residuals[mask]
feat_clean = feature_vals[mask]

print("after mask, length =", len(res_clean))

# 4) Recompute their std to double‐check
print("σ(res):", np.std(res_clean), "σ(feat):", np.std(feat_clean))

# 5) Now compute Pearson on the clean vectors
if len(res_clean) < 2 or np.std(res_clean) == 0 or np.std(feat_clean) == 0:
    corr = 0.0
else:
    corr = np.corrcoef(res_clean, feat_clean)[0,1]

print("ρ =", corr)

residuals NaN/inf: False
feature   NaN/inf: True
shapes: (18145372,) (18145372,)
after mask, length = 18145346
σ(res): 0.8977355 σ(feat): 1432.1514
ρ = 0.6465344087730645


In [11]:
feature_list = ['totalPrice', 'legs0_num_segments', 'ranker_id', 'selected', 'company_count', 'route_distance_km']
res = evaluate_feature(data[:n2][feature_list], 
                       new_feature_name='route_distance_km', 
                       group_col='ranker_id', 
                       label_col='selected', 
                       baseline_model='model/base.json', 
                       params=params, 
                       sigma0=cv_std,
                       corr_threshold=0.00)
res

3558975 3558975
[[ 1. nan]
 [nan nan]]


{'corr': np.float64(nan),
 'warm_delta': None,
 'mini_delta': None,
 'decision': 'fail: corr≈0'}

In [25]:
res = train_base_model(data=data[:n2], 
                 features=feature_list, 
                 label_col='selected', 
                 group_col='ranker_id', 
                 params=params, 
                 num_boost_round=30, 
                 baseline_model_path='model/', 
                 seed=42, 
                 verbose_eval_size=5, 
                 full_cv = True)
res

0.2676111328016063

In [18]:
xgb_importance = final_model.get_score(importance_type='gain')
xgb_importance_df = pl.DataFrame(
    [{'feature': k, 'importance': v} for k, v in xgb_importance.items()]
).sort('importance', descending=bool(1))
print(xgb_importance_df.to_pandas().to_string())

      feature  importance
0  totalPrice  649.964111


In [13]:
data_clean = feature_prep(data)

['legs0_segments0_baggageAllowance_quantity',
 'legs0_segments1_baggageAllowance_quantity',
 'legs1_segments0_duration',
 'legs1_segments0_baggageAllowance_quantity',
 'legs1_duration',
 'legs0_segments1_duration',
 'legs1_segments1_baggageAllowance_quantity',
 'legs1_arrivalAt',
 'legs1_segments1_duration',
 'legs1_departureAt',
 'legs1_departureAt_weekday',
 'legs1_departureAt_hour',
 'legs1_arrivalAt_weekday',
 'legs1_arrivalAt_hour']

In [27]:
str_cols = [name 
            for name, dtype in zip(data_clean.columns, data_clean.dtypes) 
            if dtype not in {Int32, Int64, Int8, Float64, Boolean}]

In [28]:
data_clean = data_clean.drop(str_cols).drop(['Id'])

In [29]:
nulls_dict = data_clean.null_count().to_dicts()[0]
cols_with_null = [col for col, cnt in nulls_dict.items() if cnt > 0]
cols_with_null

['legs0_segments0_baggageAllowance_quantity',
 'legs0_segments1_baggageAllowance_quantity',
 'legs1_segments0_baggageAllowance_quantity',
 'legs1_segments1_baggageAllowance_quantity',
 'legs1_departureAt_weekday',
 'legs1_departureAt_hour',
 'legs1_arrivalAt_weekday',
 'legs1_arrivalAt_hour']

In [30]:
data_clean

legs0_segments0_baggageAllowance_quantity,legs0_segments1_cabinClass,legs0_segments1_seatsAvailable,legs0_segments1_baggageAllowance_quantity,legs1_segments1_cabinClass,legs0_segments0_cabinClass,miniRules1_monetaryAmount,legs0_segments0_marketingCarrier_code,legs1_segments0_arrivalTo_airport_iata,miniRules1_statusInfos,legs0_segments0_flightNumber,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments0_departureFrom_airport_iata,legs1_segments1_operatingCarrier_code,legs0_segments0_aircraft_code,legs1_segments0_baggageAllowance_quantity,selected,isVip,legs0_segments1_flightNumber,legs1_segments0_departureFrom_airport_iata,pricingInfo_isAccessTP,legs1_segments1_arrivalTo_airport_iata,legs1_segments1_departureFrom_airport_iata,legs0_segments1_marketingCarrier_code,legs0_segments1_aircraft_code,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments1_arrivalTo_airport_city_iata,legs1_segments0_aircraft_code,miniRules0_monetaryAmount,pricingInfo_passengerCount,legs1_segments0_baggageAllowance_weightMeasurementType,profileId,corporateTariffCode,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_operatingCarrier_code,legs1_segments0_cabinClass,legs1_segments1_baggageAllowance_quantity,legs0_segments1_departureFrom_airport_iata,legs1_segments1_aircraft_code,sex,bySelf,legs1_segments0_operatingCarrier_code,legs1_segments1_seatsAvailable,legs1_segments1_arrivalTo_airport_city_iata,legs1_segments0_seatsAvailable,legs1_segments1_baggageAllowance_weightMeasurementType,legs1_segments1_flightNumber,frequentFlyer,legs1_segments1_marketingCarrier_code,miniRules0_statusInfos,legs0_segments1_arrivalTo_airport_iata,legs1_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,taxes,nationality,legs0_segments1_operatingCarrier_code,companyID,legs1_segments0_flightNumber,legs0_segments0_seatsAvailable,legs1_segments0_marketingCarrier_code,isAccess3D,totalPrice,legs1_departureAt_weekday,legs1_departureAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_hour,legs0_departureAt_weekday,legs0_departureAt_hour,legs0_departureAt_kg,legs0_departureAt_units
f64,i32,f64,f64,i32,i32,f64,i32,i32,f64,i32,f64,i32,i32,i32,f64,i64,f64,i32,i32,f64,i32,i32,i32,i32,f64,i32,i32,f64,i64,f64,i64,i32,i32,i32,i32,f64,i32,i32,f64,bool,i32,f64,i32,f64,f64,i32,i32,i32,f64,i32,i32,i32,f64,i32,i32,i32,i32,f64,i32,f64,f64,i8,i8,i8,i8,i8,i8,i8,i8,f64,f64
1.0,-1,9.0,,-1,0,0.0,89,300,1.0,1256,0.0,347,-1,108,1.0,1,0.0,-1,150,1.0,-1,-1,-1,-1,0.0,-1,100,2800.0,1,0.0,2087645,-1,233,121,0,,-1,-1,1.0,true,105,9.0,-1,9.0,0.0,-1,109,-1,1.0,-1,279,256,370.0,36,-1,297,1146,9.0,81,0.0,16884.0,2,9,6,16,2,14,6,15,10.0,1.0
1.0,0,4.0,1.0,0,0,3500.0,123,245,1.0,4084,0.0,347,106,90,1.0,0,0.0,3557,150,1.0,179,216,118,88,0.0,204,84,2300.0,1,0.0,2087645,123,356,164,0,1.0,323,75,1.0,true,141,9.0,166,9.0,0.0,1983,109,94,1.0,233,224,383,2240.0,36,161,297,3111,4.0,113,1.0,51125.0,2,22,6,14,3,8,6,9,10.0,1.0
1.0,0,4.0,1.0,0,0,3500.0,123,245,1.0,4084,0.0,347,106,90,1.0,0,0.0,3557,150,1.0,179,216,118,88,0.0,204,84,2300.0,1,0.0,2087645,-1,356,164,0,1.0,323,75,1.0,true,141,9.0,166,9.0,0.0,1983,109,94,1.0,233,224,383,2240.0,36,161,297,3111,4.0,113,0.0,53695.0,2,22,6,14,3,8,6,9,10.0,1.0
1.0,0,4.0,1.0,0,0,0.0,123,245,1.0,4084,0.0,347,106,90,1.0,0,0.0,3557,150,1.0,179,216,118,88,0.0,204,84,0.0,1,0.0,2087645,123,356,164,0,1.0,323,75,1.0,true,141,9.0,166,9.0,0.0,1983,109,94,1.0,233,224,383,2240.0,36,161,297,3111,4.0,113,1.0,81880.0,2,22,6,14,3,8,6,9,10.0,1.0
1.0,0,4.0,1.0,0,0,0.0,123,245,1.0,4084,0.0,347,106,90,1.0,0,0.0,3557,150,1.0,179,216,118,88,0.0,204,84,0.0,1,0.0,2087645,-1,356,164,0,1.0,323,75,1.0,true,141,9.0,166,9.0,0.0,1983,109,94,1.0,233,224,383,2240.0,36,161,297,3111,4.0,113,0.0,86070.0,2,22,6,14,3,8,6,9,10.0,1.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2.0,-1,9.0,,-1,0,0.0,128,-1,1.0,739,0.0,330,-1,12,,0,0.0,-1,-1,1.0,-1,-1,-1,-1,0.0,-1,-1,0.0,1,0.0,3647580,65,21,171,-1,,-1,-1,1.0,false,-1,9.0,-1,5.0,0.0,-1,-1,-1,1.0,-1,-1,22,566.0,36,-1,296,-1,4.0,-1,1.0,16486.0,,,3,12,,,3,9,10.0,2.0
1.0,-1,9.0,,-1,0,2800.0,128,-1,1.0,188,0.0,330,-1,12,,0,0.0,-1,-1,1.0,-1,-1,-1,-1,0.0,-1,-1,2800.0,1,0.0,3647580,65,21,171,-1,,-1,-1,1.0,false,-1,9.0,-1,5.0,0.0,-1,-1,-1,1.0,-1,-1,22,566.0,36,-1,296,-1,4.0,-1,1.0,11701.0,,,4,1,,,3,21,10.0,1.0
2.0,-1,9.0,,-1,0,0.0,128,-1,1.0,188,0.0,330,-1,12,,0,0.0,-1,-1,1.0,-1,-1,-1,-1,0.0,-1,-1,0.0,1,0.0,3647580,65,21,171,-1,,-1,-1,1.0,false,-1,9.0,-1,5.0,0.0,-1,-1,-1,1.0,-1,-1,22,566.0,36,-1,296,-1,4.0,-1,1.0,16486.0,,,4,1,,,3,21,10.0,2.0
1.0,-1,9.0,,-1,0,2800.0,128,-1,1.0,190,0.0,330,-1,13,,0,0.0,-1,-1,1.0,-1,-1,-1,-1,0.0,-1,-1,2800.0,1,0.0,3647580,65,21,171,-1,,-1,-1,1.0,false,-1,9.0,-1,5.0,0.0,-1,-1,-1,1.0,-1,-1,22,566.0,36,-1,296,-1,9.0,-1,1.0,14431.0,,,3,18,,,3,15,10.0,1.0


In [22]:
def filtering(dataset: pl.DataFrame, proportion: int = 0.1):
    '''Filter out portion of data'''
    total_n = dataset.height
    cond = pl.col("selected") == 1
    df_filt = dataset.filter(cond)
    n_filt = df_filt.height

    # 4) сколько всего строк мы хотим на выходе
    target_n = int(total_n * proportion)

    # 5) сколько строк надо добрать из “не-подошедших”
    #    если n_filt >= target_n — оставляем только df_filt
    n_to_sample = max(target_n - n_filt, 0)

    # 6) теперь из оставшихся (где условие ложно) берём n_to_sample случайных
    df_rest = dataset.filter(~cond)
    df_rest_samp = df_rest.sample(n=n_to_sample)

    # 7) объединяем результирующий набор
    result = pl.concat([df_filt, df_rest_samp])

    return result

In [23]:
n1 = 16487352 # split train to train and val (10%) in time
n2 = train_clean.height
groups = train_raw.select('ranker_id')
X = train_clean.drop('selected')
y = train_clean.select('selected')

data_xgb_tr, data_xgb_va, data_xgb_te = X[:n1], X[n1:n2], X[n2:]
y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

group_sizes_tr = groups_tr.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
group_sizes_va = groups_va.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
group_sizes_te = groups_te.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
dtrain = xgb.DMatrix(data_xgb_tr, label=y_tr, group=group_sizes_tr, feature_names=X.columns)
dval   = xgb.DMatrix(data_xgb_va, label=y_va, group=group_sizes_va, feature_names=X.columns)
dtest  = xgb.DMatrix(data_xgb_te, label=y_te, group=group_sizes_te, feature_names=X.columns)
RANDOM_STATE = 42
final_xgb_params = {'objective': 'rank:pairwise', 'eval_metric': 'ndcg@3', 
                    'max_depth': 8, 'min_child_weight': 14, 'subsample': 0.9, 
                    'colsample_bytree': 1.0, 'lambda': 3.5330891736457763 , 
                    'learning_rate': 0.0521879929228514 ,
                    'seed': RANDOM_STATE, 'n_jobs': -1}
test_clean = test_clean.fill_null(0)
X_test = test_clean
groups = test_raw.select('ranker_id')
group_sizes_te = groups.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
dtest  = xgb.DMatrix(X_test, group=group_sizes_te, feature_names=X_test.columns)

In [33]:
X_test.select(train_clean.columns)

ColumnNotFoundError: selected

In [26]:
xgb_model = xgb.train(final_xgb_params,
    dtrain,
    num_boost_round=1,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=100,
    verbose_eval=50
)

[0]	train-ndcg@3:0.75208	val-ndcg@3:0.77892


In [27]:
xgb_test_preds = xgb_model.predict(dtest)

ValueError: feature_names mismatch: ['pricingInfo_isAccessTP', 'legs1_segments1_baggageAllowance_weightMeasurementType', 'legs0_segments0_cabinClass', 'companyID', 'legs1_segments1_operatingCarrier_code', 'legs1_segments0_seatsAvailable', 'legs0_segments1_seatsAvailable', 'legs0_segments0_baggageAllowance_quantity', 'legs0_segments1_baggageAllowance_quantity', 'legs1_segments1_marketingCarrier_code', 'legs0_segments0_seatsAvailable', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_flightNumber', 'legs0_segments1_departureFrom_airport_iata', 'legs1_segments1_aircraft_code', 'legs0_segments0_marketingCarrier_code', 'legs1_segments1_seatsAvailable', 'sex', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs1_segments0_arrivalTo_airport_city_iata', 'legs1_segments0_operatingCarrier_code', 'legs1_segments0_aircraft_code', 'legs0_segments1_marketingCarrier_code', 'legs0_segments1_aircraft_code', 'profileId', 'legs1_segments1_flightNumber', 'legs1_segments1_baggageAllowance_quantity', 'miniRules0_statusInfos', 'legs1_segments0_departureFrom_airport_iata', 'nationality', 'legs1_segments1_arrivalTo_airport_iata', 'taxes', 'legs0_segments0_arrivalTo_airport_iata', 'legs1_segments0_arrivalTo_airport_iata', 'legs0_segments0_operatingCarrier_code', 'legs1_segments0_cabinClass', 'legs0_segments0_flightNumber', 'pricingInfo_passengerCount', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments1_baggageAllowance_weightMeasurementType', 'miniRules1_statusInfos', 'legs0_segments0_departureFrom_airport_iata', 'frequentFlyer', 'miniRules1_monetaryAmount', 'legs0_segments0_aircraft_code', 'legs1_segments1_cabinClass', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_flightNumber', 'legs1_segments0_marketingCarrier_code', 'legs1_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_operatingCarrier_code', 'bySelf', 'legs0_segments1_arrivalTo_airport_iata', 'totalPrice', 'legs1_segments1_departureFrom_airport_iata', 'legs1_segments0_baggageAllowance_quantity', 'miniRules0_monetaryAmount', 'isAccess3D', 'legs0_segments1_cabinClass', 'isVip', 'corporateTariffCode', 'legs1_arrivalAt_weekday', 'legs1_arrivalAt_hour', 'legs0_arrivalAt_weekday', 'legs0_arrivalAt_hour', 'legs0_departureAt_weekday', 'legs0_departureAt_hour', 'legs1_departureAt_weekday', 'legs1_departureAt_hour', 'legs1_departureAt_kg', 'legs1_departureAt_units'] ['legs1_segments0_departureFrom_airport_iata', 'pricingInfo_isAccessTP', 'nationality', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_seatsAvailable', 'legs0_segments1_flightNumber', 'legs1_segments1_baggageAllowance_weightMeasurementType', 'sex', 'taxes', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs1_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_cabinClass', 'legs1_segments0_marketingCarrier_code', 'companyID', 'legs1_segments1_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs1_segments0_seatsAvailable', 'legs1_segments1_operatingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'legs1_segments0_arrivalTo_airport_iata', 'legs0_segments0_operatingCarrier_code', 'legs1_segments0_aircraft_code', 'legs0_segments1_seatsAvailable', 'legs0_segments1_operatingCarrier_code', 'legs1_segments0_cabinClass', 'legs0_segments1_marketingCarrier_code', 'legs0_segments0_flightNumber', 'pricingInfo_passengerCount', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments1_baggageAllowance_weightMeasurementType', 'bySelf', 'legs0_segments1_aircraft_code', 'legs0_segments0_baggageAllowance_quantity', 'legs0_segments1_arrivalTo_airport_iata', 'totalPrice', 'miniRules1_statusInfos', 'legs0_segments0_departureFrom_airport_iata', 'legs1_segments0_baggageAllowance_quantity', 'legs1_segments1_departureFrom_airport_iata', 'miniRules0_monetaryAmount', 'legs0_segments1_baggageAllowance_quantity', 'isAccess3D', 'legs0_segments1_cabinClass', 'frequentFlyer', 'legs1_segments1_marketingCarrier_code', 'profileId', 'miniRules1_monetaryAmount', 'legs0_segments0_seatsAvailable', 'legs0_segments0_aircraft_code', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments1_flightNumber', 'isVip', 'legs1_segments1_baggageAllowance_quantity', 'legs1_segments0_flightNumber', 'legs0_segments1_departureFrom_airport_iata', 'legs1_segments1_cabinClass', 'corporateTariffCode', 'legs0_segments0_marketingCarrier_code', 'legs1_segments1_aircraft_code', 'miniRules0_statusInfos', 'legs1_arrivalAt_weekday', 'legs1_arrivalAt_hour', 'legs0_arrivalAt_weekday', 'legs0_arrivalAt_hour', 'legs0_departureAt_weekday', 'legs0_departureAt_hour', 'legs1_departureAt_weekday', 'legs1_departureAt_hour', 'legs1_departureAt_kg', 'legs1_departureAt_units']

In [85]:
dtrain.feature_names

['legs0_segments1_aircraft_code',
 'legs0_segments1_cabinClass',
 'profileId',
 'isAccess3D',
 'totalPrice',
 'legs0_segments0_arrivalTo_airport_city_iata',
 'miniRules0_monetaryAmount',
 'legs1_segments0_arrivalTo_airport_iata',
 'legs1_segments1_aircraft_code',
 'nationality',
 'miniRules1_monetaryAmount',
 'legs1_segments0_marketingCarrier_code',
 'legs1_segments1_flightNumber',
 'legs0_segments0_seatsAvailable',
 'legs1_segments1_baggageAllowance_weightMeasurementType',
 'legs0_segments1_departureFrom_airport_iata',
 'legs1_segments0_seatsAvailable',
 'miniRules0_statusInfos',
 'legs0_segments1_baggageAllowance_weightMeasurementType',
 'legs0_segments1_baggageAllowance_quantity',
 'legs0_segments1_flightNumber',
 'legs0_segments1_arrivalTo_airport_iata',
 'legs0_segments1_operatingCarrier_code',
 'legs1_segments0_aircraft_code',
 'sex',
 'legs0_segments0_operatingCarrier_code',
 'Id',
 'legs0_segments1_seatsAvailable',
 'legs0_segments0_arrivalTo_airport_iata',
 'pricingInfo_passen

In [86]:
dtest.feature_names

['legs1_segments0_baggageAllowance_weightMeasurementType',
 'legs1_segments1_cabinClass',
 'companyID',
 'legs0_segments0_baggageAllowance_quantity',
 'legs0_segments1_marketingCarrier_code',
 'legs0_segments1_departureFrom_airport_iata',
 'legs0_segments1_aircraft_code',
 'legs1_segments0_seatsAvailable',
 '__index_level_0__',
 'miniRules0_statusInfos',
 'legs1_segments0_operatingCarrier_code',
 'legs0_segments0_cabinClass',
 'legs0_segments1_baggageAllowance_weightMeasurementType',
 'legs1_segments0_baggageAllowance_quantity',
 'legs1_segments0_flightNumber',
 'legs0_segments1_cabinClass',
 'legs0_segments1_arrivalTo_airport_city_iata',
 'isAccess3D',
 'legs1_segments1_arrivalTo_airport_city_iata',
 'profileId',
 'legs1_segments1_marketingCarrier_code',
 'isVip',
 'miniRules1_statusInfos',
 'legs0_segments1_baggageAllowance_quantity',
 'totalPrice',
 'legs0_segments1_flightNumber',
 'legs0_segments1_arrivalTo_airport_iata',
 'legs0_segments0_arrivalTo_airport_city_iata',
 'legs0_segm