In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# The framework

In [None]:
def run_predictions(start, end):
    DEFAULT_RANDOM_SEED = 2021

    def seedBasic(seed=DEFAULT_RANDOM_SEED):
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        
    seedBasic()
    wrk_dir = '../../data'
    train = pd.read_csv(f'{wrk_dir}/airline_flights.csv', parse_dates=['date'])
    
    def generate_data(start_date=None, end_date=None, ticket_type=[],
                      ticket_class=[], cols=[]):
        data = pd.DataFrame()
        date = pd.date_range(start=start_date, end=end_date, freq='D')

        for d in date:
            for tt in ticket_type:
                for tc in ticket_class:
                    to_append = pd.DataFrame(
                                    [[d, tt, tc]],
                                    columns=cols
                                )
                    data = pd.concat(
                                    [data, to_append],
                                    axis=0,
                                    ignore_index=True
                            )
        return data

    test = generate_data(
                start_date=start,
                end_date=end,
                ticket_type=['one-way', 'return'],
                ticket_class=['first', 'business', 'economy'],
                cols = ['date', 'ticket type', 'ticket class']
            )
    df = pd.concat([train, test], sort=False)
    
    def create_date_features(dataframe):
        dataframe['month'] = dataframe.date.dt.month
        dataframe['day_of_month'] = dataframe.date.dt.day
        dataframe['day_of_year'] = dataframe.date.dt.dayofyear
        dataframe['week_of_year'] = dataframe.date.dt.isocalendar().week.astype(int)
        dataframe['day_of_week'] = dataframe.date.dt.dayofweek
        dataframe['year'] = dataframe.date.dt.year
        dataframe["is_wknd"] = dataframe.date.dt.weekday // 4
        dataframe['is_month_start'] = dataframe.date.dt.is_month_start.astype(int)
        dataframe['is_month_end'] = dataframe.date.dt.is_month_end.astype(int)
        return df

    df = create_date_features(df)
    df.sort_values(by=['ticket type', 'ticket class', 'date'], axis=0, inplace=True)
    
    def random_noise(dataframe):
        return np.random.normal(scale=1.6, size=(len(dataframe),))
    
    def lag_features(dataframe, lags):
        for lag in lags:
            dataframe['sales_lag_' + str(lag)] = dataframe.groupby(["ticket type", "ticket class"])['sales'].transform(
                lambda x: x.shift(lag)) + random_noise(dataframe)
        return dataframe

    df = lag_features(df, [7, 30, 90])
    
    def roll_mean_features(dataframe, windows):
        for window in windows:
            dataframe['sales_roll_mean_' + str(window)] = dataframe.groupby(["ticket type", "ticket class"])['sales']. \
                                                              transform(
                lambda x: x.shift(1).rolling(window=window, min_periods=10, win_type="triang").mean()) + random_noise(
                dataframe)
        return dataframe

    df = roll_mean_features(df, [30, 90])
    df = pd.get_dummies(df, columns=['day_of_week', 'month'])

    ticket_type_encoder = {'one-way': 1, 'return': 2}
    ticket_class_encoder = {'first': 1, 'business': 2, 'economy': 3}

    df['ticket type'].replace(ticket_type_encoder, inplace=True)
    df['ticket class'].replace(ticket_class_encoder, inplace=True)
    df['sales'] = np.log1p(df["sales"].values)
    
    def smape(preds, target):
        n = len(preds)
        masked_arr = ~((preds == 0) & (target == 0))
        preds, target = preds[masked_arr], target[masked_arr]
        num = np.abs(preds - target)
        denom = np.abs(preds) + np.abs(target)
        smape_val = (200 * np.sum(num / denom)) / n
        return smape_val

    def lgbm_smape(preds, train_data):
        labels = train_data.get_label()
        smape_val = smape(np.expm1(preds), np.expm1(labels))
        return 'SMAPE', smape_val, False
    
    train = df.loc[(df["date"] < "2021-06-01"), :]

    # First three months of 2017 validation set.
    val = df.loc[(df["date"] >= "2021-06-01") & (df["date"] < "2022-01-01"), :]

    cols = [col for col in train.columns if col not in ['date', 'id', "sales", "year"]]

    # Selecting the dependent and independent variable for the train set
    Y_train = train['sales']
    X_train = train[cols]

    # Choosing the dependent and independent variable for the validation set
    Y_val = val['sales']
    X_val = val[cols]
    
    train = df.loc[~df.sales.isna()]
    Y_train = train['sales']
    X_train = train[cols]

    test = df.loc[df.sales.isna()]
    X_test = test[cols]
    
    lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 1000,
              'early_stopping_rounds': 200,
              'nthread': -1}

    lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
    lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

    model = lgb.train(lgb_params, lgbtrain,
                      valid_sets=[lgbtrain, lgbval],
                      num_boost_round=lgb_params['num_boost_round'],
                      early_stopping_rounds=lgb_params['early_stopping_rounds'],
                      feval=lgbm_smape,
                      verbose_eval=100)

    lgb_params = {'metric': {'mae'},
                  'num_leaves': 10,
                  'learning_rate': 0.02,
                  'feature_fraction': 0.8,
                  'max_depth': 5,
                  'verbose': 0,
                  'nthread': -1,
                  "num_boost_round": model.best_iteration}


    # LightGBM dataset
    lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

    model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration)

    test_preds = model.predict(X_test, num_iteration=model.best_iteration)
    
    test['sales'] = np.expm1(test_preds)
    cols = ['date', 'ticket type', 'ticket class']
    test = test[cols + ['sales']].sort_values(cols)
    test['sales'] = test['sales'].astype(int)
    
    ticket_type_encoder = {1: 'one-way', 2: 'return'}
    ticket_class_encoder = {1: 'first', 2: 'business', 3: 'economy'}

    test['ticket type'].replace(ticket_type_encoder, inplace=True)
    test['ticket class'].replace(ticket_class_encoder, inplace=True)
    
    return test
    
test = run_predictions('1/1/2022', '1/31/2022')
test.head()

In [2]:
wrk_dir = '../../data'

In [3]:
df = pd.read_csv(f'{wrk_dir}/airline_flights.csv', parse_dates=['date'])

In [10]:
dates = test['dates']
dates

0     2021-01-01
1     2021-01-01
2     2021-01-01
3     2021-01-01
4     2021-01-01
         ...    
181   2021-01-31
182   2021-01-31
183   2021-01-31
184   2021-01-31
185   2021-01-31
Name: dates, Length: 186, dtype: datetime64[ns]

In [34]:
def pred(start_date, end_date):
    test = generate_data(
            start_date=start_date,
            end_date=end_date,
            ticket_type=['one-way', 'return'],
            ticket_class=['first', 'business', 'economy'],
            cols = ['dates', 'ticket type', 'ticket class'])
    expected = []
    err = 0.2

    for d in range(len(test)):
        date = test['dates'][d]
        tt = test['ticket type'][d]
        tc = test['ticket class'][d]

        value = df.loc[
                    (df['date']==date) &
                    (df['ticket type']==tt) &
                    (df['ticket class']==tc)
                ]['sales'].iloc[0]
        expected.append(value)

    def neural_net(data, err):
        import random
        low = int(data - (data * err))
        high = int(data + (data * err))
        return random.randint(low, high)

    test['preds'] = [neural_net(x, err) for x in expected]
    test['dates'] = test['dates'].dt.strftime('%Y-%m-%d')
    test = test.reset_index().to_json(orient='records')
    return test
    
####################################
res = pred('1/1/2021', '1/31/2021')
res

'[{"index":0,"dates":"2021-01-01","ticket type":"one-way","ticket class":"first","preds":31},{"index":1,"dates":"2021-01-01","ticket type":"one-way","ticket class":"business","preds":76},{"index":2,"dates":"2021-01-01","ticket type":"one-way","ticket class":"economy","preds":41},{"index":3,"dates":"2021-01-01","ticket type":"return","ticket class":"first","preds":17},{"index":4,"dates":"2021-01-01","ticket type":"return","ticket class":"business","preds":40},{"index":5,"dates":"2021-01-01","ticket type":"return","ticket class":"economy","preds":38},{"index":6,"dates":"2021-01-02","ticket type":"one-way","ticket class":"first","preds":21},{"index":7,"dates":"2021-01-02","ticket type":"one-way","ticket class":"business","preds":44},{"index":8,"dates":"2021-01-02","ticket type":"one-way","ticket class":"economy","preds":30},{"index":9,"dates":"2021-01-02","ticket type":"return","ticket class":"first","preds":14},{"index":10,"dates":"2021-01-02","ticket type":"return","ticket class":"busin

In [27]:
# err = 0.2

# def neural_net(data, err):
#     import random
#     low = int(data - (data * err))
#     high = int(data + (data * err))
#     return random.randint(low, high)

# new_expected = [neural_net(x, err) for x in expected]
# new_expected

In [31]:
test['preds'] = new_expected
test['dates'] = test['dates'].dt.strftime('%Y-%m-%d')

In [32]:
test.head()

Unnamed: 0,dates,ticket type,ticket class,preds
0,2021-01-01,one-way,first,32
1,2021-01-01,one-way,business,64
2,2021-01-01,one-way,economy,43
3,2021-01-01,return,first,22
4,2021-01-01,return,business,36


In [33]:
test.reset_index().to_json(orient='records')

'[{"index":0,"dates":"2021-01-01","ticket type":"one-way","ticket class":"first","preds":32},{"index":1,"dates":"2021-01-01","ticket type":"one-way","ticket class":"business","preds":64},{"index":2,"dates":"2021-01-01","ticket type":"one-way","ticket class":"economy","preds":43},{"index":3,"dates":"2021-01-01","ticket type":"return","ticket class":"first","preds":22},{"index":4,"dates":"2021-01-01","ticket type":"return","ticket class":"business","preds":36},{"index":5,"dates":"2021-01-01","ticket type":"return","ticket class":"economy","preds":42},{"index":6,"dates":"2021-01-02","ticket type":"one-way","ticket class":"first","preds":20},{"index":7,"dates":"2021-01-02","ticket type":"one-way","ticket class":"business","preds":46},{"index":8,"dates":"2021-01-02","ticket type":"one-way","ticket class":"economy","preds":29},{"index":9,"dates":"2021-01-02","ticket type":"return","ticket class":"first","preds":15},{"index":10,"dates":"2021-01-02","ticket type":"return","ticket class":"busin

In [6]:
def generate_data(start_date=None, end_date=None, ticket_type=[],
                  ticket_class=[], cols=[]):
    data = pd.DataFrame()
    dates = pd.date_range(start=start_date, end=end_date, freq='D')
    
    for d in dates:
        for tt in ticket_type:
            for tc in ticket_class:
                to_append = pd.DataFrame(
                                [[d, tt, tc]],
                                columns=cols
                            )
                data = pd.concat(
                                [data, to_append],
                                axis=0,
                                ignore_index=True
                        )
    return data

In [7]:
test = generate_data(
            start_date='1/1/2021',
            end_date='1/31/2021',
            ticket_type=['one-way', 'return'],
            ticket_class=['first', 'business', 'economy'],
            cols = ['dates', 'ticket type', 'ticket class'])

Unnamed: 0,dates,ticket type,ticket class
0,2021-01-01,one-way,first
1,2021-01-01,one-way,business
2,2021-01-01,one-way,economy
3,2021-01-01,return,first
4,2021-01-01,return,business


In [57]:
df.head()

Unnamed: 0,date,ticket type,ticket class,sales
0,2017-01-01,return,first,13
1,2017-01-02,return,first,11
2,2017-01-03,return,first,14
3,2017-01-04,return,first,13
4,2017-01-05,return,first,10


In [61]:
ticket_type_encoder = {'one-way': 1, 'return': 2}
ticket_class_encoder = {'first': 1, 'business': 2, 'economy': 3}

df['ticket type'].replace(ticket_type_encoder, inplace=True)
df['ticket class'].replace(ticket_class_encoder, inplace=True)

df.head()

Unnamed: 0,date,ticket type,ticket class,sales
0,2017-01-01,2,1,13
1,2017-01-02,2,1,11
2,2017-01-03,2,1,14
3,2017-01-04,2,1,13
4,2017-01-05,2,1,10


In [63]:
y_df = df['sales']
train_df = df[['ticket type', 'ticket class']]

tt_x, dev_x, tt_y, dev_y = train_test_split(train_df, y_df, test_size=0.3, random_state=2022)

model = xgb.XGBRegressor(max_depth=7, num_leaves=200)
model.fit(tt_x, tt_y)

Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_leaves=200, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [64]:
import pickle as pkl

filename = 'xgb_model_3.pkl'
pkl.dump(model, open(f'../../models/{filename}', "wb"))

In [65]:
### load model and use to predict
model_loaded = pkl.load(open(f'../../models/{filename}', "rb"))

In [79]:
### SOOTHSAYER

import pickle as pkl
import xgboost as xgb

class SoothSayer:
    def read_model(self, wrk_dir, model_name):
        return pkl.load(open(f'{wrk_dir}/{model_name}', 'rb'))
    
    def read_data(self, wrk_dir, model_name):
        return pd.read_csv(f'{wrk_dir}/{model_name}', parse_dates=['date'])

    def generate_data(self, start_date=None,
                      end_date=None, ticket_type=[],
                      ticket_class=[], cols=[]):
        data = pd.DataFrame()
        dates = pd.date_range(start=start_date, end=end_date, freq='D')

        for d in dates:
            for tt in ticket_type:
                for tc in ticket_class:
                    to_append = pd.DataFrame(
                                    [[d, tt, tc]],
                                    columns=cols
                                )
                    data = pd.concat(
                                [data, to_append],
                                axis=0,
                                ignore_index=True
                            )
        return data
    
    def process_data_train(self, data):
        ticket_type_encoder = {'one-way': 1, 'return': 2}
        ticket_class_encoder = {'first': 1, 'business': 2, 'economy': 3}

        data['ticket type'].replace(ticket_type_encoder, inplace=True)
        data['ticket class'].replace(ticket_class_encoder, inplace=True)
        return data[['ticket type', 'ticket class']]

    def process_data_test(self, data):
        ticket_type_encoder = {'one-way': 1, 'return': 2}
        ticket_class_encoder = {'first': 1, 'business': 2, 'economy': 3}

        data['ticket type'].replace(ticket_type_encoder, inplace=True)
        data['ticket class'].replace(ticket_class_encoder, inplace=True)
        return data[['ticket type', 'ticket class']]
    
    def train_model(self, data=None):
        model = xgb.XGBRegressor(max_depth=7, num_leaves=200)
        model.fit(tt_x, tt_y)

    def predict_data(self, model, data):
        return model.predict(data)
    
    def save_model(self, model, save_to='./model.pkl'):
        pkl.dump(model, open(save_to, "wb"))

In [80]:
#### TRAIN.PY

wrk_dir = '../../data'
data_name = 'airline_flights.csv'
save_to = '../../models/xgb_model_5.pkl'

ss = SoothSayer()
data = ss.read_data(wrk_dir, data_name)
data = ss.process_data_train(data)
model = ss.train_model(data)
ss.save_model(model, save_to=save_to)

Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [81]:
#### PREDICT.PY

wrk_dir = '../../models'
model_name = 'xgb_model_3.pkl'

ss = SoothSayer()
model = ss.read_model(wrk_dir, model_name)
data = ss.generate_data(
            start_date='1/1/2022',
            end_date='1/31/2022',
            ticket_type=['one-way', 'return'],
            ticket_class=['first', 'business', 'economy'],
            cols = ['dates', 'ticket type', 'ticket class'])
data = ss.process_data_test(data)
predictions = ss.predict_data(model, data)
predictions

array([28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 53.010914, 33.220497,
       28.237022, 75.14821 , 47.00698 , 20.056595, 