# **SETUP**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **LIBRARY**

In [None]:
import datetime
import numpy as np
import pandas as pd
import sklearn
import lightgbm as lgb
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Record the library versions this notebook was executed with.
for label, module in (
    ("pandas", pd),
    ("numpy", np),
    ("scikit-learn", sklearn),
    ("lightgbm", lgb),
):
    print(f"{label}==", module.__version__)

pandas== 1.3.5
numpy== 1.21.6
scikit-learn== 1.0.2
lightgbm== 2.2.3


In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as the square root of scikit-learn's `mean_squared_error`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# **Dataset**

In [None]:
# Folder (with trailing slash) that holds the competition CSV files.
csv_path = "/content/drive/MyDrive/TrailblazersQualificationChallenge/"

# Read the three CSVs from that folder: training rows, test rows and the sample submission.
train, test, sample_sub = (
    pd.read_csv(f'{csv_path}{stem}.csv')
    for stem in ('Train', 'Test', 'SampleSubmission')
)

# **Feature Engineering**

In [None]:
class ProcessData:
    """Builds the model feature matrix from the raw train and test frames."""

    # Identifier columns and target statistics that are never used as model inputs.
    _NON_FEATURE_COLS = ('Date', 'target_count', 'target_min', 'Place_ID X Date',
                         'target_variance', 'Place_ID', 'target_max', 'target')
    # Calendar attributes that receive a sin/cos encoding.
    _CYCLIC_ATTRS = ('dayofweek', 'weekofyear', 'days_in_month', 'dayofyear')
    # Calendar attributes copied onto the frame as plain columns.
    _TIME_ATTRS = ('day', 'month', 'week', 'dayofweek',
                   'weekofyear', 'days_in_month',
                   'is_month_start', 'is_month_end', 'dayofyear')

    @staticmethod
    def _calendar_attr(dates, attr):
        """Return the `.dt` attribute `attr` of a datetime Series.

        `.dt.week` / `.dt.weekofyear` are deprecated (removed in pandas 2.0), so the
        ISO week is read through `isocalendar()` and cast back to a plain numeric dtype.
        """
        if attr in ('week', 'weekofyear'):
            week = dates.dt.isocalendar().week
            # Nullable UInt32 -> int64 (float64 when missing dates are present).
            return week.astype('float64') if week.isna().any() else week.astype('int64')
        return getattr(dates.dt, attr)

    def preprocess(self, train, test):
        """Create lag, lead and calendar features on the stacked train + test frame.

        Parameters
        ----------
        train, test : pandas.DataFrame
            Must contain 'Place_ID' and 'Date' ('%Y-%m-%d' strings or datetimes);
            `train` must also contain 'target'. Neither frame is modified.

        Returns
        -------
        train, test : pandas.DataFrame
            The first `len(train)` and the remaining rows of the engineered frame,
            each with a fresh 0..n-1 index.
        target : pandas.Series
            The 'target' column of the returned train frame.
        features : list of str
            Every engineered column that is not an identifier or target statistic.
        """
        import math  # function-local, as in the original cell

        LABEL = 'target'
        n_train = train.shape[0]

        # Parse dates on new frames so the caller's inputs are left untouched.
        train = train.assign(Date=pd.to_datetime(train['Date'], format='%Y-%m-%d'))
        test = test.assign(Date=pd.to_datetime(test['Date'], format='%Y-%m-%d'))

        df = pd.concat([train, test]).reset_index(drop=True)

        # Raw input columns; only these are lagged/led per place further below.
        features = [c for c in df.columns if c not in self._NON_FEATURE_COLS]
        df['PlaceID_Freq'] = df['Place_ID'].map(df['Place_ID'].value_counts())

        # Target in global date order (all places interleaved). The order depends only
        # on 'Date', so it is computed once instead of once per generated column.
        # NOTE(review): these shifts are by *rows* of the date-sorted frame, not by
        # days within a Place_ID -- confirm this is intended.
        target_by_date = df.sort_values(by='Date')[LABEL]
        filled_fwd = target_by_date.ffill()
        filled_bwd = target_by_date.bfill()
        for i in range(1, 15):
            df[f'target_previous_{i}'] = filled_fwd.shift(i).sort_index()
            df[f'target_next_{i}'] = filled_bwd.shift(-i).sort_index()

        # NOTE(review): 'next_exp' is built from shift(+i) and 'prev_exp' from
        # shift(-i); the names look swapped but are kept so column names are stable.
        for i in range(1, 10):
            df[f'target_next_exp_{i}'] = (
                target_by_date.shift(i).expanding().mean().ffill().sort_index())
            df[f'target_prev_exp_{i}'] = (
                target_by_date.shift(-i).expanding().mean().bfill().sort_index())

        # Cyclical encodings of the calendar attributes of Date + 1 .. Date + 9 days.
        # Column prefix 'Date{k}' corresponds to an offset of k-1 days.
        for day in range(1, 10):
            prefix = 'Date' + str(day + 1)
            shifted_date = df['Date'] + datetime.timedelta(days=day)
            for attr in self._CYCLIC_ATTRS:
                # Fix: read the attribute from the shifted date. The original read it
                # from df['Date'], so all nine offsets produced identical columns.
                values = self._calendar_attr(shifted_date, attr)
                angle = 2 * math.pi * values / values.max()
                df[prefix + "_" + attr + "_sin"] = np.sin(angle)
                df[prefix + "_" + attr + "_cos"] = np.cos(angle)

        # Per-place lags (prev) and leads (next) of every raw feature, plus the
        # difference between the current value and the shifted one.
        base = df[features]
        by_place = (df[['Place_ID', 'Date'] + features]
                    .sort_values(by='Date')
                    .groupby('Place_ID')[features])
        extra_blocks = []
        for suffix, sign in (('prev', 1), ('next', -1)):
            for i in range(1, 11):
                shifted = by_place.shift(sign * i).sort_index()
                diff = base - shifted  # computed before renaming so columns align
                shifted.columns = [c + f'_{suffix}_{i}' for c in shifted.columns]
                diff.columns = [c + f'_{suffix}_diff_{i}' for c in diff.columns]
                extra_blocks.extend([shifted, diff])
        # One concat instead of twenty keeps the same column order without
        # re-copying the growing frame on every iteration.
        df = pd.concat([df] + extra_blocks, axis=1)

        for attr in self._TIME_ATTRS:
            df[attr] = self._calendar_attr(df['Date'], attr)
        df['is_weekend'] = (df['dayofweek'] >= 5) * 1
        # Despite the names these are day-of-month modulo / integer-divided by 15.
        df['quarter'] = df['day'] % 15
        df['which_quarter'] = df['day'] // 15

        for attr in self._CYCLIC_ATTRS:
            angle = 2 * math.pi * df[attr] / df[attr].max()
            df["Current_Date_" + attr + "_sin"] = np.sin(angle)
            df["Current_Date_" + attr + "_cos"] = np.cos(angle)

        features = [c for c in df.columns if c not in self._NON_FEATURE_COLS]
        train = df[:n_train].reset_index(drop=True)
        test = df[n_train:].reset_index(drop=True)
        target = train[LABEL]

        return train, test, target, features

In [None]:
# Build the engineered train/test frames, the target series and the list of model input columns.
process = ProcessData()

train_df, test_df, target, features = process.preprocess(train, test)

In [None]:
# Sanity check: (rows, columns) of the engineered train and test frames.
train_df.shape, test_df.shape

((30557, 3181), (16136, 3181))

# **Modelling**

In [None]:
class PARAM:
    """Constants for cross-validation and the LightGBM model."""

    SEED = 1901     # random_state for the fold splitter
    n_splits = 10   # number of cross-validation folds

    # LightGBM training parameters. The original literal listed "metric" twice
    # ('auc', then 'rmse'); Python keeps the last value, so only 'rmse' ever took
    # effect. The dead 'auc' entry is removed; key order and values are unchanged.
    lgbmParams = {
        'num_leaves': 100,
        'min_data_in_leaf': 40,
        'objective': 'regression',
        'max_depth': -1,
        'learning_rate': 0.05,
        "boosting": "gbdt",
        "feature_fraction": 0.35,
        "metric": 'rmse',
        "lambda_l1": 1,
        "lambda_l2": 1,
        "random_state": 6,
        "verbosity": -1,
        'num_iterations': 2200,
    }
        
        
# Shuffled, seeded stratified splitter shared by every fold of the training loop.
skfolds = StratifiedKFold(
    n_splits=PARAM.n_splits,
    shuffle=True,
    random_state=PARAM.SEED,
)

In [None]:
# Cross-validated LightGBM training.
#   train_preds: out-of-fold predictions, one per training row
#   test_preds : mean over folds of the (non-negative) test predictions
target = train_df['target']
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])

# Bin the continuous target into (up to) 10 quantile bins so it can be stratified on.
split_y = pd.qcut(target, q=10, labels=False, duplicates='drop')

# The test matrix is the same for every fold, so build it once.
X_test = test_df[features]

for fold_, (trn_idx, val_idx) in enumerate(skfolds.split(train_df.values, split_y)):
    print(50*'-')
    print(f'Fold {fold_+1} / {PARAM.n_splits}' )

    X_trn = train_df.iloc[trn_idx][features]
    X_val = train_df.iloc[val_idx][features]
    y_trn = target.iloc[trn_idx]
    y_val = target.iloc[val_idx]

    trn_data = lgb.Dataset(X_trn, y_trn)
    val_data = lgb.Dataset(X_val, y_val)

    clf = lgb.train(
        PARAM.lgbmParams,
        trn_data,
        valid_sets=[trn_data, val_data],
        verbose_eval=200,
        early_stopping_rounds=200,
    )

    # Out-of-fold prediction for the rows held out in this fold.
    predTrain = clf.predict(X_val, num_iteration=clf.best_iteration)
    train_preds[val_idx] = predTrain
    print(f"RMSE : {rmse(y_val, predTrain)}")

    # Clamp negative test predictions to zero before adding them to the running sum.
    predTest = clf.predict(X_test, num_iteration=clf.best_iteration)
    predTest = np.maximum(predTest, 0)
    test_preds += predTest
    print(50*'-')

test_preds = test_preds / PARAM.n_splits
print(f"Train RMSE : {rmse(target, train_preds)}")

--------------------------------------------------
Fold 1 / 10
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 12.2382	valid_1's rmse: 29.3352
[400]	training's rmse: 7.47945	valid_1's rmse: 28.7278
[600]	training's rmse: 5.10429	valid_1's rmse: 28.5744
[800]	training's rmse: 3.6782	valid_1's rmse: 28.4944
[1000]	training's rmse: 2.72762	valid_1's rmse: 28.4596
[1200]	training's rmse: 2.06812	valid_1's rmse: 28.4308
[1400]	training's rmse: 1.59971	valid_1's rmse: 28.4218
[1600]	training's rmse: 1.24622	valid_1's rmse: 28.4151
[1800]	training's rmse: 0.977779	valid_1's rmse: 28.4083
[2000]	training's rmse: 0.770764	valid_1's rmse: 28.4035
[2200]	training's rmse: 0.610665	valid_1's rmse: 28.4035
Did not meet early stopping. Best iteration is:
[2200]	training's rmse: 0.610665	valid_1's rmse: 28.4035
RMSE : 28.40352412141118
--------------------------------------------------
--------------------------------------------------
Fold 2 / 10
Training until v

In [None]:
# `predictions1` aliases `test_preds`; negatives are clamped to zero in place.
predictions1 = test_preds
np.clip(predictions1, 0, None, out=predictions1)

# Submission frame: one row per test id with its predicted target.
sub = pd.DataFrame({
    'Place_ID X Date': test_df['Place_ID X Date'],
    'target': predictions1,
})

# Write one copy to the working directory and one next to the input CSVs.
for out_path in ("LGBM_SOLUTION3.csv", f"{csv_path}LGBM_SOLUTION3.csv"):
    sub.to_csv(out_path, index=False)