In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


In [2]:
# functions
def optimize_dataframe(df):
    optimized_df = pd.DataFrame()

    for col in df.columns:
        col_data = df[col]
        dtype = col_data.dtype

        if dtype == object:
            # If dtype is object, convert to category if less than 50% unique
            num_unique_values = len(col_data.unique())
            num_total_values = len(col_data)
            if num_unique_values / num_total_values < 0.5:
                optimized_df[col] = col_data.astype('category')
            else:
                optimized_df[col] = col_data
        elif dtype == int:
            # If dtype is int, check if it can be converted to smaller int dtype
            if col_data.min() >= 0:
                if col_data.max() < 2**8:
                    optimized_df[col] = col_data.astype('uint8')
                elif col_data.max() < 2**16:
                    optimized_df[col] = col_data.astype('uint16')
                elif col_data.max() < 2**32:
                    optimized_df[col] = col_data.astype('uint32')
                else:
                    optimized_df[col] = col_data.astype('uint64')
            else:
                if col_data.min() > np.iinfo(np.int8).min and col_data.max() < np.iinfo(np.int8).max:
                    optimized_df[col] = col_data.astype('int8')
                elif col_data.min() > np.iinfo(np.int16).min and col_data.max() < np.iinfo(np.int16).max:
                    optimized_df[col] = col_data.astype('int16')
                elif col_data.min() > np.iinfo(np.int32).min and col_data.max() < np.iinfo(np.int32).max:
                    optimized_df[col] = col_data.astype('int32')
                else:
                    optimized_df[col] = col_data.astype('int64')
        elif dtype == float:
            # If dtype is float, check if it can be converted to smaller float dtype
            if col_data.min() > np.finfo(np.float16).min and col_data.max() < np.finfo(np.float16).max:
                optimized_df[col] = col_data.astype('float16')
            elif col_data.min() > np.finfo(np.float32).min and col_data.max() < np.finfo(np.float32).max:
                optimized_df[col] = col_data.astype('float32')
            else:
                optimized_df[col] = col_data.astype('float64')
        else:
            optimized_df[col] = col_data

    return optimized_df

def ffill_numerical(df):
    for col in df.columns:
        col_data = df[col]
        dtype = col_data.dtype
    
        if np.issubdtype(dtype, np.number):
            df[col] = df[col].fillna(method = 'ffill',)
    return df

def create_lagged_variables(df, var_list, n_lags):
    for var in var_list:
        for i in range(1, n_lags+1):
            col_name = f"{var}_lag{i}"
            df[col_name] = df[var].shift(i)
    return df


In [3]:
train = pd.read_parquet('finals/train.parquet')
Val = pd.read_parquet('finals/val.parquet')
test = pd.read_parquet('finals/test.parquet')

In [4]:
train[['month_of_year', 'day_of_year', 'day_of_month', 'week_of_year', 'week_of_month']]= train[['month_of_year', 'day_of_year', 'day_of_month', 'week_of_year', 'week_of_month']].astype('category')
Val[['month_of_year', 'day_of_year', 'day_of_month', 'week_of_year', 'week_of_month']]= Val[['month_of_year', 'day_of_year', 'day_of_month', 'week_of_year', 'week_of_month']].astype('category')
test[['month_of_year', 'day_of_year', 'day_of_month', 'week_of_year', 'week_of_month']]= test[['month_of_year', 'day_of_year', 'day_of_month', 'week_of_year', 'week_of_month']].astype('category')

In [5]:
le = LabelEncoder()

y_train = (le.fit_transform(train['Signal14']))
X_train = train.drop(['Signal14', 'Date'], axis=1)

y_val = le.transform(Val['Signal14'])
X_val = train.drop(['Signal14', 'Date'], axis=1)

y_test = le.transform(test['Signal14'])
X_test = train.drop(['Signal14', 'Date'], axis=1)

In [6]:
# Compute class frequencies
unique_classes, counts = np.unique(y_train, return_counts=True)
class_freqs = dict(zip(unique_classes, counts))

# Calculate the inverse of class frequencies
class_weights = {cls: 1.0 / freq for cls, freq in class_freqs.items()}

# Normalize the weights (optional)
total_weight = sum(class_weights.values())
class_weights = {cls: weight / total_weight for cls, weight in class_weights.items()}

# Compute sample weights for the training set
sample_weights = np.array([class_weights[label] for label in y_train])

In [12]:
# Train model
final_model = XGBClassifier(objective ='Multi:softprob', 
                      enable_categorical=True, 
                      tree_method='hist', 
                      n_estimators=10000,
                      colsample_bytree=0.8, 
                      early_stopping_rounds=10, 
                      eval_metric='mlogloss',
                      learning_rate = 0.1,
                      )

final_model.fit(X_train, y_train, 
          eval_set=[(X_train, y_train), (X_val, y_val)],
          sample_weight=sample_weights,
          verbose=10
          )
final_model.save_model('final model 26-04-23.json')

result = final_model.evals_result()
# plt.figure(figsize=(10,7))
plt.style.use("default")
plt.plot(result['validation_0']['mlogloss'], label='Training loss')
plt.plot(result['validation_1']['mlogloss'], label='Validation loss')
plt.axvline(final_model.best_ntree_limit, color='grey', label='Optimal number of trees')
plt.xlabel('Number of trees')
plt.ylabel('loss')
plt.legend()
plt.show()

XGBoostError: [20:38:58] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/data/data.cc:455: Check failed: this->labels.Size() % this->num_row_ == 0 (1494 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x000000013f3e5bc8 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x000000013f4695a4 xgboost::MetaInfo::SetInfoFromHost(xgboost::GenericParameter const&, xgboost::StringView, xgboost::Json) + 732
  [bt] (2) 3   libxgboost.dylib                    0x000000013f46916c xgboost::MetaInfo::SetInfo(xgboost::GenericParameter const&, xgboost::StringView, xgboost::StringView) + 164
  [bt] (3) 4   libxgboost.dylib                    0x000000013f3fa800 XGDMatrixSetInfoFromInterface + 224
  [bt] (4) 5   libffi.8.dylib                      0x000000010131004c ffi_call_SYSV + 76
  [bt] (5) 6   libffi.8.dylib                      0x000000010130d74c ffi_call_int + 1208
  [bt] (6) 7   _ctypes.cpython-311-darwin.so       0x00000001012f08bc _ctypes_callproc + 1232
  [bt] (7) 8   _ctypes.cpython-311-darwin.so       0x00000001012eaa70 PyCFuncPtr_call + 1216
  [bt] (8) 9   python3.11                          0x000000010093f8c4 _PyObject_MakeTpCall + 332

