In [68]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = 10, 6

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        136950 non-null  int64 
 1   date      136950 non-null  object
 2   country   136950 non-null  object
 3   store     136950 non-null  object
 4   product   136950 non-null  object
 5   num_sold  136950 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.3+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27375 entries, 0 to 27374
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       27375 non-null  int64 
 1   date     27375 non-null  object
 2   country  27375 non-null  object
 3   store    27375 non-null  object
 4   product  27375 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


In [5]:
train

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49
...,...,...,...,...,...,...
136945,136945,2021-12-31,Spain,Kagglazon,Using LLMs to Improve Your Coding,700
136946,136946,2021-12-31,Spain,Kagglazon,Using LLMs to Train More LLMs,752
136947,136947,2021-12-31,Spain,Kagglazon,Using LLMs to Win Friends and Influence People,111
136948,136948,2021-12-31,Spain,Kagglazon,Using LLMs to Win More Kaggle Competitions,641


In [8]:
# Assuming `dates` is a pandas Series of datetime objects from 2017-01-01 to 2022-01-01
dates = pd.to_datetime(train['date'])
df = pd.DataFrame(dates, columns=['date'])

# Step 1: Extract datetime components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek  # Monday=0, Sunday=6

# Step 2: Encode cyclical features
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

In [79]:
def smape(forecasts, actuals):
    """
    Calculate the Symmetric Mean Absolute Percentage Error (sMAPE)
    between two lists or arrays of forecast and actual values.
    """
    # Convert inputs to NumPy arrays for vectorized operations
    forecasts, actuals = np.array(forecasts), np.array(actuals)
    # Calculate the sMAPE
    denominator = np.abs(actuals) + np.abs(forecasts)
    diff = np.abs(forecasts - actuals) * 2 / denominator
    diff[denominator == 0] = 0  # Handle the case where both forecast and actual are zero
    return 100 * np.mean(diff)

# Example usage


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             136950 non-null  datetime64[ns]
 1   year             136950 non-null  int64         
 2   month            136950 non-null  int64         
 3   day              136950 non-null  int64         
 4   day_of_week      136950 non-null  int64         
 5   month_sin        136950 non-null  float64       
 6   month_cos        136950 non-null  float64       
 7   day_sin          136950 non-null  float64       
 8   day_cos          136950 non-null  float64       
 9   day_of_week_sin  136950 non-null  float64       
 10  day_of_week_cos  136950 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(4)
memory usage: 11.5 MB


In [74]:
# Split the train data into train & evl sets & 
# keep test data as test set
X = train.drop(['id', 'num_sold', 'date'], axis=1)
X['year']=df['year']-2017
X['day']=df['day']
X['month']=df['month']
y = train['num_sold']
# Use ColumnTransformer by the column types
numeric_cols = X.select_dtypes(include='number').columns

# Fill in missing values and make them equal to the median, then standardize them
numeric_transformer = Pipeline(steps=[     
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Transform categorical variable into something readable by ML algorithms via One Hot encoding
# percentile=50 means only the top 50% of features by score will be kept. Higher % reduces dimensionality and complexity
catogorical_cols = X.select_dtypes(include='object').columns
catogorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('selector', SelectPercentile(chi2, percentile=50))
])

In [87]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=0)

In [84]:
# Use transformer when data contains both categorical and numerical values

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', catogorical_transformer, catogorical_cols)
    ]
)

In [85]:
# Define the second classifier (Random Forest)
rf_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor())
])

# Train and evaluate Random Forest


In [67]:
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_eval)
for i in range(len(y_pred)):
    if y_pred[i]<0:
        y_pred[i]=0

smape_value = smape(y_pred, y_eval)
print(f"sMAPE: {smape_value}%")

sMAPE: 18.203312264171792%


In [95]:
xgb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
])
xgb_clf.fit(X_train, y_train)
X_train_transformed = xgb_clf.named_steps['preprocessor'].transform(X_train)
xgb_clf.fit(X_eval, y_eval)
X_test_transformed = xgb_clf.named_steps['preprocessor'].transform(X_eval)

In [97]:
categorical_columns = ['country', 'store', 'product']  # Update this list with your actual categorical columns
for col in categorical_columns:
    X_train[col] = X_train[col].astype('category')
    X_eval[col] = X_eval[col].astype('category')
    
dtrain = xgb.DMatrix(X_train_transformed, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test_transformed, label=y_eval, enable_categorical=True)

# Specify training parameters
params = {
    'max_depth': 10,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3  # the number of classes that exist in this dataset
}

# Specify the number of training iterations
num_round = 100



# Train the model
bst = xgb.train(params, dtrain)

# Make predictions
preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

XGBoostError: [08:02:53] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0750514818a16474a-1\xgboost\xgboost-ci-windows\src\objective\multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).

In [41]:
df

Unnamed: 0,date,year,month,day,day_of_week,month_sin,month_cos,day_sin,day_cos,day_of_week_sin,day_of_week_cos
0,2017-01-01,2017,1,1,6,5.000000e-01,0.866025,2.012985e-01,0.97953,-0.781831,0.623490
1,2017-01-01,2017,1,1,6,5.000000e-01,0.866025,2.012985e-01,0.97953,-0.781831,0.623490
2,2017-01-01,2017,1,1,6,5.000000e-01,0.866025,2.012985e-01,0.97953,-0.781831,0.623490
3,2017-01-01,2017,1,1,6,5.000000e-01,0.866025,2.012985e-01,0.97953,-0.781831,0.623490
4,2017-01-01,2017,1,1,6,5.000000e-01,0.866025,2.012985e-01,0.97953,-0.781831,0.623490
...,...,...,...,...,...,...,...,...,...,...,...
136945,2021-12-31,2021,12,31,4,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.433884,-0.900969
136946,2021-12-31,2021,12,31,4,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.433884,-0.900969
136947,2021-12-31,2021,12,31,4,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.433884,-0.900969
136948,2021-12-31,2021,12,31,4,-2.449294e-16,1.000000,-2.449294e-16,1.00000,-0.433884,-0.900969


In [48]:
# Split the train data into train & evl sets & 
# keep test data as test set
X = train.drop(['id', 'num_sold', 'date'], axis=1)
X['year']=df['year']-2017
X['day']=df['day']
X['month']=df['month']
y = train['num_sold']

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=0)

In [49]:
X_train

Unnamed: 0,country,store,product,year,day,month
115128,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,4,16,3
3737,Spain,Kaggle Learn,Using LLMs to Win Friends and Influence People,0,19,2
134723,Canada,Kaggle Store,Using LLMs to Win More Kaggle Competitions,4,2,12
25613,Estonia,Kaggle Store,Using LLMs to Win More Kaggle Competitions,0,8,12
3700,Canada,Kagglazon,Using LLMs to Improve Your Coding,0,19,2
...,...,...,...,...,...,...
41993,Spain,Kaggle Store,Using LLMs to Win More Kaggle Competitions,1,14,7
97639,Spain,Kaggle Learn,Using LLMs to Write Better,3,25,7
95939,Argentina,Kagglazon,Using LLMs to Write Better,3,3,7
117952,Japan,Kaggle Store,Using LLMs to Win Friends and Influence People,4,22,4


In [None]:
# Fit the model to the training data
lasso = Lasso(alpha=1.0)
# Define the classifier (logistic regression for categorical predictions, otherwise linear regression)
lr_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lasso)
])

# Train and evaluate Logistic Regression
lr_clf.fit(X_train, y_train)

sMAPE: 63.42177948423738%


In [None]:
alpha_values = np.linspace(0.01, 1, 20)  # Adjust the range and number of values as needed

# Initialize variables to store the best alpha and its corresponding sMAPE
best_alpha = None
best_smape = np.inf
lr_clf.fit(X_train, y_train)
# Loop over alpha values to find the best one
for alpha in alpha_values:
    lasso = Lasso(alpha=alpha)
    lr_clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', lasso)
    ])
    lr_clf.fit(X_train, y_train)
    predictions = lr_clf.predict(X_eval)

    # Calculate sMAPE
    current_smape = smape(y_eval, predictions)

    # Update best_alpha if current model is better
    if current_smape < best_smape:
        best_smape = current_smape
        best_alpha = alpha

print(f"Best Alpha: {best_alpha}")
print(f"Best sMAPE: {best_smape}%")

In [None]:
# Predict on the test set
y_pred = lr_clf.predict(X_eval)
for i in range(len(y_pred)):
    if y_pred[i]<0:
        y_pred[i]=0

