In [None]:
# Install the Kaggle library
!pip install kaggle

# Make a directory for Kaggle and move the kaggle.json file there
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Set permissions for the kaggle.json file
!chmod 600 ~/.kaggle/kaggle.json



In [None]:
# Download the dataset
!kaggle competitions download -c store-sales-time-series-forecasting

# Unzip the downloaded files
!unzip store-sales-time-series-forecasting.zip

Downloading store-sales-time-series-forecasting.zip to /content
  0% 0.00/21.4M [00:00<?, ?B/s] 56% 12.0M/21.4M [00:00<00:00, 117MB/s]
100% 21.4M/21.4M [00:00<00:00, 145MB/s]
Archive:  store-sales-time-series-forecasting.zip
  inflating: holidays_events.csv     
  inflating: oil.csv                 
  inflating: sample_submission.csv   
  inflating: stores.csv              
  inflating: test.csv                
  inflating: train.csv               
  inflating: transactions.csv        


In [None]:
# Step 0: Import Libraries
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import numpy as np
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Step 1: Load Data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Step 2: Feature Engineering
# Convert date to datetime
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# Extracting year, month, day, and day_of_week
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['day_of_week'] = train['date'].dt.dayofweek

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['day_of_week'] = test['date'].dt.dayofweek

# Convert categorical columns to numeric using Label Encoding
label_encoder = LabelEncoder()
train['family'] = label_encoder.fit_transform(train['family'])
test['family'] = label_encoder.transform(test['family'])

# Step 3: Prepare Training Data
# Define features and target
X = train.drop(['id', 'date', 'sales'], axis=1)  # Drop sales and non-features
y = train['sales']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Set Parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',  # Using RMSE as the metric
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Step 5: Create the LightGBM Model
model = lgb.LGBMRegressor(**params)

# Step 6: Fit the Model on the Training Data
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse'
)

# Step 7: Make Predictions on the Validation Set
y_pred = model.predict(X_val)

# Step 8: Evaluate the Model using RMSLE
# Note: RMSLE cannot handle negative values
y_pred = np.maximum(0, y_pred)  # Ensure predictions are non-negative
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
print(f'RMSLE: {rmsle}')

# Step 9: Prepare Submission
X_test = test.drop(['id', 'date'], axis=1)  # Drop id and date for prediction
test_preds = model.predict(X_test)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],
    'sales': np.maximum(0, test_preds)  # Ensure no negative sales predictions
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

# Check submission file
print(submission.head())

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



RMSLE: 2.407784213495835
        id        sales
0  3000888    49.091944
1  3000889    49.091944
2  3000890   191.178956
3  3000891  2751.867901
4  3000892   146.846532
