In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# dataframe
df_features = pd.read_csv('features.csv')
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_stores = pd.read_csv('stores.csv')

# set column name to lower 
df_stores.columns = df_stores.columns.str.lower()
df_features.columns = df_features.columns.str.lower()
df_train.columns = df_train.columns.str.lower()
df_test.columns =df_test.columns.str.lower()

In [2]:
# converting date(object) to date(datetime)
df_train["date"] = pd.to_datetime(df_train["date"])
df_features["date"] = pd.to_datetime(df_features["date"])
df_test["date"] = pd.to_datetime(df_test["date"])

df_train['week_no'] = df_train['date'].dt.isocalendar().week

In [3]:
del df_features['markdown1']
del df_features['markdown2']
del df_features['markdown3']
del df_features['markdown4']
del df_features['markdown5']
del df_train['date']

In [4]:
# training and validation datasets
n = int(len(df_train))
n_val = int(n*0.2)
n_test = int(n*0.2)
n_train = n - (n_val+ n_test)


# making random sample
indx = np. arange(n)
np.random.shuffle(indx)


# Selecting Data
sales_df_train = df_train.iloc[indx[:n_train]]
sales_df_val = df_train.iloc[indx[n_train:(n_train+n_val)]]
sales_df_test = df_train.iloc[indx[(n_train+n_val):]]

In [5]:
# reset_index(drop = TRUE)
sales_df_test.reset_index(drop = True)
sales_df_val.reset_index(drop = True)
sales_df_train.reset_index(drop = True)

Unnamed: 0,store,dept,weekly_sales,isholiday,week_no
0,14,56,2232.25,False,31
1,16,13,12451.39,False,4
2,26,60,88.00,False,29
3,33,74,353.83,False,33
4,14,54,90.68,False,45
...,...,...,...,...,...
252937,27,35,5756.00,False,18
252938,2,30,4789.64,False,14
252939,41,22,10171.46,False,28
252940,22,98,166.10,False,15


In [6]:
# setting the target value for model

y_test = np.log1p(sales_df_test.weekly_sales.values)
y_train = np.log1p(sales_df_train.weekly_sales.values)
y_val = np.log1p(sales_df_val.weekly_sales.values)

  y_test = np.log1p(sales_df_test.weekly_sales.values)
  y_test = np.log1p(sales_df_test.weekly_sales.values)
  y_train = np.log1p(sales_df_train.weekly_sales.values)
  y_train = np.log1p(sales_df_train.weekly_sales.values)
  y_val = np.log1p(sales_df_val.weekly_sales.values)
  y_val = np.log1p(sales_df_val.weekly_sales.values)


In [7]:
# handling 'inf value'
np.sum(np.isinf(y_train))
inf_mask = np.isinf(y_train)

# replace 'inf' with 'nan' value
y_train[inf_mask] = np.nan

# calculating mean of the array
mean_value = np.nanmean(y_train)
print(mean_value)

# replacing 'nan' value with mean_value
y_train = np.where(np.isnan(y_train), mean_value, y_train)
print("'nan' value after replacing it with mean_value :", np.sum(np.isnan(y_train)))

8.52710222492856
'nan' value after replacing it with mean_value : 0


In [8]:
# handling 'inf value'
np.sum(np.isinf(y_val))
inf_mask = np.isinf(y_val)

# replace 'inf' with 'nan' value
y_val[inf_mask] = np.nan

# calculating mean of the array
mean_value = np.nanmean(y_val)
print(mean_value)

# replacing 'nan' value with mean_value
y_val = np.where(np.isnan(y_val), mean_value, y_val)
print("'nan' value after replacing it with mean_value :", np.sum(np.isnan(y_val)))


8.527212917611429
'nan' value after replacing it with mean_value : 0


In [9]:
del sales_df_train['weekly_sales']
del sales_df_test['weekly_sales']
del sales_df_val['weekly_sales']

In [10]:
sales_df_train.iloc[10]

store          39
dept            5
isholiday    True
week_no        36
Name: 361784, dtype: object

## Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import mean_squared_error

In [12]:
# One-Hot Incoding
dv = DictVectorizer(sparse = False)

train_dict = sales_df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = sales_df_val.to_dict(orient='records')
X_val = dv.fit_transform(val_dict)

In [13]:
# Initiate model
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)

# Training model
gbr.fit(X_train, y_train)

In [14]:
# make prediction
y_pred = gbr.predict(X_val)
y_pred_actual = np.expm1(y_pred)
y_val_actual = np.expm1(y_val)

In [15]:
# evaluate model

mse = mean_squared_error(y_pred_actual, y_val_actual)
rsme = np.sqrt(mse)

In [16]:
print(rsme)

17844.939884873842


## Xgboost

In [17]:
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

In [23]:
sales_df_train.isnull().sum()

store        0
dept         0
isholiday    0
week_no      0
dtype: int64

In [31]:
features = dv.feature_names_

In [36]:
# Convert dataset to DMatrix (optimized data structure for XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names = features )
dval = xgb.DMatrix(X_val, label=y_val, feature_names = features)

In [39]:
# Set parameters for regression
params = {
    'objective': 'reg:squarederror',  # Use a regression objective
    'learning_rate': 0.1,
    'max_depth': 3,
}

In [40]:
# Convert dataset to DMatrix (optimized data structure for XGBoost)
dtrain = xgb.DMatrix(dtrain, label=y_train)
dtest = xgb.DMatrix(dval, label=y_test)

TypeError: Not supported type for data.<class 'xgboost.core.DMatrix'>