In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e1/sample_submission.csv
/kaggle/input/playground-series-s5e1/train.csv
/kaggle/input/playground-series-s5e1/test.csv


In [63]:
!pip install -q xgboost optuna
print('done')

done


In [82]:
%%time
import numpy as np, pandas as pd, xgboost as xgb, optuna
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

CPU times: user 46 µs, sys: 1 µs, total: 47 µs
Wall time: 51 µs


In [100]:
train = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')
train.head(2)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0


## **data info**

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB


In [24]:
test.head(2)

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle


In [25]:
print(f"{train.shape}\n{test.shape}")

(230130, 6)
(98550, 5)


In [26]:
for col in train.columns.to_list():
    print(f"{col} : {train[col].isna().sum()}")

id : 0
date : 0
country : 0
store : 0
product : 0
num_sold : 8871


In [27]:
for col in test.columns.to_list():
    print(f"{col} : {test[col].isna().sum()}")

id : 0
date : 0
country : 0
store : 0
product : 0


In [28]:
for col in ['country', 'store', 'product']:
    print(train[col].value_counts(), sep='\n')

country
Canada       38355
Finland      38355
Italy        38355
Kenya        38355
Norway       38355
Singapore    38355
Name: count, dtype: int64
store
Discount Stickers       76710
Stickers for Less       76710
Premium Sticker Mart    76710
Name: count, dtype: int64
product
Holographic Goose     46026
Kaggle                46026
Kaggle Tiers          46026
Kerneler              46026
Kerneler Dark Mode    46026
Name: count, dtype: int64


## **data preprocessing**

In [101]:
for col in ['country', 'store', 'product']:
    train[col] = LabelEncoder().fit_transform(train[col])
    test[col] = LabelEncoder().fit_transform(test[col])
print('done')

done


In [102]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
print('done')

done


In [103]:
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day

train = train.drop('date',axis=1)
test = test.drop('date',axis=1)
print('done')

done


In [104]:
train['date'] = (train['year'] + train['month'] + train['day'])/3
test['date'] = (test['year'] + test['month'] + test['day'])/3

train = train.drop(['year', 'month', 'day'],axis=1)
test = test.drop(['year', 'month', 'day'],axis=1)

print('done')

done


In [105]:
train['num_sold'] = train['num_sold'].fillna(train['num_sold'].mean())
train = train.drop('id', axis=1)
X_test = test.iloc[:, test.columns!='id']
print('done')

done


In [107]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train.iloc[:, train.columns != 'num_sold'],
    train['num_sold'],
    test_size=0.2
)
print('done')

done


## **models**

In [108]:
boost = xgb.XGBRegressor()
forest = RandomForestRegressor()
print('done')

done


In [109]:
boost.fit(X_train, y_train)
forest.fit(X_train, y_train)
print('done')

done


In [110]:
print(
    f"boosting: {mean_squared_error(boost.predict(X_valid), y_valid)}\n"
    f"forest: {mean_squared_error(forest.predict(X_valid), y_valid)}"
)

boosting: 29970.4225993219
forest: 30860.638179113423


## **submission**

In [111]:
preds = boost.predict(X_test)
print('done')

done


In [112]:
output = pd.DataFrame({'id': test['id'],
                       'num_sold': preds})
output.to_csv('submission.csv', index=False)
print('done')

done
