In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor

import scipy.stats

from sklearn.preprocessing import StandardScaler

import xgboost as xgb

In [2]:
df = pd.read_csv('train.csv')
df_hol = pd.read_csv('holidays_events.csv')
df_oil = pd.read_csv('oil.csv')
df_str = pd.read_csv('stores.csv')
df_trns = pd.read_csv('transactions.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [4]:
df_test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


## Combine train and test data

In [5]:
df.shape

(3000888, 6)

In [6]:
target = df['sales']

In [7]:
testIds = df_test[['id']]

In [8]:
train = df.drop(['id', 'sales'], axis = 1)
test = df_test.drop(['id'], axis = 1)

In [9]:
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data

Unnamed: 0,date,store_nbr,family,onpromotion
0,2013-01-01,1,AUTOMOTIVE,0
1,2013-01-01,1,BABY CARE,0
2,2013-01-01,1,BEAUTY,0
3,2013-01-01,1,BEVERAGES,0
4,2013-01-01,1,BOOKS,0
...,...,...,...,...
3029395,2017-08-31,9,POULTRY,1
3029396,2017-08-31,9,PREPARED FOODS,0
3029397,2017-08-31,9,PRODUCE,1
3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


## Combine each Dataset

### Oil Price

In [10]:
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [11]:
data = pd.merge(data, df_oil, how="left", on=["date", "date"])

In [12]:
data.shape

(3029400, 5)

### Store

In [13]:
df_str.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [14]:
data = pd.merge(data, df_str, how="left", on=["store_nbr", "store_nbr"])

In [15]:
data.shape

(3029400, 9)

### Transactions

In [16]:
df_trns.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [17]:
data = pd.merge(data, df_trns, how="left", left_on=["date", "store_nbr"], right_on = ["date","store_nbr"])

In [18]:
data.loc[~data['transactions'].isna()]

Unnamed: 0,date,store_nbr,family,onpromotion,dcoilwtico,city,state,type,cluster,transactions
561,2013-01-01,25,AUTOMOTIVE,0,,Salinas,Santa Elena,D,1,770.0
562,2013-01-01,25,BABY CARE,0,,Salinas,Santa Elena,D,1,770.0
563,2013-01-01,25,BEAUTY,0,,Salinas,Santa Elena,D,1,770.0
564,2013-01-01,25,BEVERAGES,0,,Salinas,Santa Elena,D,1,770.0
565,2013-01-01,25,BOOKS,0,,Salinas,Santa Elena,D,1,770.0
...,...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,0,47.57,Quito,Pichincha,B,6,2155.0
3000884,2017-08-15,9,PREPARED FOODS,1,47.57,Quito,Pichincha,B,6,2155.0
3000885,2017-08-15,9,PRODUCE,148,47.57,Quito,Pichincha,B,6,2155.0
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,8,47.57,Quito,Pichincha,B,6,2155.0


In [19]:
data.shape

(3029400, 10)

## Feature Engineer

### Date => Day, Day of the Week, Month and Year

In [20]:
data.dtypes

date             object
store_nbr         int64
family           object
onpromotion       int64
dcoilwtico      float64
city             object
state            object
type             object
cluster           int64
transactions    float64
dtype: object

In [21]:
data['date'] =  pd.to_datetime(data['date'])

In [22]:
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['day_of_week'] = data['date'].dt.day_name()

In [23]:
data.head()

Unnamed: 0,date,store_nbr,family,onpromotion,dcoilwtico,city,state,type,cluster,transactions,year,month,day,day_of_week
0,2013-01-01,1,AUTOMOTIVE,0,,Quito,Pichincha,D,13,,2013,1,1,Tuesday
1,2013-01-01,1,BABY CARE,0,,Quito,Pichincha,D,13,,2013,1,1,Tuesday
2,2013-01-01,1,BEAUTY,0,,Quito,Pichincha,D,13,,2013,1,1,Tuesday
3,2013-01-01,1,BEVERAGES,0,,Quito,Pichincha,D,13,,2013,1,1,Tuesday
4,2013-01-01,1,BOOKS,0,,Quito,Pichincha,D,13,,2013,1,1,Tuesday


In [24]:
data.shape

(3029400, 14)

### Holiday => Depens on: Locale

In [25]:
def workDay(row):
    if row['day_of_week'] in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']:
        return 1.0
    else:
        return 0.0

In [26]:
data['workDay'] = data.apply(lambda row: workDay(row), axis=1)

In [27]:
data['Events'] = np.zeros(data.shape[0])
data['Holiday'] = np.zeros(data.shape[0])

In [28]:
data.loc[data['workDay'] == 0.0, 'Holiday'] = 1.0

In [29]:
u, c = np.unique(df_hol['date'], return_counts=True)
dup = u[c > 1]
dup

array(['2012-06-25', '2012-07-03', '2012-12-22', '2012-12-24',
       '2012-12-31', '2013-05-12', '2013-06-25', '2013-07-03',
       '2013-12-22', '2014-06-25', '2014-07-03', '2014-12-22',
       '2014-12-26', '2015-06-25', '2015-07-03', '2015-12-22',
       '2016-04-21', '2016-05-01', '2016-05-07', '2016-05-08',
       '2016-05-12', '2016-06-25', '2016-07-03', '2016-07-24',
       '2016-11-12', '2016-12-22', '2017-04-14', '2017-06-25',
       '2017-07-03', '2017-12-08', '2017-12-22'], dtype=object)

In [30]:
index=[32,35,39,86,151,156,205,245,291,305,344,265]
df_hol.drop(index=index, inplace=True)

In [31]:
u, c = np.unique(df_hol['date'], return_counts=True)
dup = u[c > 1]
for date in dup:
    display(df_hol.loc[df_hol['date'] == date])

Unnamed: 0,date,type,locale,locale_name,description,transferred
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
10,2012-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False
11,2012-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
54,2013-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
55,2013-05-12,Event,National,Ecuador,Dia de la Madre,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
58,2013-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
59,2013-06-25,Holiday,Local,Machala,Fundacion de Machala,False
60,2013-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
61,2013-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
62,2013-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
110,2014-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
111,2014-06-25,Holiday,Local,Machala,Fundacion de Machala,False
112,2014-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
113,2014-06-25,Event,National,Ecuador,Mundial de futbol Brasil: Ecuador-Francia,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
118,2014-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
119,2014-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
176,2015-06-25,Holiday,Local,Machala,Fundacion de Machala,False
177,2015-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
178,2015-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
179,2015-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
180,2015-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
224,2016-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
225,2016-04-21,Event,National,Ecuador,Terremoto Manabi+5,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
235,2016-05-01,Holiday,National,Ecuador,Dia del Trabajo,False
236,2016-05-01,Event,National,Ecuador,Terremoto Manabi+15,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
242,2016-05-07,Additional,National,Ecuador,Dia de la Madre-1,False
243,2016-05-07,Event,National,Ecuador,Terremoto Manabi+21,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
249,2016-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo,False
250,2016-05-12,Event,National,Ecuador,Terremoto Manabi+26,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
258,2016-06-25,Holiday,Local,Machala,Fundacion de Machala,False
259,2016-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
260,2016-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
261,2016-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
262,2016-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
282,2016-11-12,Holiday,Local,Ambato,Independencia de Ambato,False
283,2016-11-12,Work Day,National,Ecuador,Recupero Puente Dia de Difuntos,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
315,2017-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
316,2017-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga,False
317,2017-06-25,Holiday,Local,Machala,Fundacion de Machala,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
318,2017-07-03,Holiday,Local,El Carmen,Cantonizacion de El Carmen,False
319,2017-07-03,Holiday,Local,Santo Domingo,Fundacion de Santo Domingo,False


Unnamed: 0,date,type,locale,locale_name,description,transferred
341,2017-12-08,Holiday,Local,Loja,Fundacion de Loja,False
342,2017-12-08,Transfer,Local,Quito,Traslado Fundacion de Quito,False


In [32]:
df_hol['transferred'] = df_hol['transferred'].astype('bool')

In [33]:
for index, row in df_hol.iterrows():
    if row['type'] == 'Event':
        if row['locale'] == 'National':
            data.loc[((data['date'] == row['date'])), 'Events'] = 1.0
        else:
            data.loc[((data['date'] == row['date']) & 
                      ((data['city'] == row['locale_name']) | (data['state'] == row['locale_name']))), 'Events'] = 1.0
            
    elif row['type'] == 'Work Day':
        if row['locale'] == 'National':
            data.loc[((data['date'] == row['date'])), 'workDay'] = 1.0
        else:
            data.loc[((data['date'] == row['date']) & 
                      ((data['city'] == row['locale_name']) | (data['state'] == row['locale_name']))), 'workDay'] = 1.0
            
    elif ((row['type'] == 'Holiday') & ~(row['transferred'])):
        if row['locale'] == 'National':
            data.loc[((data['date'] == row['date'])), 'Holiday'] = 1.0
            data.loc[((data['date'] == row['date'])), 'workDay'] = 0.0
        else:
            data.loc[((data['date'] == row['date']) & 
                      ((data['city'] == row['locale_name']) | (data['state'] == row['locale_name']))), 'Holiday'] = 1.0
            data.loc[((data['date'] == row['date']) & 
                      ((data['city'] == row['locale_name']) | (data['state'] == row['locale_name']))), 'workDay'] = 0.0
            
    else:
        if row['locale'] == 'National':
            data.loc[((data['date'] == row['date'])), 'Holiday'] = 1.0
            data.loc[((data['date'] == row['date'])), 'workDay'] = 0.0
        else:
            data.loc[((data['date'] == row['date']) & 
                      ((data['city'] == row['locale_name']) | (data['state'] == row['locale_name']))), 'Holiday'] = 1.0
            data.loc[((data['date'] == row['date']) & 
                      ((data['city'] == row['locale_name']) | (data['state'] == row['locale_name']))), 'workDay'] = 0.0
        

In [35]:
data.loc[((data['date'] == '2013-06-25') & ((data['state'] == 'Latacunga') | (data['city'] == 'Machala') | (data['state'] == 'Imbabura')))]

Unnamed: 0,date,store_nbr,family,onpromotion,dcoilwtico,city,state,type,cluster,transactions,year,month,day,day_of_week,workDay,Events,Holiday
312048,2013-06-25,15,AUTOMOTIVE,0,95.25,Ibarra,Imbabura,C,15,1469.0,2013,6,25,Tuesday,0.0,0.0,1.0
312049,2013-06-25,15,BABY CARE,0,95.25,Ibarra,Imbabura,C,15,1469.0,2013,6,25,Tuesday,0.0,0.0,1.0
312050,2013-06-25,15,BEAUTY,0,95.25,Ibarra,Imbabura,C,15,1469.0,2013,6,25,Tuesday,0.0,0.0,1.0
312051,2013-06-25,15,BEVERAGES,0,95.25,Ibarra,Imbabura,C,15,1469.0,2013,6,25,Tuesday,0.0,0.0,1.0
312052,2013-06-25,15,BOOKS,0,95.25,Ibarra,Imbabura,C,15,1469.0,2013,6,25,Tuesday,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313033,2013-06-25,41,POULTRY,0,95.25,Machala,El Oro,D,4,804.0,2013,6,25,Tuesday,0.0,0.0,1.0
313034,2013-06-25,41,PREPARED FOODS,0,95.25,Machala,El Oro,D,4,804.0,2013,6,25,Tuesday,0.0,0.0,1.0
313035,2013-06-25,41,PRODUCE,0,95.25,Machala,El Oro,D,4,804.0,2013,6,25,Tuesday,0.0,0.0,1.0
313036,2013-06-25,41,SCHOOL AND OFFICE SUPPLIES,0,95.25,Machala,El Oro,D,4,804.0,2013,6,25,Tuesday,0.0,0.0,1.0


## Feature selection

In [36]:
data.shape

(3029400, 17)

In [37]:
data.dtypes

date            datetime64[ns]
store_nbr                int64
family                  object
onpromotion              int64
dcoilwtico             float64
city                    object
state                   object
type                    object
cluster                  int64
transactions           float64
year                     int64
month                    int64
day                      int64
day_of_week             object
workDay                float64
Events                 float64
Holiday                float64
dtype: object

In [38]:
data.describe()

Unnamed: 0,store_nbr,onpromotion,dcoilwtico,cluster,transactions,year,month,day,workDay,Events,Holiday
count,3029400.0,3029400.0,2093850.0,3029400.0,2755104.0,3029400.0,3029400.0,3029400.0,3029400.0,3029400.0,3029400.0
mean,27.5,2.64383,67.71437,8.481481,1694.602,2014.858,6.224706,15.70412,0.676939,0.03235294,0.3260022
std,15.58579,12.33287,25.61957,4.649735,963.281,1.355346,3.374138,8.7976,0.4676459,0.1769357,0.4687482
min,1.0,0.0,26.19,1.0,5.0,2013.0,1.0,1.0,0.0,0.0,0.0
25%,14.0,0.0,46.4,4.0,1046.0,2014.0,3.0,8.0,0.0,0.0,0.0
50%,27.5,0.0,53.19,8.5,1393.0,2015.0,6.0,16.0,1.0,0.0,0.0
75%,41.0,0.0,95.71,13.0,2079.0,2016.0,9.0,23.0,1.0,0.0,1.0
max,54.0,741.0,110.62,17.0,8359.0,2017.0,12.0,31.0,1.0,1.0,1.0


In [39]:
data.describe(exclude=[np.number])  

  data.describe(exclude=[np.number])


Unnamed: 0,date,family,city,state,type,day_of_week
count,3029400,3029400,3029400,3029400,3029400,3029400
unique,1700,33,22,16,5,7
top,2013-01-01 00:00:00,AUTOMOTIVE,Quito,Pichincha,D,Tuesday
freq,1782,91800,1009800,1065900,1009800,434808
first,2013-01-01 00:00:00,,,,,
last,2017-08-31 00:00:00,,,,,


In [40]:
data.isnull().sum() / data.shape[0]

date            0.000000
store_nbr       0.000000
family          0.000000
onpromotion     0.000000
dcoilwtico      0.308824
city            0.000000
state           0.000000
type            0.000000
cluster         0.000000
transactions    0.090545
year            0.000000
month           0.000000
day             0.000000
day_of_week     0.000000
workDay         0.000000
Events          0.000000
Holiday         0.000000
dtype: float64

In [41]:
data = data.drop(columns= ['date'])

## Taking care of missing data

### Categorical Features

In [42]:
data.select_dtypes(np.object_).loc[:, data.isna().sum() > 0].columns

Index([], dtype='object')

### Numerical Features

In [43]:
data.select_dtypes(np.number).loc[:, data.isna().sum() > 0].columns

Index(['dcoilwtico', 'transactions'], dtype='object')

In [44]:
data['dcoilwtico'] = data.ffill(axis=0)['dcoilwtico']

In [45]:
data['dcoilwtico'] = data.bfill(axis=0)['dcoilwtico']

In [46]:
def knn_impute (df, column_na):
  df = df.copy()

  numeric_df = df.select_dtypes(np.number)
  non_na_columns = numeric_df.loc[: , numeric_df.isna().sum() == 0].columns

  y_train = numeric_df.loc[numeric_df[column_na].isna() == False, column_na]
  X_train = numeric_df.loc[numeric_df[column_na].isna() == False, non_na_columns]
  X_test = numeric_df.loc[numeric_df[column_na].isna() == True, non_na_columns]

  knn = KNeighborsRegressor()
  knn.fit(X_train, y_train)

  y_pred = knn.predict(X_test)

  df.loc[df[column_na].isna() == True, column_na] = y_pred

  return df

In [47]:
data = knn_impute(data, 'transactions')

In [48]:
data.select_dtypes(np.number).loc[:, data.isna().sum() > 0].columns

Index([], dtype='object')

## Feature Transformation

### Transform numeric features with skew normal distribution

In [49]:
skew_df = pd.DataFrame(data.select_dtypes(np.number).columns, columns=['Feature'])
skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(data[feature]))
skew_df['Absolute Skew'] = skew_df['Skew'].apply(abs)
skew_df['Skewed'] = skew_df['Absolute Skew'].apply(lambda x: True if x>=0.5 else False)
skew_df

Unnamed: 0,Feature,Skew,Absolute Skew,Skewed
0,store_nbr,0.0,0.0,False
1,onpromotion,11.153277,11.153277,True
2,dcoilwtico,0.322032,0.322032,False
3,cluster,0.040087,0.040087,False
4,transactions,1.574619,1.574619,True
5,year,0.086465,0.086465,False
6,month,0.099317,0.099317,False
7,day,0.011912,0.011912,False
8,workDay,-0.756722,0.756722,True
9,Events,5.286069,5.286069,True


In [50]:
data[skew_df.query("Skewed == True")['Feature'].values].describe()

Unnamed: 0,onpromotion,transactions,workDay,Events,Holiday
count,3029400.0,3029400.0,3029400.0,3029400.0,3029400.0
mean,2.64383,1664.612,0.676939,0.03235294,0.3260022
std,12.33287,944.5123,0.4676459,0.1769357,0.4687482
min,0.0,5.0,0.0,0.0,0.0
25%,0.0,1037.0,0.0,0.0,0.0
50%,0.0,1364.0,1.0,0.0,0.0
75%,0.0,2029.0,1.0,0.0,1.0
max,741.0,8359.0,1.0,1.0,1.0


In [51]:
for column in ['onpromotion', 'transactions']:
  data[column] = np.log1p(data[column])

## Enconding Categorical

In [52]:
data = pd.get_dummies(data)
data

Unnamed: 0,store_nbr,onpromotion,dcoilwtico,cluster,transactions,year,month,day,workDay,Events,...,type_C,type_D,type_E,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,1,0.000000,93.14,13,7.655391,2013,1,1,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
1,1,0.000000,93.14,13,7.655391,2013,1,1,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
2,1,0.000000,93.14,13,7.655391,2013,1,1,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
3,1,0.000000,93.14,13,7.655391,2013,1,1,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
4,1,0.000000,93.14,13,7.655391,2013,1,1,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,9,0.693147,47.26,6,7.524561,2017,8,31,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3029396,9,0.000000,47.26,6,7.524561,2017,8,31,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3029397,9,0.693147,47.26,6,7.524561,2017,8,31,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3029398,9,2.302585,47.26,6,7.509993,2017,8,31,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0


## Scaling

In [53]:
scaler = StandardScaler()
scaler.fit(data)
data = pd.DataFrame(scaler.transform(data), index=data.index, columns=data.columns)

## Split train and test data

In [54]:
train_final = data.loc[:train.index.max(), :].copy()
test_final = data.loc[train.index.max()+1:, :].reset_index(drop=True).copy()

In [55]:
test_final.shape

(28512, 94)

## Training Model

In [56]:
regressor = xgb.XGBRegressor()
regressor.fit(train_final, target)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [57]:
final_predictions = regressor.predict(test_final)

In [58]:
final_predictions

array([ -11.924587,  -16.293266,  -30.061104, ..., 1395.9269  ,
        118.04518 ,    8.974746], dtype=float32)

## Make Submission

In [59]:
submission = pd.concat([testIds, pd.Series(final_predictions, name='sales')], axis=1)

In [62]:
submission.loc[submission['sales']<0, 'sales'] = 0

In [63]:
submission.to_csv('./submission.csv', index=False, header=True)