In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



In [3]:
# Directory holding the competition CSV files. Change this one constant to run
# the notebook somewhere other than the default location instead of editing
# every path below.
DATA_DIR = '/content'

# Load each CSV into its own DataFrame.
holiday_data = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')
oil_data = pd.read_csv(f'{DATA_DIR}/oil.csv')
stores_data = pd.read_csv(f'{DATA_DIR}/stores.csv')
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
transactions_data = pd.read_csv(f'{DATA_DIR}/transactions.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [4]:
# Convert the 'date' column of the holiday, oil and train frames to datetime
# so they can be merged on it.
for dated_frame in (holiday_data, oil_data, train_data):
    dated_frame['date'] = pd.to_datetime(dated_frame['date'])

Data Preparation

In [5]:
# Attach the stores.csv attributes to every training row, matched on store_nbr.
train_data = train_data.merge(stores_data, how='left', on='store_nbr')

In [6]:
# Attach holidays_events.csv records to the training rows that share their date.
# Both the store columns and the holiday columns contain a 'type', so the merge
# suffixes them: 'type_x' (store) and 'type_y' (holiday). Only the holiday one
# is renamed here.
# NOTE(review): a left merge repeats a training row once per matching holiday
# record, so any date listed more than once in holidays_events.csv inflates the
# row count -- confirm this is intended.
train_data = (
    train_data
    .merge(holiday_data, how='left', on='date')
    .rename(columns={'type_y': 'holiday_type'})
)

In [7]:
# Rows without a holiday record have NaN in holiday_type; label them "Normal"
# (kept in case it is needed for visualization).
train_data.loc[train_data['holiday_type'].isna(), 'holiday_type'] = "Normal"

In [8]:
# Classify every row as 'holiday' or 'non-holiday':
#   * a 'Holiday' record flagged as transferred    -> 'non-holiday'
#   * any other record of a listed holiday type    -> 'holiday'
#   * no record ('Normal') on Saturday or Sunday   -> 'holiday'
#   * no record ('Normal') on a weekday            -> 'non-holiday'
#
# The previous row-wise lambda negated the weekend test with `~`. On a Python
# bool that is bitwise NOT (~True == -2, ~False == -1, both truthy), so every
# 'Normal' row -- weekends included -- was labelled 'non-holiday'. Boolean
# Series are used here instead, where `~` is a genuine element-wise NOT, and
# the slow row-by-row apply is no longer needed.
holiday_record_types = ['Holiday', 'Additional', 'Event', 'Transfer', 'Bridge', 'Work Day']

# .eq(True) treats the NaN of rows without a holiday record as "not transferred".
moved_holiday = train_data['holiday_type'].eq('Holiday') & train_data['transferred'].eq(True)
observed_holiday = train_data['holiday_type'].isin(holiday_record_types) & ~moved_holiday

# dayofweek: Monday = 0 ... Saturday = 5, Sunday = 6
normal_weekend = train_data['holiday_type'].eq('Normal') & train_data['date'].dt.dayofweek.ge(5)

train_data['category_type'] = (observed_holiday | normal_weekend).map(
    {True: 'holiday', False: 'non-holiday'}
)

In [9]:
# Work on an independent (deep) copy so train_data itself stays untouched by
# the modelling-specific steps.
train_data_forModel = train_data.copy(deep=True)

In [10]:
# Prepare the oil-price table for merging.
oil_data['date'] = pd.to_datetime(oil_data['date'])

# Day-of-week helper column (Monday = 0, Sunday = 6). It is kept because a
# later cell drops 'day_of_week' by name after the merge.
oil_data['day_of_week'] = oil_data['date'].dt.dayofweek

# Carry the last known price forward over every missing value. A single
# forward fill already covers any Saturday/Sunday rows, so the former
# weekend-only masked fill was redundant and has been removed.
oil_data['dcoilwtico'] = oil_data['dcoilwtico'].ffill()

In [11]:
# Attach the oil.csv columns to each modelling row, matched on date.
train_data_forModel = train_data_forModel.merge(oil_data, how='left', on='date')


In [12]:
# Dates absent from oil.csv come out of the left merge as NaN.
# Fill forward first so a gap takes the last *known* price, consistent with the
# forward fill applied to oil_data; back-fill afterwards only to cover rows at
# the very start that have no earlier price. A bare bfill() would instead copy
# a later day's price into earlier rows.
# NOTE(review): assumes rows are in chronological order -- confirm.
train_data_forModel['dcoilwtico'] = train_data_forModel['dcoilwtico'].ffill().bfill()

In [13]:
# Remove every column that is not fed to the model. Note that this includes
# 'dcoilwtico', so the oil price is not used as a feature.
unused_train_columns = [
    'id', 'date', 'city', 'state', 'type_x', 'cluster', 'description',
    'locale', 'locale_name', 'transferred', 'holiday_type', 'day_of_week',
    'dcoilwtico',
]
train_data_forModel = train_data_forModel.drop(columns=unused_train_columns)

In [14]:
# Preview the first five rows of the reduced training frame.
train_data_forModel.head(5)

Unnamed: 0,store_nbr,family,sales,onpromotion,category_type
0,1,AUTOMOTIVE,0.0,0,holiday
1,1,BABY CARE,0.0,0,holiday
2,1,BEAUTY,0.0,0,holiday
3,1,BEVERAGES,0.0,0,holiday
4,1,BOOKS,0.0,0,holiday


In [15]:
# One-hot encode the two categorical columns; the remaining columns pass
# through unchanged.
train_data_forModel = pd.get_dummies(
    data=train_data_forModel,
    columns=['family', 'category_type'],
)

In [16]:
# Cast the boolean columns (and only those) to integers.
train_bool_cols = train_data_forModel.select_dtypes('bool').columns
train_data_forModel[train_bool_cols] = train_data_forModel[train_bool_cols].astype(int)


In [17]:
# Preview the first five rows after one-hot encoding.
train_data_forModel.head(5)

Unnamed: 0,store_nbr,sales,onpromotion,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,...,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,category_type_holiday,category_type_non-holiday
0,1,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
# Preview the first five rows of the raw test set.
test_data.head(5)

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [19]:
# Work on an independent (deep) copy so test_data keeps its original columns.
test_data_forModel = test_data.copy(deep=True)

In [20]:
# Convert the test 'date' column to datetime so it can be merged with the
# holiday table.
test_data_forModel = test_data_forModel.assign(
    date=pd.to_datetime(test_data_forModel['date'])
)

In [21]:
# Attach holiday records to the test rows that share their date.
test_data_forModel = test_data_forModel.merge(holiday_data, how='left', on='date')

In [22]:
# Give the holiday 'type' column the same name it has in the training frame.
test_data_forModel = test_data_forModel.rename(columns={'type': 'holiday_type'})

In [23]:
# Rows without a holiday record have NaN in holiday_type; label them "Normal"
# (kept in case it is needed for visualization).
test_data_forModel.loc[test_data_forModel['holiday_type'].isna(), 'holiday_type'] = "Normal"

In [24]:
# Classify every test row as 'holiday' or 'non-holiday':
#   * a 'Holiday' record flagged as transferred    -> 'non-holiday'
#   * any other record of a listed holiday type    -> 'holiday'
#   * no record ('Normal') on Saturday or Sunday   -> 'holiday'
#   * no record ('Normal') on a weekday            -> 'non-holiday'
#
# The previous row-wise lambda negated the weekend test with `~`. On a Python
# bool that is bitwise NOT (~True == -2, ~False == -1, both truthy), so every
# 'Normal' row -- weekends included -- was labelled 'non-holiday'. Boolean
# Series are used here instead, where `~` is a genuine element-wise NOT.
holiday_record_types = ['Holiday', 'Additional', 'Event', 'Transfer', 'Bridge', 'Work Day']

# .eq(True) treats the NaN of rows without a holiday record as "not transferred".
moved_holiday = (
    test_data_forModel['holiday_type'].eq('Holiday')
    & test_data_forModel['transferred'].eq(True)
)
observed_holiday = test_data_forModel['holiday_type'].isin(holiday_record_types) & ~moved_holiday

# dayofweek: Monday = 0 ... Saturday = 5, Sunday = 6
normal_weekend = (
    test_data_forModel['holiday_type'].eq('Normal')
    & test_data_forModel['date'].dt.dayofweek.ge(5)
)

test_data_forModel['category_type'] = (observed_holiday | normal_weekend).map(
    {True: 'holiday', False: 'non-holiday'}
)

In [25]:
# Drop the raw date/holiday columns now that category_type has been derived,
# then display the result.
unused_test_columns = [
    'description', 'date', 'holiday_type', 'locale', 'locale_name', 'transferred',
]
test_data_forModel = test_data_forModel.drop(columns=unused_test_columns)
test_data_forModel

Unnamed: 0,id,store_nbr,family,onpromotion,category_type
0,3000888,1,AUTOMOTIVE,0,non-holiday
1,3000889,1,BABY CARE,0,non-holiday
2,3000890,1,BEAUTY,2,non-holiday
3,3000891,1,BEVERAGES,20,non-holiday
4,3000892,1,BOOKS,0,non-holiday
...,...,...,...,...,...
28507,3029395,9,POULTRY,1,non-holiday
28508,3029396,9,PREPARED FOODS,0,non-holiday
28509,3029397,9,PRODUCE,1,non-holiday
28510,3029398,9,SCHOOL AND OFFICE SUPPLIES,9,non-holiday


In [26]:
# One-hot encode the same two categorical columns as in the training frame.
test_data_forModel = pd.get_dummies(
    data=test_data_forModel,
    columns=['family', 'category_type'],
)

In [27]:
# Cast the boolean columns (and only those) to integers.
test_bool_cols = test_data_forModel.select_dtypes('bool').columns
test_data_forModel[test_bool_cols] = test_data_forModel[test_bool_cols].astype(int)


In [28]:
# Feature columns in the order the model is trained on.
order_columns= ['store_nbr', 'onpromotion','family_AUTOMOTIVE',
       'family_BABY CARE', 'family_BEAUTY', 'family_BEVERAGES', 'family_BOOKS',
       'family_BREAD/BAKERY', 'family_CELEBRATION', 'family_CLEANING',
       'family_DAIRY', 'family_DELI', 'family_EGGS', 'family_FROZEN FOODS',
       'family_GROCERY I', 'family_GROCERY II', 'family_HARDWARE',
       'family_HOME AND KITCHEN I', 'family_HOME AND KITCHEN II',
       'family_HOME APPLIANCES', 'family_HOME CARE', 'family_LADIESWEAR',
       'family_LAWN AND GARDEN', 'family_LINGERIE', 'family_LIQUOR,WINE,BEER',
       'family_MAGAZINES', 'family_MEATS', 'family_PERSONAL CARE',
       'family_PET SUPPLIES', 'family_PLAYERS AND ELECTRONICS',
       'family_POULTRY', 'family_PREPARED FOODS', 'family_PRODUCE',
       'family_SCHOOL AND OFFICE SUPPLIES', 'family_SEAFOOD',
       'category_type_holiday', 'category_type_non-holiday']

# Select and order the feature columns. A dummy column that never occurs in the
# test set (for example when no test row falls in one category) is created by
# reindex; fill_value=0 makes it a proper "absent" one-hot value instead of NaN.
test_data_forModel = test_data_forModel.reindex(columns=order_columns, fill_value=0)

In [29]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt

In [30]:
# Prediction target: the 'sales' column.
y = train_data_forModel['sales']

# Feature matrix: every remaining column. Displayed as the cell output.
X_train_col = train_data_forModel.drop(columns='sales')
X_train_col


Unnamed: 0,store_nbr,onpromotion,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,family_CLEANING,...,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,category_type_holiday,category_type_non-holiday
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,9,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3054344,9,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3054345,9,148,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3054346,9,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [31]:
# Fit a random forest on the full training matrix.
# random_state pins the result; n_jobs=-1 only spreads the tree building over
# all available cores.
rf_model = RandomForestRegressor(random_state=1, n_jobs=-1)
rf_model.fit(X_train_col, y)

# Predict on exactly the training feature columns, in training order. This
# keeps the cell re-runnable after a later cell adds an 'id' column to
# test_data_forModel; an extra column would otherwise fail the feature-name
# check in predict().
predictions = rf_model.predict(test_data_forModel[X_train_col.columns])


In [36]:
# Append the original test ids as a trailing column (matched on the row index)
# and preview the result.
test_data_forModel = test_data_forModel.assign(id=test_data['id'])
test_data_forModel.head(5)

Unnamed: 0,store_nbr,onpromotion,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,family_CLEANING,...,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,category_type_holiday,category_type_non-holiday,id
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3000888
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3000889
2,1,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3000890
3,1,20,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3000891
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,3000892


In [37]:
# Build the submission from the original test ids and the predictions.
# Ids are read from test_data itself rather than from the feature frame, so the
# result does not depend on an index-aligned 'id' column being added there.
# The explicit length check fails loudly if the number of predicted rows no
# longer matches the test set (for example if a merge duplicated rows), instead
# of silently writing missing or misaligned ids.
if len(predictions) != len(test_data):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(test_data)} test rows; "
        "the feature frame no longer lines up with test_data."
    )

submission = pd.DataFrame({'id': test_data['id'].to_numpy(), 'sales': predictions})
submission.to_csv('submission.csv', index=False)