In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

%matplotlib inline 

In [None]:
train=pd.read_csv("train.csv")

In [None]:
holiday_events=pd.read_csv('holidays_events.csv')
oil=pd.read_csv('oil.csv')
stores=pd.read_csv('stores.csv')
transactions=pd.read_csv('transactions.csv')

In [None]:
train.head()

In [None]:
oil.head()

In [None]:
stores.head()

In [None]:
transactions.head()

In [None]:
train_full = train \
   .merge(holiday_events, how = 'left', on = ['date']).rename(columns={'type':'holiday_type'}) \
  .merge(stores, how = 'left', on = ['store_nbr']).rename(columns={'type':'city_type'}) \
  .merge(transactions, how = 'left', on = ['store_nbr','date']) \
  .merge(oil, how = 'left', on = ['date'] )

In [None]:
test=pd.read_csv("/content/test.csv")

In [None]:
test_full = test \
  .merge(holiday_events, how = 'left', on = ['date']).rename(columns={'type':'holiday_type'}) \
  .merge(stores, how = 'left', on = ['store_nbr']).rename(columns={'type':'city_type'}) \
  .merge(transactions, how = 'left', on = ['store_nbr','date']) \
  .merge(oil, how = 'left', on = ['date'] )

In [None]:
# Parse the merged frame's own 'date' column. Building the values from
# train['date'] would assign them by row-index alignment with the un-merged
# frame, which is only correct while the merges keep row count and order.
train_full['date'] = pd.to_datetime(train_full['date'])
# Display only: set_index returns a new frame and the result is not assigned,
# so train_full keeps 'date' as a regular column.
train_full.set_index('date')

In [None]:
class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a configured set of columns from a DataFrame.

    Parameters
    ----------
    by : int, default=1
        Stored on the instance but not read by ``fit`` or ``transform``; kept
        so existing constructor calls keep working.
    columns : str or list-like of column labels, default=None
        Columns to drop. ``None`` or an empty collection drops nothing.
    """

    def __init__(self, by=1, columns=None):
        self.by = by
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the configured columns.

        ``X`` itself is left untouched. ``DataFrame.drop`` raises ``KeyError``
        when a configured column is not present in ``X``.
        """
        cols_to_drop = self.columns
        # Nothing configured: pass the data through unchanged instead of
        # dropping every column. The explicit None/len test also works for
        # array-like `columns`, whose truth value is ambiguous.
        if cols_to_drop is None or len(cols_to_drop) == 0:
            return X.copy()
        return X.drop(columns=cols_to_drop)

class BoolTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that converts the values of selected columns to strings.

    Parameters
    ----------
    by : int, default=1
        Stored on the instance but not read by ``fit`` or ``transform``; kept
        so existing constructor calls keep working.
    columns : list of column labels, default=None
        Columns to convert. When ``None`` or empty, every column of the input
        is converted.
    """

    def __init__(self, by=1, columns=None):
        self.by = by
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with each selected column passed through ``str``.

        Every value is converted individually, so missing values become the
        literal string ``'nan'`` rather than staying missing.
        """
        cols_to_transform = self.columns if self.columns else X.columns
        # Work on a copy so the caller's DataFrame is not modified in place.
        X = X.copy()
        for col in cols_to_transform:
            X[col] = X[col].apply(str)
        return X


In [None]:
# Exploration pipeline: drop the listed columns (including the 'sales'
# target), then convert 'transferred' to strings.
prepare_df_pipe = Pipeline(steps=[
    (
        "drop columns",
        DropColumnTransformer(columns=['description', 'locale_name', 'id', 'sales']),
    ),
    (
        "bool columns",
        BoolTransformer(columns=['transferred']),
    ),
])
prepare_df_pipe.fit_transform(train_full)

In [None]:
train_full.isna().sum()/len(train_full)

In [None]:
train_full['holiday_type'].fillna('None').value_counts()

In [None]:
prepare_df_pipe.fit_transform(train_full).isna().sum()

In [None]:
train_full['dcoilwtico'].fillna('None').value_counts()

In [None]:
# Run the preparation pipeline once and reuse the result; the original
# expression executed fit_transform twice over the whole training frame.
prepared_train = prepare_df_pipe.fit_transform(train_full)
# Labels of the columns whose dtype is not 'object'.
prepared_train.columns[prepared_train.dtypes != 'object']

In [None]:
# holiday_type, locale, transferred, city_type:
# fill gaps with the constant 'Other', then one-hot encode; categories not
# seen during fit are ignored at transform time.
none_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# city, family:
# fill gaps with the most frequent value, then ordinal-encode; categories not
# seen during fit are encoded as -1.
# NOTE(review): per the scikit-learn docs fill_value is only used with
# strategy='constant', so 'missing' appears to have no effect here -- confirm.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# dcoilwtico:
# fill gaps with the constant -1, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ('scaler', StandardScaler()),
])


In [None]:
# Route each group of columns to its preprocessing pipeline. Columns that are
# not listed here (for example 'date' and 'dcoilwtico') are not passed to the
# model: ColumnTransformer's default is remainder='drop'.
preprocessor = ColumnTransformer([
      ('holiday_type, locale, transferred, city_type', none_pipe, ['holiday_type', 'locale', 'city_type', 'transferred'])
    , ('city, family', cat_pipe, ['family'])
    , ('store_nbr, onpromotion, cluster, transactions', num_pipe, ['store_nbr', 'onpromotion', 'cluster', 'transactions'])
])

# Full modelling pipeline: drop unused columns, stringify 'transferred',
# preprocess, then fit the forest.
pipe = Pipeline(
    steps=[
        ("drop columns", DropColumnTransformer(columns=['description','locale_name', 'id', 'city']))
      , ("bool columns", BoolTransformer(columns=['transferred']))
      , ("preprocessor", preprocessor)
        # Fixed random_state so repeated fits (and the resulting submission)
        # are reproducible; without it every run grows different trees.
      , ("model", RandomForestRegressor(n_estimators=20, verbose=1, n_jobs = -1, random_state=42))
    ]
)


In [None]:
X = train_full.drop('sales', axis =1)
y = train_full['sales']

In [None]:
pipe.fit(X, y)


In [None]:
pipe.predict(test_full)

In [None]:
# Read from the working directory, like train.csv and the lookup tables,
# instead of a hard-coded absolute '/content/...' path.
sample = pd.read_csv("sample_submission.csv")
sample.head()

In [None]:
submission = pd.DataFrame(test_full.id, columns = ['id'])
submission['sales'] = pipe.predict(test_full)

In [None]:
submission.to_csv('submission.csv', index = False)