In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [67]:
# Let's explore the data we have here

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
stores = pd.read_csv("./stores.csv")
transactions = pd.read_csv("./transactions.csv")
h_days = pd.read_csv("./holidays_events.csv")
oil = pd.read_csv("./oil.csv")

In [68]:
train.head(5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [69]:
def merge(df, m_df, on = 'date'):
 df = pd.merge(how = 'left', on = on, left = df, right = m_df)

 return df

train = merge(train, oil)
train = merge(train, h_days)
train = merge(train, stores, "store_nbr")

test = merge(test, oil)
test = merge(test, h_days)
test = merge(test, stores, "store_nbr")

train = train.drop(['store_nbr'], axis = 1)
test = test.drop(['store_nbr'], axis = 1)

In [70]:
check = pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
               utc=True)

def split_date(df):
 df['date'] = pd.to_datetime(df['date'])

 df['day_of_week'] = df['date'].apply(lambda x: x.day_of_week)
 df['month'] = df['date'].apply(lambda x: x.month)
 df['year'] = df['date'].apply(lambda x: x.year)
 df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x > 5 else 0)
 df['quarter'] = df['date'].apply(lambda x: x.quarter)

 return df

train = split_date(train)
test = split_date(test)

In [71]:
train.head(5)

Unnamed: 0,id,date,family,sales,onpromotion,dcoilwtico,type_x,locale,locale_name,description,transferred,city,state,type_y,cluster,day_of_week,month,year,is_weekend,quarter
0,0,2013-01-01,AUTOMOTIVE,0.0,0,,Holiday,National,Ecuador,Primer dia del ano,False,Quito,Pichincha,D,13,1,1,2013,0,1
1,1,2013-01-01,BABY CARE,0.0,0,,Holiday,National,Ecuador,Primer dia del ano,False,Quito,Pichincha,D,13,1,1,2013,0,1
2,2,2013-01-01,BEAUTY,0.0,0,,Holiday,National,Ecuador,Primer dia del ano,False,Quito,Pichincha,D,13,1,1,2013,0,1
3,3,2013-01-01,BEVERAGES,0.0,0,,Holiday,National,Ecuador,Primer dia del ano,False,Quito,Pichincha,D,13,1,1,2013,0,1
4,4,2013-01-01,BOOKS,0.0,0,,Holiday,National,Ecuador,Primer dia del ano,False,Quito,Pichincha,D,13,1,1,2013,0,1


In [72]:
def encode(df):
 le = LabelEncoder()
 cols_to_encode = ["family", "state", "type_y", "locale", "locale_name", "transferred"]

 for c in cols_to_encode:
  df[c] = le.fit_transform(df[c])
 
 return df

train = encode(train)
test = encode(test)

In [73]:
train['description'].unique()

array(['Primer dia del ano', nan, 'Recupero puente Navidad',
       'Recupero puente primer dia del ano', 'Carnaval',
       'Fundacion de Manta', 'Provincializacion de Cotopaxi',
       'Fundacion de Cuenca', 'Cantonizacion de Libertad',
       'Cantonizacion de Riobamba', 'Viernes Santo', 'Dia del Trabajo',
       'Dia de la Madre-1', 'Cantonizacion del Puyo', 'Dia de la Madre',
       'Batalla de Pichincha', 'Cantonizacion de Guaranda',
       'Provincializacion de Imbabura', 'Fundacion de Machala',
       'Cantonizacion de Latacunga', 'Cantonizacion de El Carmen',
       'Fundacion de Santo Domingo', 'Cantonizacion de Cayambe',
       'Fundacion de Guayaquil-1', 'Fundacion de Guayaquil',
       'Fundacion de Esmeraldas', 'Primer Grito de Independencia',
       'Fundacion de Riobamba', 'Fundacion de Ambato',
       'Fundacion de Ibarra', 'Cantonizacion de Quevedo',
       'Independencia de Guayaquil',
       'Traslado Independencia de Guayaquil', 'Dia de Difuntos',
       'Independe

In [74]:
train.corr().style.background_gradient("YlOrBr")

Unnamed: 0,id,family,sales,onpromotion,dcoilwtico,locale,locale_name,transferred,state,type_y,cluster,day_of_week,month,year,is_weekend,quarter
id,1.0,1.1e-05,0.086102,0.206032,-0.839036,-0.049972,-0.053151,-0.047792,7.8e-05,-0.000113,-4.4e-05,0.005678,0.066683,0.977741,0.004358,0.065949
family,1.1e-05,1.0,-0.113986,-0.047216,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0
sales,0.086102,-0.113986,1.0,0.428241,-0.079393,-0.01164,-0.013896,-0.01443,0.068988,-0.095699,0.038537,0.037359,0.02025,0.08132,0.039271,0.018752
onpromotion,0.206032,-0.047216,0.428241,1.0,-0.15246,-0.017057,-0.019368,-0.017828,0.013109,-0.008256,0.005702,-0.000538,0.0256,0.198913,-0.014103,0.02486
dcoilwtico,-0.839036,-0.0,-0.079393,-0.15246,1.0,0.037224,0.056684,0.041243,0.0,-0.0,-0.0,-0.000256,0.004121,-0.833249,,0.011183
locale,-0.049972,-0.0,-0.01164,-0.017057,0.037224,1.0,0.865778,0.960054,0.0,-0.0,-0.0,-0.003089,-0.128111,-0.023697,-0.012962,-0.13465
locale_name,-0.053151,-0.0,-0.013896,-0.019368,0.056684,0.865778,1.0,0.922572,0.0,-0.0,0.0,-0.012243,-0.088518,-0.034878,-0.007582,-0.09837
transferred,-0.047792,-0.0,-0.01443,-0.017828,0.041243,0.960054,0.922572,1.0,0.0,-0.0,0.0,-0.010921,-0.126011,-0.022134,-0.010624,-0.131039
state,7.8e-05,-0.0,0.068988,0.013109,0.0,0.0,0.0,0.0,1.0,-0.262662,0.169536,-0.0,-0.0,-0.0,-0.0,-0.0
type_y,-0.000113,-0.0,-0.095699,-0.008256,-0.0,-0.0,-0.0,-0.0,-0.262662,1.0,-0.268419,0.0,0.0,0.0,0.0,0.0
