In [None]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
plt.style.use('fivethirtyeight')
import xgboost as xgb
print("xgboost", xgb.__version__)
from xgboost import plot_importance, plot_tree
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the raw avocado price dataset from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer="data/avocado.csv")

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched.
data = df.copy(deep=True)

In [None]:
# 'Unnamed: 0' is dropped here; it carries no feature used further down.
data = data.drop(columns=['Unnamed: 0'])

In [None]:
# Peek at the Date column before it is converted with pd.to_datetime.
data["Date"]

In [None]:
# Summary of column dtypes and non-null counts for the working copy.
data.info()

In [None]:
# Descriptive statistics for every column, numeric and non-numeric alike.
data.describe(include="all")

In [None]:
# Parse the date strings once, then derive the calendar features from the
# parsed values.
parsed_dates = pd.to_datetime(data["Date"])
data["Date"] = parsed_dates
data["month"] = parsed_dates.dt.month
data["day"] = parsed_dates.dt.day


In [None]:


# Average price over all rows sharing the same date.
# The price column is selected *before* aggregating: at this point the frame
# still holds the string columns `type` and `region`, and calling .mean() on
# the whole frame raises a TypeError on pandas >= 2.0 (numeric_only no longer
# defaults to True). Selecting first also avoids running the groupby twice.
mean_price_by_date = data.groupby('Date')['AveragePrice'].mean()

scatter = go.Scatter(x = mean_price_by_date.index, y = mean_price_by_date, name = 'avg price')

layout = go.Layout(title = 'Time series plot for mean daily prices for all regions', xaxis ={'title':'Date'}, yaxis = {'title':'Prices'})
figure = go.Figure(data = [scatter], layout = layout)
iplot(figure)

In [None]:
# Normalise column names: lower-case first, then spaces become underscores.

rename_columns = [name.lower() for name in data.columns]
data.columns = rename_columns
data = data.rename(columns={name: name.replace(' ', '_') for name in rename_columns})

In [None]:
# Display the frame to check the renamed columns.
data

In [None]:



# Transforming categorical values

def label_enconcode_pre(df):
    """Integer-encode a column when its dtype is ``object``.

    Meant to be used with ``DataFrame.apply``, so ``df`` is a single column
    (a Series) despite its name.

    Returns the result of ``LabelEncoder().fit_transform`` for ``object``
    columns; every other column is handed back unchanged.
    """
    is_object_column = df.dtype == 'object'
    return LabelEncoder().fit_transform(df) if is_object_column else df

# Run the encoder over every column; non-object columns pass through as-is.
data = data.apply(label_enconcode_pre)

In [None]:
# One-hot encode the categorical features, dropping the first level of each.
categorical_columns = ['year', 'type', 'region']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [None]:
# Index by date so the chronological split below can compare against the index.
data = data.set_index(keys='date')

In [None]:
split_date = '2017-07-30'
# Chronological hold-out: rows dated up to and including split_date form the
# training set, rows after it form the test set.
# .copy() makes each split an independent frame, so adding the prediction
# column to data_test later does not trigger SettingWithCopyWarning.
data_train = data.loc[data.index <= split_date].copy()
data_test = data.loc[data.index > split_date].copy()

# Aggregate only the price column (once per split) instead of averaging every
# column twice just to read one of them.
train_mean_price = data_train.groupby('date')['averageprice'].mean()
test_mean_price = data_test.groupby('date')['averageprice'].mean()

scatter = go.Scatter(x = train_mean_price.index, y = train_mean_price, name = 'train')
scatter2 = go.Scatter(x = test_mean_price.index, y = test_mean_price, name = 'test')

layout = go.Layout(title = 'Time series plot for mean daily prices for all regions', xaxis ={'title':'Date'}, yaxis = {'title':'Prices'})
figure = go.Figure(data = [scatter, scatter2], layout = layout)
iplot(figure)

In [None]:
# Separate the target column from the features for both splits.
target_column = 'averageprice'

X_train, y_train = data_train.drop(columns=[target_column]), data_train[target_column]
X_test, y_test = data_test.drop(columns=[target_column]), data_test[target_column]

### XGBoost model 

In [None]:


# early_stopping_rounds is given to the constructor: passing it to fit() is
# deprecated since xgboost 1.6 and rejected by recent 2.x releases.
# NOTE(review): the test split doubles as the early-stopping validation set
# (the last entry of eval_set), so metrics reported on it are optimistic.
# Consider a separate validation window carved out of the training period.
reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False) # Change verbose to True if you want to see it train

In [None]:

# Show the 15 highest-ranked features of the fitted model.
plot_importance(booster=reg, max_num_features=15, height=0.8)

In [None]:
# assign() returns a new frame instead of writing into a slice of `data`,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
data_test = data_test.assign(averageprice_prediction=reg.predict(X_test))
# Training rows have no prediction, so that column is NaN for them.
data_all = pd.concat([data_test, data_train], sort = False)

In [None]:
# Actual vs. predicted price side by side, ordered by date.
comparison_columns = ['averageprice', 'averageprice_prediction']
data_all.loc[:, comparison_columns].sort_values(by='date')

In [None]:
# Per-date means of the observed price (all rows) and of the forecast
# (defined for test rows only).
actual_by_date = data.groupby('date')['averageprice'].mean()
forecast_by_date = data_all.groupby('date')['averageprice_prediction'].mean()

scatter = go.Scatter(x=actual_by_date.index, y=actual_by_date, name='actual')
scatter2 = go.Scatter(x=forecast_by_date.index, y=forecast_by_date, name='forecast')

layout = go.Layout(
    title='XGBoost time series forecast',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Prices'},
)
figure = go.Figure(data=[scatter, scatter2], layout=layout)
iplot(figure)

### Metrics

In [None]:
# Error of the forecast on the held-out test rows.
observed_price = data_test['averageprice']
predicted_price = data_test['averageprice_prediction']

mse = mean_squared_error(y_true=observed_price, y_pred=predicted_price)
mae = mean_absolute_error(y_true=observed_price, y_pred=predicted_price)

print("MAE:", mae, "MSE:", mse)

### Fine-tuning the model

In [None]:
eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["mae"]
%time reg.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True)

In [None]:
# Tuned model: lower learning rate and deeper trees than the baseline.
# early_stopping_rounds is given to the constructor: passing it to fit() is
# deprecated since xgboost 1.6 and rejected by recent 2.x releases.
# NOTE(review): early stopping monitors the test split (last eval_set entry),
# so the metrics printed below are optimistic; a separate validation window
# would give an unbiased estimate.
reg = xgb.XGBRegressor(n_estimators=1000, learning_rate = 0.09, max_depth = 8,
                       early_stopping_rounds=50)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False) # Change verbose to True if you want to see it train

# assign() builds a new frame rather than writing into a slice of `data`,
# avoiding SettingWithCopyWarning and keeping the cell safe to re-run.
data_test = data_test.assign(averageprice_prediction=reg.predict(X_test))
data_all = pd.concat([data_test, data_train], sort = False)

# Aggregate only the columns that are plotted, once each.
actual_by_date = data.groupby('date')['averageprice'].mean()
forecast_by_date = data_all.groupby('date')['averageprice_prediction'].mean()

scatter = go.Scatter(x = actual_by_date.index, y = actual_by_date, name = 'actual')
scatter2 = go.Scatter(x = forecast_by_date.index, y = forecast_by_date, name = 'forecast')

layout = go.Layout(title = 'XGBoost time series forecast', xaxis ={'title':'Date'}, yaxis = {'title':'Prices'})
figure = go.Figure(data = [scatter, scatter2], layout = layout)
iplot(figure)


mse = mean_squared_error(y_true=data_test['averageprice'], y_pred=data_test['averageprice_prediction'])
mae = mean_absolute_error(y_true=data_test['averageprice'], y_pred=data_test['averageprice_prediction'])

print("MAE:", mae, "MSE:",mse)