In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


## Disclaimer: this is my first notebook, so please excuse any mistakes or incorrect assumptions. I have tried to apply everything I have been learning by looking at other people's kernels and reading ML theory. So this first kernel is a mix of visualizations, feature engineering and model comparison.

So lets start and try to predict avocados future prices.

Lets begin by loading the data.

In [None]:
# Read the raw avocado data; the first CSV column is used as the index.
df = pd.read_csv('../input/avocado.csv', index_col=0)

# Peek at the first five rows as a quick sanity check.
df.head()

Lets convert date to datetime type and also type to a category so they can be useful later in the model. And reset the index to avoid any potential duplicates.

In [None]:
# Replace the index read from the CSV with a fresh 0..n-1 index.
df = df.reset_index(drop=True)

# Parse the dates here, as the accompanying text says, so every later cell
# sees real timestamps instead of strings.
df['Date'] = pd.to_datetime(df['Date'])

# Encode `type` as integer category codes. Later cells treat code 0 as
# conventional and code 1 as organic.
df['type'] = df['type'].astype('category').cat.codes

## Visualizations

There are obvious differences between regions/cities in terms of volumes. Some like their guacamole more than others. 

In [None]:
# Leave out the 'TotalUS' rows so only the other regions are compared.
regional_rows = df[df['region'] != 'TotalUS']

# Regions ordered by ascending mean volume; this fixes the bar order below.
region_order = (
    regional_rows.groupby('region')['Total Volume']
    .mean()
    .sort_values()
    .index
)

fig, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation=90)
ax = sns.barplot(
    data=regional_rows,
    x='region',
    y='Total Volume',
    palette='magma',
    order=list(region_order),
)

Not everyone pays the same for organic avocados: southern states (close to Mexico) seem to benefit from better prices. This is not surprising, taking into account that Mexico represents 45% of all the avocado exports in the world. Ay caramba!

In [None]:
# Organic avocados only (type code 1).
organic_rows = df.loc[df['type'].eq(1)]

# Mean price per region, cheapest first; drives the bar order.
sorted_average = (
    organic_rows.groupby('region', as_index=False)['AveragePrice']
    .mean()
    .sort_values('AveragePrice')
)

fig, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation=90)
plt.title('Organic, Average Price')
ax = sns.barplot(
    data=organic_rows,
    x='region',
    y='AveragePrice',
    palette='magma',
    order=sorted_average['region'],
)

The story repeats itself with the conventional type, it is much cheaper for states closer to Mexico.

In [None]:
# Conventional avocados only (type code 0).
conventional_rows = df.loc[df['type'].eq(0)]

# Mean price per region, cheapest first; drives the bar order.
mean_price = conventional_rows.groupby('region')['AveragePrice'].mean()
sorted_average = mean_price.reset_index().sort_values('AveragePrice')

fig, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation=90)
plt.title('Conventional, Average Price')
ax = sns.barplot(
    data=conventional_rows,
    x='region',
    y='AveragePrice',
    palette='magma',
    order=sorted_average['region'],
)

Obvious price differences between organic and conventional, which is no surprise since it costs more to grow organic and this 
type of fruit caters to a premium segment of the market. Except for 2017, prices for conventional avocados have been quite stable on average. 

In [None]:
# Every region except the 'TotalUS' rows.
not_total_us = df['region'].ne('TotalUS')

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Average Price per year')
# One bar per year, split by avocado type.
g = sns.barplot(data=df[not_total_us], x='year', y='AveragePrice', hue='type', ax=ax)

Year after year the volumes are growing steadily for both kinds of hass avocado. 

In [None]:
# Only the 'TotalUS' rows.
total_us_rows = df.loc[df['region'].eq('TotalUS')]

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Total Volume per year (TotalUS only)')
# Bars show the summed volume per year (estimator=sum), split by type.
g = sns.barplot(
    data=total_us_rows,
    x='year',
    y='Total Volume',
    hue='type',
    estimator=sum,
    ax=ax,
)

# Lets see how Volumes and Average Prices relate to each other for conventional and organic avocados. 

There is a slight tendency for lower prices when volumes are significantly large, which is usually what happens in any market in terms of pricing.  

In [None]:
# Scatter plus linear fit of price against volume, conventional only (type code 0).
conventional_only = df.loc[df['type'].eq(0)]

g = sns.lmplot(
    data=conventional_only,
    x='Total Volume',
    y='AveragePrice',
    fit_reg=True,
    height=8,
    aspect=1.2,
)
fig = g.fig
fig.suptitle("Conventional: Volume vs. Average Price")
plt.show()

In [None]:
# Scatter plus linear fit of price against volume, organic only (type code 1).
organic_only = df.loc[df['type'].eq(1)]

g = sns.lmplot(
    data=organic_only,
    x='Total Volume',
    y='AveragePrice',
    fit_reg=True,
    height=8,
    aspect=1.2,
)
fig = g.fig
fig.suptitle("Organic: Volume vs. Average Price")
plt.show()

# Now lets see how the features correlate with each other.

All the numeric features correlate highly with each other and none with the label (AveragePrice). Type is moderately correlated to AveragePrice which was evident by the differences in prices between conventional and organic avocados which was displayed in the visualizations. 

In [None]:
# Correlate numeric columns only. The frame still holds text columns such as
# `region`, and pandas >= 2.0 raises on those unless numeric_only=True is given.
sns.clustermap(df.corr(numeric_only=True), center=0, cmap="vlag", annot=True, linewidths=.75, figsize=(13, 13));

Finally, lets do a pairing of all the numeric features in order to clearly visualize the above results. 

In [None]:
# Pairwise view of the features: histograms on the diagonal,
# scatter plots everywhere else.
g = sns.PairGrid(df)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter);

# Models

Visualizations are always nice but lets get to the reason we are all here. Lets start with Random Forest which seems to be the more suitable algorithm for this job. 

Lets convert our region feature so we can use it in the model. We will keep the region column for later. 

In [None]:
# One-hot encode a copy of `region` (as fregion_* columns) so the original
# `region` column stays available for later cells.
df = pd.get_dummies(df.assign(fregion=df['region']), columns=['fregion'])
df.head()

# Lets train the data and score it. 

Nice, the result makes it seem like this is the right model to use.

In [None]:
# Columns that must not be used as predictors: the target itself plus the
# raw date and region columns.
drop_list = ['AveragePrice', 'Date', 'region']

X = df.drop(columns=drop_list)
y = df['AveragePrice'].to_numpy()
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(Xtrain, ytrain)

# cross_val_score fits its own clones of `clf` on the training folds. For a
# regressor the default score is R^2, not classification accuracy.
scores = cross_val_score(clf, Xtrain, ytrain, cv=3, n_jobs=-1)

# Format inside the f-string; `round(x, 3) * 100` can print float artefacts
# such as 89.60000000000001.
print(f"Cross-validated R2: {np.mean(scores):.3f}")
print(f"MSE {mean_squared_error(y_true=ytest, y_pred=clf.predict(Xtest))}")

And now let's evaluate the predictions. The Random Forest predictions are off by USD 0.1026 on average. Not bad. Let's also see how the prediction errors look in a histogram. Not too bad: most prediction errors fall between 0 and 20 cents.

In [None]:
# Score the held-out set and look at the size of the misses.
predictions = clf.predict(Xtest)

# Absolute error per test row.
errors = np.abs(predictions - ytest)

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(errors, bins=10, edgecolor='black');

r2 = r2_score(ytest, predictions)
mae = mean_absolute_error(ytest, predictions)
print(f"R2 score: {r2}")
print(f"Mean absolute error:  {mae} USD")

Lets plot the predictions vs reality. 

In [None]:
# Predicted price (x) against actual price (y) with a fitted regression line.
fig, ax = plt.subplots(figsize=(12, 8))
g = sns.regplot(x=predictions, y=ytest, ax=ax)

Lets check out now which features are important for the model. 

In [None]:
# Get numerical feature importances
importances = list(clf.feature_importances_)

feature_list = list(X.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Well, type is an important feature for the model. Again, we were expecting that. But we can also see how Large Bags, 4046, 4225, Total Volume, etc. (basically all the numeric features) carry a decent weight in the model's predictions. And this for me is a **PROBLEM**, since this data is totally dependent on sales activity. We wouldn't have this kind of data if we attempted to predict future prices. Therefore I think these features cannot be considered as part of the solution. So let's try something else.

# New approach

Lets add a few new features and see if they can replace some of these numeric features.  We will add features that would be readily available for future predictions.

Since there are significant differences in prices among different regions lets add a grouping using the Economic Analysis Regions from the Bureau of Economic Analysis (BEA) of the United States Department of Commerce. 

Let's also add seasons, since produce is usually very season-dependent and it might help the model. Let's also extract the week number from the date. And finally, let's convert the date to numeric months.

In [None]:
# City-level markets grouped by BEA economic region.
# Fixes relative to the previous if/elif chain:
#   * 'HartfordSpringfield' was listed under both 'Mideast' and 'New England';
#     the first match won, so the New England entry was unreachable. It now
#     belongs to 'New England' only.
#   * 'Tampa' (Florida) is placed in 'Southeast' instead of 'SouthWest'.
BEA_REGION_MARKETS = {
    'Far West': ['California', 'LasVegas', 'LosAngeles', 'Portland', 'Sacramento',
                 'SanDiego', 'SanFrancisco', 'Seattle', 'Spokane'],
    'Great Lakes': ['Chicago', 'CincinnatiDayton', 'Columbus', 'Detroit',
                    'GrandRapids', 'Indianapolis'],
    'Mideast': ['Albany', 'BaltimoreWashington', 'BuffaloRochester',
                'HarrisburgScranton', 'NewYork', 'Philadelphia', 'Pittsburgh',
                'Syracuse'],
    'New England': ['Boston', 'HartfordSpringfield'],
    'Plains': ['Plains', 'StLouis'],
    'Rocky Mountains': ['Boise', 'Denver'],
    'Southeast': ['Atlanta', 'Charlotte', 'Jacksonville', 'Louisville',
                  'MiamiFtLauderdale', 'Nashville', 'NewOrleansMobile', 'Orlando',
                  'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Southeast',
                  'Tampa'],
    'SouthWest': ['DallasFtWorth', 'Houston', 'PhoenixTucson'],
}

# Invert to a flat market -> BEA region lookup.
market_to_bea = {
    market: bea_region
    for bea_region, markets in BEA_REGION_MARKETS.items()
    for market in markets
}

# Regions missing from the lookup (GreatLakes, Midsouth, Northeast,
# NorthernNewEngland, SouthCarolina, SouthCentral, TotalUS, West,
# WestTexNewMexico) keep their own name, as before. The fallback also covers
# any region not listed here; previously such a row appended nothing and the
# column assignment failed with a length mismatch.
df['Economic Analysis Region'] = df['region'].map(market_to_bea).fillna(df['region'])

df = pd.get_dummies(df, columns=['Economic Analysis Region'])

In [None]:
# Calendar month (1-12) taken from the parsed date.
df['Date'] = pd.to_datetime(df['Date'])
df['month'] = df['Date'].dt.month

# Meteorological seasons keyed by month number.
season_by_month = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall',
}

df['season'] = df['month'].map(season_by_month)
df = pd.get_dummies(df, columns=['season'])

In [None]:
df['Date'] = pd.to_datetime(df['Date'])

# `Series.dt.week` was removed in pandas 2.0; `dt.isocalendar().week` is the
# replacement. Cast to int64 so the shift/ffill below yields an ordinary
# float column, as the old accessor did.
iso_week = df['Date'].dt.isocalendar().week.astype('int64')

# NOTE(review): shift(-2) moves the values up by two *rows*, not two weeks,
# so each row gets the week number of the row two positions below it
# (whatever region or type that row belongs to). The last two rows are
# forward-filled. Kept as-is to preserve the model inputs; confirm that
# this offset is intended.
df['week'] = iso_week.shift(-2).ffill()

In [None]:
# Preview the first five rows with the engineered columns added.
df.head()

# Train the new model

Lets train the new model dropping the features which are not useful and only considering the new features. 

The result shows a slight increase in the score compared to the previous training. So it seems the new features did some good. Let's see how the predictions fare.

In [None]:
# Drop the target, the raw date/region columns, and every sales-volume
# feature, leaving only columns that are not in this list.
drop_list = ['AveragePrice', 'Date', '4046', '4225', '4770', 'Total Bags', 'Total Volume', 'region', 'Small Bags', 'Large Bags', 'XLarge Bags']

X = df.drop(columns=drop_list)
y = df['AveragePrice'].to_numpy()
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(Xtrain, ytrain)

# cross_val_score fits its own clones of `clf` on the training folds. For a
# regressor the default score is R^2, not classification accuracy.
scores = cross_val_score(clf, Xtrain, ytrain, cv=3, n_jobs=-1)

# Format inside the f-string; `round(x, 3) * 100` can print float artefacts
# such as 89.60000000000001.
print(f"Cross-validated R2: {np.mean(scores):.3f}")
print(f"MSE {mean_squared_error(y_true=ytest, y_pred=clf.predict(Xtest))}")

Lets see how the predictions look. 

Wow, feature engineering does make all the difference. The MSE dropped below 0.02 and the mean absolute error improved by USD 0.0136 (about 1.4 cents). And pretty much all the prediction errors fall between 0 and 20 cents. So quite good after all. And better than guessing, right?

In [None]:
# Held-out predictions of the retrained model and their absolute errors.
predictions = clf.predict(Xtest)
errors = np.abs(predictions - ytest)

sns.set(rc={'figure.figsize': (11.7, 8.27)})
plt.hist(errors, bins=10, edgecolor='black');

# Mean absolute error, rounded to three decimals.
print('Mean Absolute Error: USD', round(errors.mean(), 3))

Lets see which features were more important for the model. 

It seems the type feature remains as strong as ever. But week proved to be a game changer, and month and year score well too. However, the other new features are almost ignored. Type and dates are really the features that are important for the model to predict the prices.

In [None]:
# Get numerical feature importances
importances = list(clf.feature_importances_)

feature_list = list(X.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Lets now try another approach, lets see how Prophet, an algorithm developed for time series predictions, does with this data. 

# Bring in the Prophet

In [None]:
# The package was renamed from `fbprophet` to `prophet` in release 1.0.
# Try the current name first and fall back to the legacy one.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

Since this is a time series algorithm, we cannot train on all the data together, so we need to look at one region at a time. Let's start with TotalUS and the conventional type. It doesn't make sense to mix the types since there are significant price differences between the two.

In [None]:
# Start again from the raw CSV: the engineered frame is not needed here.
df = pd.read_csv('../input/avocado.csv', index_col=0)
df['Date'] = pd.to_datetime(df['Date'])

# Keep a single series: nationwide totals, conventional avocados.
is_total_us_conventional = (df['region'] == 'TotalUS') & (df['type'] == 'conventional')

# Prophet expects the timestamp column to be `ds` and the target to be `y`.
df = df[is_total_us_conventional].rename(columns={'Date': 'ds', 'AveragePrice': 'y'})

m = Prophet()
m.fit(df);

Lets predict prices for 52 weeks into the future. 

In [None]:
# Extend the history by 52 weekly steps. 'W' is the canonical pandas weekly
# alias; the lowercase 'w' spelling is deprecated in recent pandas releases.
future = m.make_future_dataframe(periods=52, freq='W')
future.tail(3)

And plot the predictions along with the historical data. 

In [None]:
# Predict over the history plus the future rows, then plot the result.
forecast = m.predict(future)
m.plot(forecast, xlabel='Date', ylabel='Price');

In [None]:
# Plot the components of the forecast.
fig2 = m.plot_components(fcst=forecast)

Let's prepare the data so we can compare the predictions with the historical data.

In [None]:
# Attach the forecast columns to the history rows by date, then keep only
# the rows that have an observed price.
forecast_by_date = forecast.set_index('ds')
cmp_df = df.join(forecast_by_date, on='ds').loc[lambda frame: frame['y'].notnull()]

So Prophet's prediction is USD 0.067 off on average, which is USD 0.041 (about 4 cents), or 37.9%, better than the Random Forest using the numeric features. It seems Prophet is definitely better suited for the job.

In [None]:
# Observed prices against Prophet's predictions for the same dates.
actual, predicted = cmp_df['y'], cmp_df['yhat']

print(f"MSE {mean_squared_error(y_true=actual, y_pred=predicted)}")
print(f"R2 score: {r2_score(actual, predicted)}")
print(f"Mean absolute error:  {mean_absolute_error(actual, predicted)} USD")

90%+ of the predictions fall within USD 0.20 (20 cents) of the actual price.

In [None]:
# Distribution of Prophet's absolute errors on the historical rows.
errors = (cmp_df['yhat'] - cmp_df['y']).abs()
plt.hist(errors, bins=15, edgecolor='black');

Lets plot the predictions vs the targets. 

In [None]:
# Predicted price (x) against observed price (y) with a fitted line.
sns.set(rc={'figure.figsize': (12, 10)})
g = sns.regplot(data=cmp_df, x='yhat', y='y')

# One more run for Prophet

Lets now try it with the Houston data and the organic variety and see if we get similar results.

In [None]:
# Reload the raw data and isolate one series: Houston, organic.
df = pd.read_csv('../input/avocado.csv', index_col=0)
df['Date'] = pd.to_datetime(df['Date'])

mask = (df['region'] == 'Houston') & (df['type'] == 'organic')

# Prophet expects the timestamp column to be `ds` and the target to be `y`.
# Renaming without inplace=True avoids mutating a filtered slice.
df = df[mask].rename(columns={'Date': 'ds', 'AveragePrice': 'y'})

m = Prophet()
m.fit(df);

# 52 weekly steps past the history. 'W' is the canonical pandas weekly
# alias; the lowercase 'w' spelling is deprecated in recent pandas releases.
future = m.make_future_dataframe(periods=52, freq='W')
forecast = m.predict(future)
m.plot(forecast, xlabel='Date', ylabel='Price');

In [None]:
# Attach the forecast to the observed rows by date and keep the rows that
# have an actual price.
forecast_by_date = forecast.set_index('ds')
cmp_df = df.join(forecast_by_date, on='ds').loc[lambda frame: frame['y'].notnull()]

actual, predicted = cmp_df['y'], cmp_df['yhat']

print(f"MSE {mean_squared_error(y_true=actual, y_pred=predicted)}")
print(f"R2 score: {r2_score(actual, predicted)}")
print(f"Mean absolute error:  {mean_absolute_error(actual, predicted)} USD")

Hmmm, USD 0.126 absolute error. Maybe this Prophet cannot always see the future. :-D

I have run Prophet on other regions using the conventional type and the results were often as good as for TotalUS. I guess it doesn't do so well with the organic data since price variability in this set is larger than for the conventional type. (Standard deviation: Organic -> 0.363502 vs Conventional -> 0.263041)

# If you got this far, thank you for your patience and attention!!!

# If you have comments or suggestions I will be happy to read them, thanks in advance. 