In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# !pip install seaborn==0.11.0
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found beneath the Kaggle input directory, one full path per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the avocado data. The CSV's first column is consumed as the index and
# then discarded in favour of a fresh 0..n-1 index.
avocado_csv_path = '../input/avocado-prices/avocado.csv'
df = pd.read_csv(avocado_csv_path, index_col=0)
df = df.reset_index(drop=True)
df

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Convert 'Date' to pandas datetimes so the .dt accessor can be used on it.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Derive calendar features (month and day of month) from each observation date.
obs_date = df['Date'].dt
df['month_obs'] = obs_date.month
df['day_obs'] = obs_date.day

# Business Understanding
"...if a Millennial could find a city with cheap avocados..."
* Can avocado prices be predicted so that Millennials can find where avocados are cheap?
* Was the Avocadopocalypse of 2017 real?

# Data Understanding

In [None]:
# Display the frame as a reference for the column descriptions that follow.
df

*     Date - The date of the observation
*     AveragePrice - the average price of a single avocado
*     type - conventional or organic
*     year - the year
*     region - the city or region of the observation
*     Total Volume - Total number of avocados sold
*     4046 - Total number of avocados with PLU 4046 sold --> Small CA Avocado
*     4225 - Total number of avocados with PLU 4225 sold --> Medium CA Avocado
*     4770 - Total number of avocados with PLU 4770 sold --> Large CA Avocado
*     Total Bags - Total of small + large + xl bags sold
*     Small Bags - total small bags of avocado sold
*     Large Bags - total large bags of avocado sold
*     XLarge Bags - total xlarge bags of avocado sold
---
>Total Volume = 4046 + 4225 + 4770 + Total Bags.

>PLU stands for Price Look Up.

>The data come directly from retailers’ cash registers and are based on actual retail sales of Hass avocados.

In [None]:
# Scatter total sales volume against average price, one colour per region,
# drawn on a 20 x 10 inch figure.
g = sns.relplot(x='AveragePrice', y='Total Volume', hue='region', data=df)
g.fig.set_size_inches(20, 10)
plt.show()

In [None]:
# Histogram of the average price across all observations.
sns.displot(df, x='AveragePrice', kind='hist')
plt.show()

In [None]:
# Box plot of the average price per region on a 20 x 10 inch figure;
# the region labels are rotated 90 degrees so they do not overlap.
g = sns.catplot(x='region', y='AveragePrice', kind='box', data=df)
g.fig.set_size_inches(20, 10)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Average price against sales of PLU 4046 avocados, coloured by sales of
# PLU 4225 avocados. (The unused `cols` list that used to be defined here was
# removed: it was never read before being reassigned in Data Preparation.)
g = sns.relplot(data=df, x='4046', y='AveragePrice', hue='4225')
plt.show()

In [None]:
# Average price against small-bag sales, coloured by large-bag sales.
g = sns.relplot(
    x='Small Bags',
    y='AveragePrice',
    hue='Large Bags',
    data=df,
)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns only. Since pandas 2.0,
# DataFrame.corr() no longer silently skips non-numeric columns (here: Date,
# type, region) and raises instead, so the numeric columns are selected
# explicitly; this also works on older pandas versions.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr)
plt.show()

Candidate predictors (independent variables): Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags. The dependent variable (target) is AveragePrice.

In [None]:
# Ratio features: small-to-large bag sales and PLU 4046-to-4225 sales.
# These were previously computed as products, which contradicts their names.
# Zero denominators are mapped to NaN first so the ratio is missing rather
# than infinite.
df['Ratio_small_large'] = df['Small Bags'] / df['Large Bags'].replace(0, np.nan)
df['Ratio_4046_4225'] = df['4046'] / df['4225'].replace(0, np.nan)

In [None]:
# Average price against the small/large bag feature.
sns.relplot(x='AveragePrice', y='Ratio_small_large', data=df)
plt.show()

In [None]:
# Average price against the 4046/4225 feature.
sns.relplot(x='AveragePrice', y='Ratio_4046_4225', data=df)
plt.show()

In [None]:
# Average price against total volume for every observation.
sns.relplot(x='AveragePrice', y='Total Volume', data=df)
plt.show()

In [None]:
# Same scatter, restricted to observations with a total volume below 20 million.
is_below_20m = df['Total Volume'] < 20000000
sns.relplot(data=df[is_below_20m], x='AveragePrice', y='Total Volume')
plt.show()

In [None]:
# Same scatter, restricted to observations with a total volume above 20 million.
is_above_20m = df['Total Volume'] > 20000000
sns.relplot(data=df[is_above_20m], x='AveragePrice', y='Total Volume')
plt.show()

In [None]:
# Kernel density estimate of the average price, one curve per year.
sns.displot(df, x='AveragePrice', hue='year', kind='kde', palette='pastel')
plt.show()

In [None]:
# Per-year price densities, faceted by observation month (five panels per row).
kde_by_month = dict(
    x='AveragePrice',
    hue='year',
    col='month_obs',
    col_wrap=5,
    kind='kde',
    palette='pastel',
)
sns.displot(data=df, **kde_by_month)
plt.show()

In [None]:
# Price densities per avocado type, one panel per year.
sns.displot(df, x='AveragePrice', hue='type', col='year', kind='kde')
plt.show()

# Data Preparation

In [None]:
# Binary indicator features, stored as integers (1/0) instead of the strings
# '1'/'0' so the columns are genuinely numeric when passed to the estimators,
# and computed with vectorised comparisons instead of a row-wise apply.
# isAbove20Mils: 1 when Total Volume exceeds 20 million, else 0.
df['isAbove20Mils'] = (df['Total Volume'] > 20000000).astype(int)
# _type: 1 for 'conventional', 0 for any other type.
df['_type'] = (df['type'] == 'conventional').astype(int)

In [None]:
# Feature columns used for modelling; the target is the average price.
cols = [
    'Total Volume',
    '_type',
    'year',
    'month_obs',
    'day_obs',
    'isAbove20Mils',
]
X, y = df[cols], df['AveragePrice']

In [None]:
# One-hot encode month and day of month, dropping the last dummy column of
# each, and place them in front of the existing features:
# day dummies, then month dummies, then the columns of X.
month_dummies = pd.get_dummies(X['month_obs'], prefix='mo').iloc[:, :-1]
day_dummies = pd.get_dummies(X['day_obs'], prefix='do').iloc[:, :-1]
merged = month_dummies.reset_index(drop=True).join(X)
merged = day_dummies.reset_index(drop=True).join(merged)

In [None]:
# X now refers to the same object as `merged` (plain assignment, no copy).
X = merged 
X

In [None]:
# Drop the raw month/day columns from the feature matrix.
# Rebinding X (instead of inplace=True) leaves `merged` untouched, and
# errors='ignore' lets this cell be re-run without raising a KeyError.
X = X.drop(columns=['month_obs', 'day_obs'], errors='ignore')

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Baseline model: linear regression fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict with the baseline model on the held-out features.
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the baseline predictions against the held-out targets.
mean_squared_error(y_test, y_pred)

In [None]:
def predict(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its test-set MSE.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and targets the model is fitted on.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    The mean squared error between ``y_test`` and the model's predictions
    for ``X_test``.
    """
    # Fit on the training split only. Fitting on the notebook-level X, y would
    # expose the test rows to the model and make the reported error
    # optimistically biased.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

In [None]:
# Ridge regression with default settings, scored with the predict() helper.
predict(Ridge(), X_train,  X_test,y_train, y_test)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Standardise the features, then fit a linear model by stochastic gradient
# descent. random_state is fixed (as in the MLP and random-forest cells) so
# the score is repeatable between runs.
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=10000, tol=1e-3, random_state=0),
)
predict(reg, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Multi-layer perceptron with one hidden layer of 1000 tanh units, seeded with random_state=1.
mlp = MLPRegressor(hidden_layer_sizes=(1000,), activation='tanh', random_state=1)
predict(mlp, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.svm import LinearSVR
# Standardise the features, then fit a linear support vector regressor.
# random_state is fixed (as in the MLP and random-forest cells) so the score
# is repeatable between runs.
reg = make_pipeline(
    StandardScaler(),
    LinearSVR(C=0.05, epsilon=0.351, random_state=0),
)
predict(reg, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees, seeded with random_state=0.
forest = RandomForestRegressor(n_estimators=100, random_state=0)
predict(forest, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with 1000 estimators and a learning rate of 1.
gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=1)
predict(gbr, X_train, X_test, y_train, y_test)