### About Features

* Date - the date of the observation
* AveragePrice - the average price of a single avocado
* type - conventional or organic
* year - the year
* region - the city or region of the observation
* Total Volume - Total number of avocados sold
* 4046 - Total number of avocados with PLU 4046 sold
* 4225 - Total number of avocados with PLU 4225 sold
* 4770 - Total number of avocados with PLU 4770 sold


**Our task is to predict the Average Price.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [None]:
# Load the avocado CSV into a DataFrame.
df = pd.read_csv('../input/avacado-price-prediction/Avocado.csv')

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Discard the unneeded 'Unnamed: 0' column.

df = df.drop(columns='Unnamed: 0')

In [None]:
# Count the missing values in each column.

df.isna().sum()

Fortunately, we have no null values!

In [None]:
# Summary statistics of the dataset.
df.describe()

# EDA

In [None]:
# Shared look for every figure in the notebook: one background colour and
# an ordered palette that the plots slice or index into.
background_color = '#F8EDF4'
color_palette = [
    '#F78904',
    '#00C73C',
    '#D2125E',
    '#693AF9',
    '#B20600',
    '#007CDE',
    '#994936',
    '#886A00',
    '#39BBC2',
]

### Distribution of Target (AveragePrice)

In [None]:
# 3x2 grid: the left column holds text-only title panels, the right column
# holds the matching KDE plot of AveragePrice.
fig = plt.figure(figsize=(15, 12))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 2)
gs.update(hspace=0.2, wspace=0.3)

axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(2)]
ax0, ax1, ax2, ax3, ax4, ax5 = axes
title_axes = [ax0, ax2, ax4]
plot_axes = [ax1, ax3, ax5]

# Title panels (one caption per row)
captions = [
    'Distribution of AveragePrice\n____________________',
    'Distribution of AveragePrice\nby Type\n____________________',
    'Distribution of AveragePrice\nby Year\n____________________',
]
for title_ax, caption in zip(title_axes, captions):
    title_ax.text(0.5, 0.5, caption,
                  horizontalalignment='center',
                  verticalalignment='center',
                  fontsize=18, fontweight='bold', fontfamily='serif')

# Graphs: overall, split by type, split by year
sns.kdeplot(x='AveragePrice', data=df, fill=True, ax=ax1, color=color_palette[0])
sns.kdeplot(x='AveragePrice', data=df, fill=True, hue='type', ax=ax3, palette=color_palette[:2])
sns.kdeplot(x='AveragePrice', data=df, fill=True, hue='year', ax=ax5, palette=color_palette[:4])

# Cosmetics shared by every panel
for ax in axes:
    ax.set_facecolor(background_color)
    for s in ('top', 'right', 'left'):
        ax.spines[s].set_visible(False)

# Title panels carry text only: no tick labels, tick marks or bottom spine
for ax in title_axes:
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.spines[['bottom']].set_visible(False)

# Plot panels: dotted horizontal grid lines and no axis labels
for ax in plot_axes:
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))
    ax.set_xlabel('')
    ax.set_ylabel('')

### Distribution of Continuous Features

In [None]:
# Continuous columns examined below: sales-volume columns first, then bag counts.
volume_features = ['Total Volume', '4046', '4225', '4770']
bag_features = ['Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
cont_features = volume_features + bag_features

In [None]:
# 3x3 grid: the first cell is a text-only title panel, the remaining eight
# cells each show the KDE of one continuous feature.
fig = plt.figure(figsize=(20, 15))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 3)
gs.update(wspace=0.4, hspace=0.5)

axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes

# Cosmetics shared by every panel
for ax in axes:
    ax.set_facecolor(background_color)
    for s in ('top', 'right', 'left'):
        ax.spines[s].set_visible(False)

# Title panel: text only, no tick labels, tick marks or bottom spine
ax0.text(0.5, 0.5, 'Distribution of Continuous Features\n_______________________',
         fontsize=18, fontfamily='serif', fontweight='bold',
         horizontalalignment='center',
         verticalalignment='center')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines[['bottom']].set_visible(False)

# Graphs: one feature per panel, each drawn in its own palette colour
for feature, colour, ax in zip(cont_features, color_palette, axes[1:]):
    ax.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    sns.kdeplot(x=feature, data=df, ax=ax, color=colour, fill=True)
    ax.set_xlabel('')
    ax.set_ylabel('')
    


Oops! They are highly skewed!
Let's check skewness of the dataset.

In [None]:
# Skewness of each continuous feature on its original scale.
df[cont_features].skew()

A square-root ('sqrt') transform is a common way to reduce skewness.

In [None]:
# Skewness of each continuous feature after a square-root transform.
np.sqrt(df[cont_features]).skew()

It works! But... still skewed....

Let's try higher root.

In [None]:
# Skewness of each continuous feature after a sixth-root transform.
(df[cont_features] ** (1/6)).skew()

When we use the 6th root, it's much better.

Let's see the distribution again.

In [None]:
# Same 3x3 layout as the raw-feature figure, but every feature is raised to
# the power 1/6 before its KDE is drawn.
fig = plt.figure(figsize=(20, 15))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 3)
gs.update(wspace=0.4, hspace=0.5)

axes = [fig.add_subplot(gs[cell // 3, cell % 3]) for cell in range(9)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes

# Title
ax0.text(0.5, 0.5, 'Distribution of Continuous Features\nAfter Removing Skewness\n_______________________',
         fontsize=18, fontfamily='serif', fontweight='bold',
         horizontalalignment='center',
         verticalalignment='center')

# Sixth root of every continuous column, computed once for all panels
rooted = df[cont_features] ** (1/6)

for position, ax in enumerate(axes):
    ax.set_facecolor(background_color)
    for s in ('top', 'right', 'left'):
        ax.spines[s].set_visible(False)

    if position == 0:
        # Title panel: strip tick labels, tick marks and the bottom spine
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.tick_params(left=False, bottom=False)
        ax.spines[['bottom']].set_visible(False)
        continue

    feature = cont_features[position - 1]
    ax.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    sns.kdeplot(x=rooted[feature], ax=ax, color=color_palette[position - 1], fill=True)
    ax.set_xlabel('')
    ax.set_ylabel('')

'XLarge Bags' is still skewed, but the others look much better.

We can also see the distribution of continuous features by the 'type'.

In [None]:
# 3x3 grid again: title panel first, then the sixth-root KDE of each
# continuous feature, split by the 'type' column via hue.
fig = plt.figure(figsize=(20, 15))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(3, 3)
gs.update(wspace=0.4, hspace=0.5)

axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes
feature_axes = axes[1:]

# Cosmetics shared by every panel
for ax in axes:
    for s in ('top', 'right', 'left'):
        ax.spines[s].set_visible(False)
    ax.set_facecolor(background_color)

# Title panel: text only
ax0.text(0.5, 0.5, 'Distribution of Continuous Features\nby Type\n_______________________',
         fontsize=18, fontfamily='serif', fontweight='bold',
         horizontalalignment='center',
         verticalalignment='center')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines[['bottom']].set_visible(False)

# Graphs: the first two palette colours are used for the hue levels
type_palette = color_palette[0:2]
for feature, ax in zip(cont_features, feature_axes):
    ax.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    sns.kdeplot(x=df[feature]**(1/6), ax=ax, palette=type_palette, fill=True, hue=df['type'])
    ax.set_xlabel('')
    ax.set_ylabel('')

### Correlation Matrix

In [None]:
# Correlate the numeric columns only. 'Date', 'type' and 'region' have not
# been encoded yet at this point, and pandas >= 2.0 raises on non-numeric
# columns because DataFrame.corr's numeric_only now defaults to False.
# select_dtypes gives the same table on older pandas versions too.
df.select_dtypes(include='number').corr()

In [None]:
f, ax = plt.subplots(1, 1, figsize=(10, 10))

# Compute the correlation once and on numeric columns only: the text columns
# ('Date', 'type', 'region') are not encoded yet, and pandas >= 2.0 raises
# if they are passed to DataFrame.corr.
corr = df.select_dtypes(include='number').corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr))
ax.text(2.5, -0.1, 'Correlation Matrix', fontsize=18, fontweight='bold', fontfamily='serif')
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu',
            square=True, mask=mask, linewidth=0.7, ax=ax)

# Preprocessing

'Label Encoding' for the categorical features.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column. Re-fitting a single LabelEncoder
# on 'region' would throw away the fitted 'type' mapping, so the integer
# codes of 'type' could no longer be mapped back with inverse_transform.
encoders = {}
for column in ['type', 'region']:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Kept for backward compatibility: as before, `label` is the encoder that
# was last fitted on 'region'.
label = encoders['region']

Get month from 'Date' and drop the column.

In [None]:
# Parse 'Date', keep only its month number as 'Month', then drop the raw column.
parsed_dates = pd.to_datetime(df['Date'])
df['Month'] = parsed_dates.dt.month
df.drop(columns='Date', inplace=True)

# Modeling

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Features are every column except the target.
X = df.drop('AveragePrice', axis=1)
y = df['AveragePrice']

# Split BEFORE scaling. Fitting the scaler on the full data would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage) and make the test scores optimistic. The split itself is
# unchanged: same test_size and random_state, so the same rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the scaling statistics from the training rows only, then apply them
# unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for backward compatibility: the full feature matrix, scaled with the
# training-set statistics.
X_scaled = scaler.transform(X)

### Linear Regression

In [None]:
# Baseline linear regression: fit on the training split, then report
# model.score on both splits.
model = LinearRegression()
model.fit(X_train, y_train)
for split_name, features, target in (('Train', X_train, y_train), ('Test', X_test, y_test)):
    print(f'{split_name} Score : {model.score(features, target):.4f}')

### Ridge

In [None]:
# Baseline Ridge regression with default settings; report model.score on both splits.
model = Ridge()
model.fit(X_train, y_train)
print(f'Train Score : {model.score(X_train, y_train):.4f}')
print(f'Test Score : {model.score(X_test, y_test):.4f}')

### Gradient Boost

In [None]:
# Baseline gradient boosting. The estimator is seeded so the printed scores
# are reproducible from run to run, consistent with the seeded train/test split.
model = GradientBoostingRegressor(random_state=0)
model.fit(X_train, y_train)
print('Train Score : {:.4f}'.format(model.score(X_train, y_train)))
print('Test Score : {:.4f}'.format(model.score(X_test, y_test)))

### K-Nearest Neighbors

In [None]:
# Baseline k-nearest-neighbours regressor with default settings;
# report model.score on both splits.
model = KNeighborsRegressor()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
print(f'Train Score : {train_score:.4f}')
test_score = model.score(X_test, y_test)
print(f'Test Score : {test_score:.4f}')

### Random Forest

In [None]:
# Baseline random forest. The estimator is seeded so the printed scores are
# reproducible from run to run, consistent with the seeded train/test split.
model = RandomForestRegressor(random_state=0)

model.fit(X_train, y_train)
print('Train Score : {:.4f}'.format(model.score(X_train, y_train)))
print('Test Score : {:.4f}'.format(model.score(X_test, y_test)))

### Decision Tree

In [None]:
# Baseline decision tree. The estimator is seeded so the printed scores are
# reproducible from run to run, consistent with the seeded train/test split.
model = DecisionTreeRegressor(random_state=0)

model.fit(X_train, y_train)
print('Train Score : {:.4f}'.format(model.score(X_train, y_train)))
print('Test Score : {:.4f}'.format(model.score(X_test, y_test)))

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Parallel lists filled by the tuning cells below: model name and its test score.
# NOTE: re-running a tuning cell appends again; re-run this cell first to reset.
models = []
scores = []

### Ridge

In [None]:
# Grid over the regularisation strength. The original list contained 0.0001
# twice; the duplicate only added five redundant CV fits, so it is removed.
param_grid = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
model = Ridge()

# 5-fold cross-validated search on the training split; grid_search.score
# then evaluates the selected model on the test split.
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
ridge_score = grid_search.score(X_test, y_test)

print('Test Score : {:.4f}'.format(ridge_score))
print('Best Parameters :', grid_search.best_params_)
print('Best CV Score : {:.4f}'.format(grid_search.best_score_))

# Record the result for the comparison table.
models.append('Ridge')
scores.append(ridge_score)

### Gradient Boost

In [None]:
# Grid over tree depth. The estimator is seeded so that the selected depth
# and the reported scores are reproducible from run to run.
param_grid = {'max_depth' : [2, 3, 4, 5, 6, 7, 8, 9]}
model = GradientBoostingRegressor(random_state=0)

# 5-fold cross-validated search on the training split; grid_search.score
# then evaluates the selected model on the test split.
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
gb_score = grid_search.score(X_test, y_test)

print('Test Score : {:.4f}'.format(gb_score))
print('Best Parameters :', grid_search.best_params_)
print('Best CV Score : {:.4f}'.format(grid_search.best_score_))

# Record the result for the comparison table.
models.append('Gradient Boost')
scores.append(gb_score)

### K-Nearest Neighbors

In [None]:
# Search the neighbour count from 1 to 19 with 5-fold cross-validation.
model = KNeighborsRegressor()
param_grid = {'n_neighbors' : range(1, 20)}

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Evaluate the selected model on the test split and report the search outcome.
knr_score = grid_search.score(X_test, y_test)
print(f'Test Score : {knr_score:.4f}')
print(f'Best Parameters : {grid_search.best_params_}')
print(f'Best CV Score : {grid_search.best_score_:.4f}')

# Record the result for the comparison table.
models.append('K-Nearest Neighbors')
scores.append(knr_score)

### Random Forest

In [None]:
# Grid over tree depth. The estimator is seeded so that the selected depth
# and the reported scores are reproducible from run to run.
param_grid = {'max_depth' : [5, 10, 15, 20, 25, 30]}
model = RandomForestRegressor(random_state=0)

# 5-fold cross-validated search on the training split; grid_search.score
# then evaluates the selected model on the test split.
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
rf_score = grid_search.score(X_test, y_test)

print('Test Score : {:.4f}'.format(rf_score))
print('Best Parameters :', grid_search.best_params_)
print('Best CV Score : {:.4f}'.format(grid_search.best_score_))

# Record the result for the comparison table.
models.append('Random Forest')
scores.append(rf_score)

### Decision Tree

In [None]:
# Grid over tree depth and minimum leaf size. The estimator is seeded so
# that the selected parameters and the reported scores are reproducible
# from run to run.
param_grid = {'max_depth' : range(2, 20), "min_samples_leaf" : range(2, 10)}
model = DecisionTreeRegressor(random_state=0)

# 5-fold cross-validated search on the training split; grid_search.score
# then evaluates the selected model on the test split.
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
dt_score = grid_search.score(X_test, y_test)

print('Test Score : {:.4f}'.format(dt_score))
print('Best Parameters :', grid_search.best_params_)
print('Best CV Score : {:.4f}'.format(grid_search.best_score_))

# Record the result for the comparison table.
models.append('Decision Tree')
scores.append(dt_score)

## Result

In [None]:
# Collect the tuned models and their test scores into one table.
result_columns = dict(Model=models, Score=scores)
df_result = pd.DataFrame(data=result_columns)
df_result

In [None]:
# 1x2 layout: a text-only title panel on the left, a horizontal bar chart of
# the model scores on the right.
fig = plt.figure(figsize=(15, 4))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.2)

axes = [fig.add_subplot(gs[0, col]) for col in range(2)]
ax0, ax1 = axes

# Title panel: no tick labels, tick marks or left spine
ax0.text(0.5, 0.5, 'Score of Models\n ___________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontfamily='serif', fontweight='bold')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['left'].set_visible(False)

# Graph: one bar per model
ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))
sns.barplot(x='Score', y='Model', data=df_result, palette=color_palette, ax=ax1)
ax1.set_xlabel('')
ax1.set_ylabel('')

# Cosmetics shared by both panels (the bar chart keeps its left spine)
for ax in axes:
    ax.set_facecolor(background_color)
    for s in ('top', 'right', 'bottom'):
        ax.spines[s].set_visible(False)

### Thank you!
### Please Upvote if you like my notebook 👍