# Problem Statement:
Avocados are a fruit that is heavily consumed in the United States. 

Content
This data was downloaded from the Hass Avocado Board website in May of 2018 & compiled into a single CSV. 

The table below represents weekly 2018 retail scan data for National retail volume (units) and price. Retail scan data comes directly from retailers’ cash registers based on actual retail sales of Hass avocados. 

Starting in 2013, the table below reflects an expanded, multi-outlet retail data set. Multi-outlet reporting includes an aggregation of the following channels: grocery, mass, club, drug, dollar and military. The Average Price (of avocados) in the table reflects a per unit (per avocado) cost, even when multiple units (avocados) are sold in bags. 

The Product Lookup codes (PLU’s) in the table are only for Hass avocados. Other varieties of avocados (e.g. greenskins) are not included in this table.

Some relevant columns in the dataset:

Date - The date of the observation
AveragePrice - the average price of a single avocado
type - conventional or organic
year - the year
Region - the city or region of the observation
Total Volume - Total number of avocados sold
4046 - Total number of avocados with PLU 4046 sold
4225 - Total number of avocados with PLU 4225 sold
4770 - Total number of avocados with PLU 4770 sold


Inspiration /Label 

Your task is to build a model that takes the data provided and predicts the Average Price.

#Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from scipy.stats import zscore #to remove outliers
from scipy.stats import skew
import requests
import pandas_profiling
import io
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Importing dataset

In [None]:
df = pd.read_csv("avocado.csv")

# eda

In [None]:
df.head(4)

In [None]:
df.shape # check the data dimension

In [None]:
#the dataset has only 1517 rows of real data (index 0-1516) but the file is being imported with 16468 rows

In [None]:
# Show the Date at index label 1516. Rows after this label are dropped further
# down (leaving labels 0..1516, i.e. 1517 rows), and the extra 'Unnamed: 0'
# index column is dropped as well.
df["Date"][1516]

In [None]:
df.columns

In [None]:
# Remove the leftover 'Unnamed: 0' column in place.
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
df["Date"][16467]

In [None]:
# Keep only index labels 0..LAST_VALID_INDEX and discard everything after.
# One vectorised drop replaces the original row-by-row loop, which rebuilt the
# whole frame once for every removed row (~15k full copies).
LAST_VALID_INDEX = 1516
df = df.drop(index=df.index[df.index > LAST_VALID_INDEX])

In [None]:
df #additional rows are dropped 

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
df.columns # check the column names for EDA

In [None]:
df.describe()

## handle categorical data

In [None]:
#categorical columns in the dataset are as follows:
#date, type, region

In [None]:
#splitting date column to 3 different columns

In [None]:
# Split the "Date" strings on "-" into three string columns.
# NOTE(review): the target names assume the parts arrive in day-month-year
# order — confirm against the CSV. An existing "year" column is overwritten.
df[["day", "month", "year"]] = df["Date"].str.split(pat="-", expand=True)

In [None]:
# "Date" has been expanded into day/month/year, so remove the original column in place.
df.drop(columns=["Date"], inplace=True)

In [None]:
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column and keep them in `encoders`, so the integer codes
# of each column can still be mapped back to the original labels with
# inverse_transform. Re-fitting a single shared encoder (as before) threw away
# the mapping learned for "type".
encoders = {}
for cat_col in ("type", "region"):
    LE = LabelEncoder()  # after the loop LE is the "region" encoder, as before
    df[cat_col] = LE.fit_transform(df[cat_col])
    encoders[cat_col] = LE

In [None]:
df.head(5)

In [None]:
df.dtypes

In [None]:
# The split date parts are still object-typed strings; convert each one to the
# smallest float dtype that can hold its values.
for date_part in ("day", "month", "year"):
    df[date_part] = pd.to_numeric(df[date_part], downcast="float")

In [None]:
df.dtypes

## handle null values

In [None]:
df.isna().sum() #check for null values

In [None]:
# Visual check for missing values: one heatmap cell per DataFrame cell.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.isnull(), ax=ax)
ax.set_title("Null Values")
plt.show()

In [None]:
#No null values to be handled

In [None]:
df.columns

In [None]:
sns.scatterplot(x='4046', y='AveragePrice', data=df)

In [None]:
sns.scatterplot(x='4225', y='AveragePrice', data=df)

In [None]:
sns.scatterplot(x='4770', y='AveragePrice', data=df)

In [None]:
#Checking the distribution of values of each column

In [None]:
# One shaded density plot per column, each in its own figure.
for col in df.columns:
    print(col)
    plt.figure()
    sns.kdeplot(df[col], shade=True)
    plt.show()

In [None]:
#Checking the distribution of values of each column

In [None]:
# One count plot per column, each in its own figure.
for col in df:
    print(col)

    plt.figure()
    # Pass the column explicitly as x=. A bare positional argument is read as
    # `data` by newer seaborn releases, which changes (or breaks) the plot.
    sns.countplot(x=df[col])
    plt.show()

In [None]:
df.columns

In [None]:
# Build a profiling report titled "avocado" for the current frame.
# NOTE(review): profile_report is presumably the DataFrame method registered by
# the pandas_profiling import at the top of the notebook — confirm.
pre_profile = df.profile_report(title="avocado")
# Uncomment to also write the report to an HTML file:
#pre_profile.to_file(output_file="avocado.html")

In [None]:
pre_profile

In [None]:
#type has constant value "0"	Constant
#Total Volume is highly correlated with 4046 and 5 other fields	High correlation
#4046 is highly correlated with Total Volume and 3 other fields	High correlation
#4225 is highly correlated with Total Volume and 5 other fields	High correlation
#4770 is highly correlated with Total Volume and 3 other fields	High correlation
#Total Bags is highly correlated with Total Volume	High correlation
#Small Bags is highly correlated with Total Volume	High correlation
#Large Bags is highly correlated with Total Volume	High correlation
#type is highly correlated with year	High correlation

## check for outliers

In [None]:
#remove outliers before skewness check and before x, y split

In [None]:
# Box plot of every column on one wide figure; the extra bottom margin leaves
# room for the column labels.
df.boxplot(figsize=(20, 8))
plt.subplots_adjust(bottom=0.25)
plt.show()

In [None]:
#Removing outliers by z score

In [None]:
from scipy.stats import zscore

# A zero-variance column (the profiling notes report `type` as constant) gets a
# z-score of NaN, and NaN < 3 is False, so including it rejected EVERY row and
# new_df came out empty. Score only the columns that actually vary.
varying = df.loc[:, df.nunique() > 1]
z = np.abs(zscore(varying))

# Keep the rows whose every (varying) column lies within 3 standard deviations.
# NOTE: the cells that follow still work on df, not new_df.
new_df = df[(z < 3).all(axis=1)]

In [None]:
new_df.shape

In [None]:
df.shape

In [None]:
# Percentage of rows removed by the z-score filter, computed from the frames
# themselves instead of hand-typed row counts.
dataloss = (len(df) - len(new_df)) / len(df) * 100

In [None]:
dataloss

In [None]:
#Outliers are not removed since data loss is very high

## check correlation

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
# `cor` is kept because the next cell sorts its "AveragePrice" column.
cor = df.corr()
plt.figure(figsize=(22, 12))
sns.heatmap(cor, annot=True)
plt.show()

In [None]:
#Arrange correlations with AveragePrice in descending order. Dropping columns should be the last option, to prevent data loss.

In [None]:
cor["AveragePrice"].sort_values(ascending=False)

In [None]:
df.columns

In [None]:
# Columns fed to the pair plot: the target first, then every feature.
columns = [
    'AveragePrice',
    'Total Volume',
    '4046',
    '4225',
    '4770',
    'Total Bags',
    'Small Bags',
    'Large Bags',
    'XLarge Bags',
    'type',
    'year',
    'region',
    'day',
    'month',
]

In [None]:
sns.pairplot(df[columns])

## check for skewness

In [None]:
# Target is the average price; the features are every other column.
y = df['AveragePrice']
x = df.drop(columns='AveragePrice')

In [None]:
# For every column: print its name and sample skewness, then draw its
# distribution in a separate figure.
for col in df.columns:
    print(col)
    print(skew(df[col]))
    plt.figure()
    sns.distplot(df[col])
    plt.show()

In [None]:
x.skew() # check skewness

In [None]:
from sklearn.preprocessing import power_transform

# power_transform returns a bare ndarray. Rebuild the frame with x's column
# names AND its row labels, so the transformed features keep the same index as
# y instead of silently switching to a fresh 0..n-1 index.
# NOTE(review): the transform is fitted on all rows before the train/test
# split, so the held-out rows influence it — consider fitting on the training
# split only.
df_new = pd.DataFrame(power_transform(x), columns=x.columns, index=x.index)

In [None]:
df_new.skew()

In [None]:
df_new

In [None]:
x

In [None]:
x = df_new

## test train split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=101,
)

## regression

In [None]:
#Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Ordinary least squares fitted on the training split.
lm = LinearRegression()
lm.fit(x_train, y_train)
print('Coefficients: \n', lm.coef_)

# Predicted vs. actual prices on the held-out split.
predictions = lm.predict(x_test)
plt.scatter(y_test, predictions)
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')

# Error metrics on the held-out split; the MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('Variance:', metrics.explained_variance_score(y_test, predictions))

In [None]:
from sklearn.linear_model import Lasso, Ridge

In [None]:
# Candidate alpha values: .00001, .0001, 0.01, .1, 1, 10. Higher values shrink
# the coefficients more strongly towards 0 and so change the output.
# scikit-learn's default alpha for Lasso is 1.0; a much weaker penalty is used here.
ls = Lasso(alpha=0.00001)
ls.fit(x_train, y_train)
# NOTE: this is the R^2 on the training split, not on the held-out x_test.
ls.score(x_train, y_train)

In [None]:
ls.coef_

In [None]:
# Ridge regression with a very weak L2 penalty, fitted on the training split.
rd = Ridge(alpha=1e-5)
rd.fit(X=x_train, y=y_train)
# R^2 on the training split.
rd.score(x_train, y_train)

In [None]:
rd.coef_

In [None]:
#ElasticNet
from sklearn.linear_model import ElasticNet

# Elastic-net regression with a very weak penalty, fitted on the training split.
enr = ElasticNet(alpha=1e-5)
enr.fit(X=x_train, y=y_train)
# R^2 on the training split.
enr.score(x_train, y_train)

In [None]:
enr.coef_

In [None]:
#CatBoostRegressor

In [None]:
from catboost import CatBoostRegressor
from sklearn import metrics
from sklearn.metrics import r2_score

# Small, shallow boosted model: 10 trees of depth 2 with a large learning rate.
model = CatBoostRegressor(iterations=10, learning_rate=0.5, depth=2)
model.fit(x_train, y_train)
preds = model.predict(x_test)

# Predicted vs. actual prices on the held-out split.
plt.scatter(y_test, preds)
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')

# Held-out metrics, printed one per line as "<label> <value>".
mse = metrics.mean_squared_error(y_test, preds)
report = [
    ('MAE:', metrics.mean_absolute_error(y_test, preds)),
    ('MSE:', mse),
    ('RMSE:', np.sqrt(mse)),
    ('R2 Score', r2_score(y_test, preds)),
    ('Variance:', metrics.explained_variance_score(y_test, preds)),
]
for label, value in report:
    print(label, value)

In [None]:
#decisiontreeregressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# Depth-limited regression tree fitted on the training split.
regr_1 = DecisionTreeRegressor(max_depth=5)
regr_1.fit(x_train, y_train)

# Predictions for the held-out split.
preds = regr_1.predict(x_test)

# Predicted vs. actual prices.
plt.scatter(y_test, preds)
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')

# Held-out metrics (`metrics` is the sklearn module imported in an earlier
# cell), printed one per line as "<label> <value>".
mse = metrics.mean_squared_error(y_test, preds)
report = [
    ('MAE:', metrics.mean_absolute_error(y_test, preds)),
    ('MSE:', mse),
    ('RMSE:', np.sqrt(mse)),
    ('Variance:', metrics.explained_variance_score(y_test, preds)),
]
for label, value in report:
    print(label, value)

## hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depth 1..10 crossed with min_samples_split 10, 20, ..., 50.
search_space = {
    'max_depth': range(1, 11),
    'min_samples_split': range(10, 60, 10),
}

model = DecisionTreeRegressor()

# 5-fold cross-validated search, single process, scored by negative MSE.
gs = GridSearchCV(
    estimator=model,
    param_grid=search_space,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    cv=5,
)
gs.fit(x_train, y_train)

# Best parameter combination, and its mean CV MSE (sign flipped back to positive).
print(gs.best_params_)
print(-gs.best_score_)

In [None]:
# Build the final tree from the hyper-parameters the grid search actually
# selected, rather than values copied by hand from one particular run (they
# go stale whenever the data or the search grid changes).
# gs.best_estimator_ is an alternative: the already-refitted best model.
new_model = DecisionTreeRegressor(**gs.best_params_)
new_model.fit(x_train, y_train)

## saving the model

In [None]:
import joblib

# Persist the tuned tree to disk (rename the file as per project name), then
# predict the held-out split with it.
joblib.dump(value=new_model, filename="model.pkl")
prediction = new_model.predict(x_test)