In [1]:
import os

AVOCADOS_PATH = os.path.join("datasets", "avocados")

In [2]:
import pandas as pd

def load_avocados_data(avocados_path=AVOCADOS_PATH):
    """Load the avocado dataset into a DataFrame.

    Parameters
    ----------
    avocados_path : str
        Directory that contains ``avocado.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<avocados_path>/avocado.csv``.
    """
    return pd.read_csv(os.path.join(avocados_path, "avocado.csv"))

In [3]:
avocados = load_avocados_data()
avocados.head()

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/avocados/avocado.csv'

In [None]:
avocados.info()

In [None]:
avocados["type"].value_counts()

In [None]:
avocados["region"].value_counts()

In [None]:
avocados.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
avocados.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# The split below changes every time it is run, so over repeated runs
# the model would eventually see the entire dataset.

import numpy as np

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training and a test DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected positionally with ``.iloc``.
    test_ratio : float
        Fraction of rows placed in the test set (size is truncated to an int).
    random_state : int or None, optional
        Seed for a private ``RandomState``. With the default ``None`` the global
        NumPy random state is used, so the split differs on every call unless
        ``np.random.seed`` was set beforehand.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``.
    """
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A dedicated generator keeps the split reproducible without touching
        # the global random state.
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` using a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``test_set_check``; rows for
    which it returns True go to the test frame, all others to the train frame.
    """
    test_mask = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio)
    )
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
avocados_with_id = avocados.reset_index() # adds an 'index column'
train_set, test_set = split_train_test_by_id(avocados_with_id, 0.2, "index")

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(avocados, test_size=0.2, random_state=42)

In [None]:
#Checking if the sets were split evenly
train_set["type"].value_counts()

In [None]:
test_set["type"].value_counts()

In [None]:
# Correlate numeric columns only: the frame also holds string columns
# (Date, type, region), which DataFrame.corr() rejects on pandas >= 2.0.
corr_matrix = avocados.select_dtypes(include="number").corr()

In [None]:
corr_matrix["AveragePrice"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["AveragePrice", "4046", "Total Volume", "4770"]
scatter_matrix(avocados[attributes], figsize=(12,8))

In [None]:
avocados.plot(kind="scatter", x="Total Volume", y="AveragePrice",
             alpha=0.1)

In [None]:
# Derived features: volume per bag, and each PLU code's share of total volume.
# NOTE(review): "Total Bags" still contains zeros at this point (they are replaced
# in a later cell), so volume_per_bag may hold inf values here — confirm before
# relying on the statistics/plots computed from it.
avocados["volume_per_bag"] = avocados["Total Volume"]/avocados["Total Bags"]
avocados["4046_ratio"] = avocados["4046"]/avocados["Total Volume"]
avocados["4225_ratio"] = avocados["4225"]/avocados["Total Volume"]
avocados["4770_ratio"] = avocados["4770"]/avocados["Total Volume"]

In [None]:
avocados.describe()

In [None]:
# Recompute correlations including the new ratio features. Only numeric columns
# are used because DataFrame.corr() rejects string columns on pandas >= 2.0.
corr_matrix = avocados.select_dtypes(include="number").corr()
corr_matrix["AveragePrice"].sort_values(ascending=False)

In [None]:
attributes = ["AveragePrice", "4046_ratio", "4770_ratio", "4225_ratio"]
scatter_matrix(avocados[attributes], figsize=(12,8))

In [None]:
avocados.plot(kind="scatter", x="volume_per_bag", y="AveragePrice",
             alpha=0.1)

In [None]:
avocados["volume_per_bag"].max()

In [None]:
avocados.describe()

In [None]:
avocados["Total Bags"].count()

In [None]:
avocados["Total Volume"].max()

In [None]:
### Fix rows where "Total Bags" is 0 by replacing the 0 with the mean of the other values
def indexes_for_value(data, value):
    """Return the positions at which ``data`` equals ``value``.

    Parameters
    ----------
    data : sequence, numpy.ndarray or pandas.Series
        One-dimensional collection of values.
    value : scalar
        Value to look for.

    Returns
    -------
    list of int
        Zero-based positions (not index labels) of the matching elements, in
        ascending order. Empty if nothing matches.
    """
    # Compare on the underlying array so a Series with a non-default index is
    # searched by position instead of by label.
    matches = np.asarray(data) == value
    return np.flatnonzero(matches).tolist()
            
def replaceValueMean(data, value):
    """Replace every occurrence of ``value`` in ``data`` with the mean of the rest.

    The replacement happens in place and ``data`` itself is returned, so
    existing callers that rely on either the mutation or the return value keep
    working.

    Parameters
    ----------
    data : list, numpy.ndarray or pandas.Series
        One-dimensional numeric collection; modified in place.
    value : scalar
        The placeholder value to overwrite (e.g. ``0``).

    Returns
    -------
    The same ``data`` object.

    Raises
    ------
    ZeroDivisionError
        If every element equals ``value``, so there is nothing to average.
    """
    values = np.asarray(data)
    to_replace = values == value
    positions = np.flatnonzero(to_replace)
    if positions.size == 0:
        # Nothing to replace (this also covers empty input).
        return data

    kept = values[~to_replace]  # boolean indexing copies, so later writes are safe
    if kept.size == 0:
        raise ZeroDivisionError(
            "cannot compute a replacement mean: every element equals the value to replace"
        )
    mean = kept.sum() / kept.size

    if hasattr(data, "iloc"):
        # pandas objects: write by position so non-default indexes work.
        data.iloc[positions] = mean
    elif isinstance(data, np.ndarray):
        data[positions] = mean
    else:
        for position in positions:
            data[position] = mean
    return data
            

In [None]:
indexes_for_value(avocados["Total Bags"], 0)

In [None]:
# Assign the result back explicitly instead of relying on in-place mutation of the
# Series returned by avocados["Total Bags"]; under pandas Copy-on-Write that Series
# is detached from the frame and the frame would silently keep its zeros.
avocados["Total Bags"] = replaceValueMean(avocados["Total Bags"].copy(), 0)

In [None]:
avocados["volume_per_bag"] = avocados["Total Volume"]/avocados["Total Bags"]

In [None]:
avocados.describe()

In [None]:
train_set, test_set = train_test_split(avocados, test_size=0.2, random_state=42)
avocados = train_set.copy()

In [None]:
avocados_labels = train_set["AveragePrice"].copy()
avocados = train_set.drop("AveragePrice", axis=1)

In [None]:
#substitute for missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
avocados = avocados.drop("Unnamed: 0", axis=1)

In [None]:
avocados.describe()

In [None]:
avocados_num = avocados.drop(["Date", "type", "year", "region"], axis=1)

In [None]:
imputer.fit(avocados_num)

In [None]:
imputer.statistics_

In [None]:
avocados_num.median().values

In [None]:
X = imputer.transform(avocados_num)

In [None]:
avocados_tr = pd.DataFrame(X, columns=avocados_num.columns)

In [None]:
def toMonth(dates):
    """Extract the month number from ISO-style date strings.

    Parameters
    ----------
    dates : iterable of str
        Strings whose characters 5-6 hold the month (e.g. ``"2015-12-27"``).

    Returns
    -------
    pandas.DataFrame
        A single ``"Month"`` column of ints. When ``dates`` is a Series its index
        is preserved, so assigning the result into the originating frame lines
        up row by row; for other iterables a default RangeIndex is used.
    """
    months = [int(date[5:7]) for date in dates]
    # Keep the caller's index: column assignment aligns on index labels, and a
    # fresh RangeIndex would scatter the months across the wrong rows of a
    # shuffled frame (or leave NaN where labels do not exist).
    index = dates.index if isinstance(dates, pd.Series) else None
    return pd.DataFrame({"Month": months}, index=index)

In [None]:
avocados["Month"]=toMonth(avocados["Date"])

In [None]:
avocados.describe()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
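# The default OneHotEncoder returns a SciPy sparse matrix; pass sparse_output=False
# (sparse=False on scikit-learn < 1.2) to get a dense array instead.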
type_encoder = OneHotEncoder()
avocados_type_reshaped = avocados["type"].values.reshape(-1,1)
avocados_type_1hot = type_encoder.fit_transform(avocados_type_reshaped)
avocados_type_1hot

In [None]:
type_encoder.categories_

In [None]:
region_encoder = OneHotEncoder()
avocados_region_reshaped = avocados["region"].values.reshape(-1,1)
avocados_region_1hot = region_encoder.fit_transform(avocados_region_reshaped)
avocados_region_1hot

In [None]:
region_encoder.categories_

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions inside the numeric feature matrix (avocados_num). They are
# looked up by name because avocados_num has no "Date" column, so positions
# counted on the full frame do not apply to the arrays the pipelines pass in.
_num_columns = list(avocados_num.columns)
volume_ix, ix_4046, ix_4225, ix_4770, bags_ix = [
    _num_columns.index(column)
    for column in ("Total Volume", "4046", "4225", "4770", "Total Bags")
]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append PLU-share ratios (and optionally volume per bag) to a feature array.

    Parameters
    ----------
    add_volume_per_bag : bool, default True
        When True, a ``volume / bags`` column is appended after the three ratio
        columns; zero entries in the bags column are first replaced with the mean
        of the non-zero entries via ``replaceValueMean``.
    """
    def __init__(self, add_volume_per_bag = True):
        self.add_volume_per_bag = add_volume_per_bag

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the extra columns appended on the right.

        ``X`` is a 2-D array whose columns follow the module-level ``*_ix``
        positions. Column order of the additions: 4046 ratio, 4225 ratio,
        4770 ratio, then (optionally) volume per bag.
        """
        X = np.asarray(X)
        volume = X[:, volume_ix]
        ratio_4046 = X[:, ix_4046] / volume
        ratio_4225 = X[:, ix_4225] / volume
        ratio_4770 = X[:, ix_4770] / volume
        if not self.add_volume_per_bag:
            return np.c_[X, ratio_4046, ratio_4225, ratio_4770]
        # replaceValueMean writes into the array it is given; work on a copy so
        # the caller's data is not modified as a side effect of transform().
        X = X.copy()
        volume_per_bag = X[:, volume_ix] / replaceValueMean(X[:, bags_ix], 0)
        return np.c_[X, ratio_4046, ratio_4225, ratio_4770, volume_per_bag]

# Smoke-test the transformer on the numeric matrix the indices refer to.
attr_adder = CombinedAttributesAdder(add_volume_per_bag=False)
avocados_extra_attribs = attr_adder.transform(avocados_num.values)

In [None]:
avocados_num.describe()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, standardize,
# then append the derived ratio columns.
# NOTE(review): 'attribs_adder' runs after 'std_scaler', so the ratios are computed
# from standardized values rather than raw volumes, and the appended columns are
# themselves left unscaled — confirm this ordering is intended.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
    ('attribs_adder', CombinedAttributesAdder()),
])

# Fit on, and transform, the numeric training columns.
avocados_num_tr = num_pipeline.fit_transform(avocados_num)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of DataFrame columns and hand them on as a NumPy array.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to select, in the order they should appear in the output.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """No fitting required; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of DataFrame ``X`` as an array."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
# Column names routed to each sub-pipeline: every column of avocados_num is treated
# as numeric, and the two string columns are one-hot encoded.
num_attribs = list(avocados_num)
cat_attribs = ["type", "region"]

# Numeric branch: select -> impute (median) -> scale -> add three ratio columns -> scale again.
# NOTE(review): the ratios are computed from the output of 'std_scaler0', i.e. from
# standardized values rather than raw volumes — confirm this ordering is intended.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler0', StandardScaler()),
    ('attribs_adder', CombinedAttributesAdder(add_volume_per_bag=False)),
    ('std_scaler1', StandardScaler()),
])

# Categorical branch: select the string columns and one-hot encode them.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder()),
])

In [None]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
avocados_prepared = full_pipeline.fit_transform(avocados)

In [None]:
avocados_prepared

In [None]:
avocados_prepared.shape

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(avocados_prepared, avocados_labels)

In [None]:
some_data = avocados.iloc[:5]
some_labels = avocados_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error
avocados_predictions = lin_reg.predict(avocados_prepared)
lin_mse = mean_squared_error(avocados_labels, avocados_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(avocados_prepared, avocados_labels)

In [None]:
avocados_predictions = tree_reg.predict(avocados_prepared)
tree_mse = mean_squared_error(avocados_labels, avocados_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, avocados_prepared, avocados_labels,
                        scoring = "neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print a cross-validation score array with its mean and standard deviation."""
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model. The scorer returns negative MSE,
# hence the sign flip before taking the square root.
lin_scores = cross_val_score(lin_reg, avocados_prepared, avocados_labels,
                        scoring = "neg_mean_squared_error", cv=10)
# Use lin_scores here; `scores` holds the decision-tree results from the earlier cell.
lin_rmse_scores = np.sqrt(-lin_scores)

In [None]:
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(avocados_prepared, avocados_labels)

In [None]:
avocados_predictions = forest_reg.predict(avocados_prepared)
forest_mse = mean_squared_error(avocados_labels, avocados_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
forest_scores = cross_val_score(forest_reg, avocados_prepared, avocados_labels,
                               scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
from sklearn.svm import SVR
SVR_reg = SVR()
SVR_reg.fit(avocados_prepared, avocados_labels)

In [None]:
avocados_predictions = SVR_reg.predict(avocados_prepared)
SVR_mse = mean_squared_error(avocados_labels, avocados_predictions)
SVR_rmse = np.sqrt(SVR_mse)
SVR_rmse

In [None]:
SVR_scores = cross_val_score(SVR_reg, avocados_prepared, avocados_labels,
                               scoring="neg_mean_squared_error", cv=10)
SVR_rmse_scores = np.sqrt(-SVR_scores)
display_scores(SVR_rmse_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# Two SVR families: a linear kernel tuned over C, and an RBF kernel tuned over
# C and gamma. gamma has no effect on the linear kernel, so pairing it with
# kernel="linear" would only refit identical models.
param_grid = [
    {'kernel': ["linear"], 'C': [1,2,4]},
    {'kernel': ["rbf"], 'C': [1,2,4], 'gamma': [2,3]},
]

SVR_reg = SVR()

# 5-fold CV; the scorer is negative MSE, so larger (closer to 0) is better.
grid_search = GridSearchCV(SVR_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')

grid_search.fit(avocados_prepared, avocados_labels)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')

grid_search.fit(avocados_prepared, avocados_labels)

In [None]:
grid_search.best_params_

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Rebuild the column names of full_pipeline's output, in output order:
# numeric columns, the three ratios appended by CombinedAttributesAdder
# (add_volume_per_bag=False), then the one-hot columns of every categorical feature.
extra_attribs = ["ratio_4046", "ratio_4225", "ratio_4770"]
cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attribs = [
    category
    for categories in cat_encoder.categories_  # one array per categorical column
    for category in categories
]
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# zip() silently truncates, so make sure every importance really has a name.
assert len(attributes) == len(feature_importances), \
    "feature names do not line up with feature importances"
sorted(zip(feature_importances, attributes), reverse =True)

In [None]:
# Evaluate the best estimator found by the most recent grid search on the held-out test set.
final_model = grid_search.best_estimator_

# Separate the features from the target column.
X_test = test_set.drop("AveragePrice", axis=1)
y_test = test_set["AveragePrice"].copy()

# transform() only (no fit): reuse the preprocessing already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

# Root-mean-squared error of the predictions against the test labels.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse