In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the avocado price table from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/avocado-prices/avocado.csv')

In [None]:
# Preview the first five rows of the freshly loaded table.
data.head(n=5)

**Let us remove this unnamed column and proceed.**

In [None]:
# Remove the 'Unnamed: 0' column. Reassigning (rather than inplace=True) and
# passing errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first five rows to confirm the current column layout.
data.head(n=5)

**Let us rename the columns for better understanding.**

In [None]:
# Give the numeric-code columns descriptive names. Only column labels are
# touched; the previous call also passed index=str, which converted every row
# label to a string as an unintended side effect.
data = data.rename(
    columns={
        "4046": "Small Hass",
        "4225": "Large Hass",
        "4770": "XLarge Hass",
    }
)

**The Hass avocado is a cultivar of avocado with dark green–colored, bumpy skin. It was first grown and sold by Southern 
California mail carrier and amateur horticulturist Rudolph Hass, who also gave it his name.
The Hass avocado is a large-sized fruit weighing 200 to 300 grams. When ripe, the skin becomes a dark purplish-black and
yields to gentle pressure. When ready to serve, it becomes white-green in the middle part of the inner fruit.
Owing to its taste, size, shelf-life, high growing yield and in some areas, year-round harvesting, the Hass cultivar is 
the most commercially popular avocado worldwide. In the United States it accounts for more than 80% of the avocado crop,
95% of the California crop and is the most widely grown avocado in New Zealand.**

**Converting format of Date column:**



In [None]:
# Convert the Date column to datetimes, then order the rows chronologically.
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(by='Date', ascending=True, inplace=True)
data.head()

**Null Values:**

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Heatmap of the missing-value mask; row tick labels are hidden.
missing_mask = data.isna()
sns.heatmap(missing_mask, yticklabels=False)

In [None]:
# Joint distribution of Large Bags (x axis) against Small Bags (y axis).
sns.jointplot(data=data, x='Large Bags', y='Small Bags', color='lime')
plt.show()

In [None]:
# Joint distribution of XLarge Bags (x axis) against Large Bags (y axis).
sns.jointplot(data=data, x='XLarge Bags', y='Large Bags', color='cyan')
plt.show()

**Total Volume vs Average Price**

In [None]:
# Scatter plot of AveragePrice against Total Volume.
data.plot.scatter(x="Total Volume", y="AveragePrice", color='red')
plt.show()

In [None]:
# Distribution of AveragePrice, drawn on an explicitly created 12x5 axes.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Distribution Price")
ax = sns.distplot(data["AveragePrice"], color='lime', ax=ax)
plt.show()

**Let's visualize the features using boxplots:**

In [None]:
# Box plots of the DataFrame's columns, with the boxes grouped by region.
data.boxplot(figsize=(12, 12), by="region")
plt.show()

**Average Sales of Avocados by Month**

In [None]:
# Monthly period of each observation. The vectorised .dt accessor replaces a
# row-by-row apply(lambda ...) that built one pd.Period per row.
data['Month'] = data['Date'].dt.to_period('M')

In [None]:
# Mean Total Volume for every calendar month present in the data.
avg_monthly_sales = data.groupby('Month')['Total Volume'].mean()
avg_monthly_sales

In [None]:
# Ten-bin histogram (no KDE curve) of the monthly mean volumes.
sns.distplot(avg_monthly_sales, kde=False, bins=10, color='blue')
plt.show()

In [None]:
# Point plot of AveragePrice for each region, split by avocado type.
plt.figure(figsize=(12, 20))
sns.set_style('whitegrid')
sns.pointplot(x='AveragePrice', y='region', data=data, hue='type', join=False)
plt.xticks(np.linspace(1, 2, 5))
# The x axis carries AveragePrice and the y axis carries region; the labels
# were previously swapped.
plt.xlabel('AveragePrice', {'fontsize': 'large'})
plt.ylabel('region', {'fontsize': 'large'})
plt.title("Type Average Price in Each Region", {'fontsize': 20})
plt.show()

In [None]:
pip install plotly

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
from plotly import tools


In [None]:
# Total Volume summed per avocado type.
typeof = data.groupby('type')['Total Volume'].sum()

In [None]:
# Pie chart of total volume for the two avocado types.
labels = ['conventional', 'organic']
values = [typeof[label] for label in labels]
trace = go.Pie(labels=labels, values=values)
py.iplot([trace])

**Total Volume and Average Price based on Region**

In [None]:
# Scatter of Total Volume vs AveragePrice, coloured by region.
# FacetGrid's `size` argument was renamed to `height`; current seaborn rejects
# the old name with a TypeError.
(
    sns.FacetGrid(data, hue="region", height=5)
    .map(plt.scatter, "Total Volume", "AveragePrice")
    .add_legend()
)
plt.show()

**KDE Plot**

In [None]:
# Kernel density estimate of AveragePrice, one curve per region.
# FacetGrid's `size` argument was renamed to `height`; current seaborn rejects
# the old name with a TypeError.
(
    sns.FacetGrid(data, hue="region", height=8)
    .map(sns.kdeplot, "AveragePrice")
    .add_legend()
)
plt.show()

**Pairplot**

In [None]:
# Pairwise relationships coloured by region. `size` was renamed to `height`
# in seaborn; the old keyword is no longer accepted by current releases.
sns.pairplot(data, hue='region', height=3)

In [None]:
# Correlation matrix of the numeric columns only. The frame also holds string,
# datetime and Period columns, and DataFrame.corr() on pandas >= 2.0 raises
# instead of silently dropping non-numeric columns.
plt.figure(figsize=(12, 6))
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='Greens', annot=True)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column in the label slice 'Small Hass' .. 'XLarge Bags'
# and write the scaled values back into the same columns of `data`.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split, so test rows influence the scaling statistics - confirm this is acceptable.
scaler = StandardScaler()
unscaled_block = data.loc[:, 'Small Hass':'XLarge Bags']
data.loc[:, 'Small Hass':'XLarge Bags'] = scaler.fit_transform(unscaled_block)
data.head()

In [None]:
# Target: log(1 + AveragePrice). Features: every other column of `data`.
y = np.log1p(data['AveragePrice'])
X = data.drop(columns=['AveragePrice'])

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
categorical_columns = ["type", "region"]
X_categorical = pd.get_dummies(X[categorical_columns], drop_first=True)

In [None]:
# Numeric predictors kept for the model.
numerical_columns = [
    "Small Hass",
    "Large Hass",
    "XLarge Hass",
    "Small Bags",
    "Large Bags",
    "XLarge Bags",
]
X_numerical = X[numerical_columns]

In [None]:
# Final design matrix: dummy columns first, then the numeric columns, side by side.
X = pd.concat([X_categorical, X_numerical], axis="columns")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Report R^2 on both splits. The label previously had a typo and did not say
# that the score came from the training data.
print("R2 of Linear Regression (train):", lr.score(X_train, y_train))
print("R2 of Linear Regression (test):", lr.score(X_test, y_test))

In [None]:
from sklearn import metrics

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error on the test split (y is log1p-transformed AveragePrice).
metrics.mean_absolute_error(y_true=y_test, y_pred=lr.predict(X_test))

In [None]:
# Mean squared error on the test split (y is log1p-transformed AveragePrice).
metrics.mean_squared_error(y_true=y_test, y_pred=lr.predict(X_test))

In [None]:
# Root mean squared error on the test split: square root of the test MSE.
test_mse = metrics.mean_squared_error(y_test, lr.predict(X_test))
np.sqrt(test_mse)

In [None]:
import statsmodels.api as sm

# Build the statsmodels design matrix in its own variable instead of
# overwriting X_train: X_train then keeps the same columns as X_test, so the
# scikit-learn fit/predict cells stay re-runnable after this one.
# The cast to float is needed because bool dummy columns (pandas >= 2.0) make
# the frame convert to an object array, which OLS rejects.
X_train_const = sm.add_constant(X_train.astype(float))
model = sm.OLS(y_train, X_train_const).fit()
print(model.summary())

**LASSO and RIDGE Regressions**

In [None]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths: 20 log-spaced values from 1e-5 to 1e3.
alphas = np.logspace(-5, 3, 20)
alpha_grid = {'alpha': alphas}

# Ridge: 10-fold cross-validated search over alpha.
clf = GridSearchCV(estimator=linear_model.Ridge(), param_grid=alpha_grid, cv=10)
clf.fit(X_train, y_train)
optlamGSCV_R = clf.best_params_['alpha']
print('Optimum regularization parameter (Ridge):', optlamGSCV_R)

# Lasso: same search; `clf` is rebound to the Lasso search object.
clf = GridSearchCV(estimator=linear_model.Lasso(), param_grid=alpha_grid, cv=10)
clf.fit(X_train, y_train)
optlamGSCV_L = clf.best_params_['alpha']
print('Optimum regularization parameter (Lasso):', optlamGSCV_L)

**To be continued :)**