In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
import numpy as np
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import classification_report
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the avocado dataset from a CSV in the working directory and preview the first rows.
df=pd.read_csv('avocado.csv')
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Remove the CSV's leftover index column and the raw date column.
unused_columns = ['Unnamed: 0', 'Date']
df = df.drop(columns=unused_columns)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

We can see there are no null values, so imputation is not required.

There are 2 features with the object data type (type, region), so we need to encode them. The year feature is encoded as well: it can affect the price, but it does not need to stay in YYYY format.

In [None]:
# Number of rows per avocado type, smallest count first.
print(df['type'].value_counts().sort_values())

In [None]:
# Number of rows per region, smallest count first.
print(df['region'].value_counts().sort_values())

Using LabelEncoder for both of them would be best

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the categorical columns into integer codes.
lab_enc=LabelEncoder()

In [None]:
# Integer-encode the categorical columns. Each column gets its own fitted
# LabelEncoder, kept in `encoders`, so integer codes can later be mapped back to
# the original labels with encoders[col].inverse_transform(...). A single shared
# encoder only remembers the classes of the last column it was fitted on.
encoders = {}
for col in ['region', 'type', 'year']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Keep the names the previous one-cell-per-column version left bound.
lab_enc = encoders['year']
data = df['year'].to_numpy()
df.head()

In [None]:
# Distribution of every column: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement for this histogram-plus-KDE view.
plt.figure(figsize=(20,15),facecolor='yellow')
for plotno, column in enumerate(df.columns[:13], start=1):
    # The 3x5 grid has 15 slots; at most 13 columns are drawn.
    ax = plt.subplot(3, 5, plotno)
    sns.histplot(x=df[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column, fontsize=20)
plt.tight_layout()

# Case 1 - Find the region

In [None]:
# Case 1: the encoded region is the target; every other column is a candidate feature.
x, y = df.drop(columns='region'), df['region']

In [None]:
# One scatter panel per candidate feature against the region code
# (5x4 grid, at most 18 panels).
plt.figure(figsize=(20, 15), facecolor='white')
for plotnumber, column in enumerate(x.columns, start=1):
    if plotnumber > 18:
        break
    ax = plt.subplot(5, 4, plotnumber)
    plt.scatter(x[column], y)
    plt.xlabel(column, fontsize=20)
    plt.ylabel('region', fontsize=20)
plt.tight_layout()

In [None]:
# Absolute pairwise correlations between all columns, drawn as an annotated heatmap.
df_corr = df.corr().abs()
plt.figure(figsize=(15, 11))
sns.heatmap(df_corr, annot=True, annot_kws={'size': 10})
# plt.show must be called; the bare attribute only evaluated to the function object.
plt.show()

From the heatmap we can see that region is only weakly correlated with year and type.

In [None]:
# Strip plot of each candidate feature (vertical axis) per region code (horizontal axis).
plt.figure(figsize=(15,15))
for plotno, column in enumerate(x.columns[:13], start=1):
    ax = plt.subplot(4, 4, plotno)
    # x/y are passed by keyword: recent seaborn releases accept only `data`
    # positionally, so the positional form raises TypeError there.
    sns.stripplot(x=y, y=x[column], ax=ax)
plt.show()

In [None]:
# Box plot of every column to eyeball outliers (5x5 grid, at most 18 panels).
df_features = df
plt.figure(figsize=(20, 20), facecolor='red')
for plotno, column in enumerate(df_features.columns, start=1):
    if plotno > 18:
        break
    ax = plt.subplot(5, 5, plotno)
    sns.boxplot(data=df_features[column])
    plt.xlabel(column, fontsize=12)
plt.show()

In [None]:
# (rows, columns) before outlier removal.
df.shape

In [None]:
# Summary statistics for every column.
df.describe()

In [None]:
# Per-column first/third quartiles and the interquartile range.
q1, q3 = (df.quantile(level) for level in (0.25, 0.75))
iqr = q3.sub(q1)

In [None]:
# Upper outlier limit for AveragePrice (Q3 + 1.5 * IQR) and the positions of rows above it.
AvgPrice = q3['AveragePrice'] + 1.5 * iqr['AveragePrice']
index = np.where(df['AveragePrice'].gt(AvgPrice))

In [None]:
# Remove rows whose AveragePrice exceeds the upper limit `AvgPrice`.
# The mask is built from the current frame rather than from stored row positions,
# so re-running this cell removes nothing further. Dropping by saved positions
# deleted a different set of rows on every re-run.
df = df.loc[~(df['AveragePrice'] > AvgPrice)]
df.shape

In [None]:
# Same box-plot grid as before, redrawn on the filtered frame (5x5 grid, at most 18 panels).
df_features = df
plt.figure(figsize=(20, 20), facecolor='red')
for plotno, column in enumerate(df_features.columns, start=1):
    if plotno > 18:
        break
    ax = plt.subplot(5, 5, plotno)
    sns.boxplot(data=df_features[column])
    plt.xlabel(column, fontsize=12)
plt.show()

For the other features, the outliers make up a large share of the data, so removing them would harm the dataset.

In [None]:
# Rows were dropped above, leaving gaps in the index. reset_index returns a new
# frame, so the result has to be assigned. drop=True keeps the old index from
# being added as an extra 'index' column.
df = df.reset_index(drop=True)
df.head()

In [None]:
# One line plot per column against the encoded region: the same plots, in the
# same order, as the former one-cell-per-column version.
region_plot_columns = ['Total Volume', 'AveragePrice', '4046', '4225', '4770',
                       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
                       'type', 'year']
for column in region_plot_columns:
    sns.lineplot(x='region', y=column, data=df)
    plt.show()  # show inside the loop so each column gets its own figure

From these graphs and the heatmap we can see that year and type have very little relation to region, so they can be excluded from the prediction model.

In [None]:
# Region stays the target; year and type are left out of the candidate features.
excluded_columns = ['region', 'year', 'type']
x = df.drop(columns=excluded_columns)
y = df['region']

In [None]:
from sklearn.feature_selection import SelectKBest,f_classif

In [None]:
# Score every candidate feature against the region with f_classif
# and list the seven highest-scoring ones.
best_features = SelectKBest(score_func=f_classif, k=9)
fit = best_features.fit(x, y)

data_columns = pd.DataFrame(x.columns)
data_scores = pd.DataFrame(fit.scores_)

# Same two-column table as concatenating the frames side by side and renaming.
feature_scores = pd.DataFrame({'Feature_Name': data_columns[0],
                               'Score': data_scores[0]})

print(feature_scores.nlargest(7,'Score'))

Based on the line plots, the heatmap, and the SelectKBest scores, these 7 features are best suited to predicting region.

In [None]:
# Final Case 1 design matrix: the seven selected features, in this exact order.
region_features = ['Total Volume', '4225', 'Small Bags', '4046',
                   'Total Bags', 'Large Bags', '4770']
x = df.loc[:, region_features]
y = df.loc[:, 'region']

In [None]:
# Fit the standardiser on the selected features, then apply it to the same frame.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Variance inflation factor for every scaled feature column.
n_features = x_scaled.shape[1]
vif = pd.DataFrame({
    'vif': [variance_inflation_factor(x_scaled, i) for i in range(n_features)],
    'Features': x.columns,
})
vif

Using the most commonly applied VIF threshold of 10, we are clear of multicollinearity.

In [None]:
# Hold out 17% of the rows for testing; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.17,random_state=333)

In [None]:
def metric_score(clf,x_train,x_test,y_train,y_test,train=True):
    """Print the accuracy of a fitted classifier on one data split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    x_train, x_test : feature matrices of the training and test split.
    y_train, y_test : true labels matching ``x_train`` / ``x_test``.
    train : if truthy, report on the training split; otherwise on the test split.

    Returns
    -------
    None. The result is printed.
    """
    if train:
        label, features, truth = 'Train', x_train, y_train
    else:
        # Plain else: with `elif train==False`, a falsy non-bool such as None
        # matched neither branch and the call silently printed nothing.
        label, features, truth = 'Test', x_test, y_test

    y_pred = clf.predict(features)
    print(f'\n----{label} result----')
    print(f'Accuracy Score:{accuracy_score(truth,y_pred)*100:.2f}%')

In [None]:
# k-nearest-neighbours baseline with the library's default settings.
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)

In [None]:
# Report accuracy on the training split first, then on the held-out split.
for use_train_split in (True, False):
    metric_score(knn, x_train, x_test, y_train, y_test, train=use_train_split)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings, fitted on the training split.
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)

In [None]:
# Report accuracy on the training split first, then on the held-out split.
for use_train_split in (True, False):
    metric_score(log_reg, x_train, x_test, y_train, y_test, train=use_train_split)

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble of six 9-nearest-neighbour classifiers. Each member is trained
# on a bootstrap sample sized at 75% of the training rows; oob_score=True asks for
# an out-of-bag score, and random_state fixes the sampling.
bag_knn=BaggingClassifier(KNeighborsClassifier(n_neighbors=9),
                          n_estimators=6,max_samples=0.75,
                          bootstrap=True,random_state=366,oob_score=True)

In [None]:
# Fit the bagged KNN ensemble on the training split.
bag_knn.fit(x_train,y_train)

In [None]:
# Report accuracy on the training split first, then on the held-out split.
for use_train_split in (True, False):
    metric_score(bag_knn, x_train, x_test, y_train, y_test, train=use_train_split)

Bagging did not improve accuracy score.

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor

In [None]:
# Random forest with default hyperparameters. A fixed random_state (the same seed
# as the train/test split) makes the fitted model, its scores and the pickled
# artefact reproducible across Restart & Run All.
rnd_clf = RandomForestClassifier(random_state=333)
rnd_clf.fit(x_train, y_train)

In [None]:
# Report accuracy on the training split first, then on the held-out split.
for use_train_split in (True, False):
    metric_score(rnd_clf, x_train, x_test, y_train, y_test, train=use_train_split)

In [None]:
# Persist the classifier used for region prediction. This cell used to pickle
# `reg`, which is not defined until the Case 2 section, so it raised NameError on
# a fresh Restart & Run All. The `with` block also closes the file handle.
# NOTE(review): the file name 'knn' is kept for compatibility even though the
# saved object is the random forest; confirm which model and name are intended.
with open('knn', 'wb') as model_file:
    pickle.dump(rnd_clf, model_file)

Random Forest is giving best results, so we'll use this for Prediction

In [None]:
# Predict the region code for one hand-entered sample. The values must follow the
# column order the scaler was fitted on: Total Volume, 4225, Small Bags, 4046,
# Total Bags, Large Bags, 4770. The printed value is the integer region code.
print('Region:',rnd_clf.predict(scaler.transform([[17074.83,1529.20,13066.82,2046.96,13498.67,431.85,0.00]])))

# Case 2 - Find Average Price

In [None]:
# Case 2: AveragePrice is the target; every other column is a candidate feature.
x, y = df.drop(columns='AveragePrice'), df['AveragePrice']

In [None]:
# One scatter panel per candidate feature against AveragePrice
# (5x4 grid, at most 18 panels).
plt.figure(figsize=(20, 15), facecolor='white')
for plotnumber, column in enumerate(x.columns, start=1):
    if plotnumber > 18:
        break
    ax = plt.subplot(5, 4, plotnumber)
    plt.scatter(x[column], y)
    plt.xlabel(column, fontsize=20)
    plt.ylabel('AveragePrice', fontsize=20)
plt.tight_layout()

In [None]:
# Absolute pairwise correlations between all columns, drawn as an annotated heatmap.
df_corr = df.corr().abs()
plt.figure(figsize=(15, 11))
sns.heatmap(df_corr, annot=True, annot_kws={'size': 10})
# plt.show must be called; the bare attribute only evaluated to the function object.
plt.show()

From the scatter plots and the heatmap, we can see that region and year are only weakly related to price.

We can also see Type has strongest bond

In [None]:
# Preview only the first rows of the candidate features instead of rendering the whole frame.
x.head()

In [None]:
# One line plot per column against AveragePrice: the same plots, in the same
# order, as the former one-cell-per-column version.
price_plot_columns = ['region', 'Total Volume', '4046', '4225', 'type', 'year']
for column in price_plot_columns:
    sns.lineplot(x='AveragePrice', y=column, data=df)
    plt.show()  # show inside the loop so each column gets its own figure

In [None]:
# AveragePrice stays the target; year and region are left out of the candidate features.
excluded_columns = ['AveragePrice', 'year', 'region']
x = df.drop(columns=excluded_columns)
y = df['AveragePrice']

In [None]:
# Standardise the candidate features for the VIF check below.
scal = StandardScaler().fit(x)
x_scaled = scal.transform(x)

In [None]:
# Variance inflation factor for every scaled candidate column.
n_features = x_scaled.shape[1]
vif = pd.DataFrame({
    'vif': [variance_inflation_factor(x_scaled, i) for i in range(n_features)],
    'Features': x.columns,
})
vif

The VIF values are within the threshold.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# AveragePrice is a continuous target, so the features are scored with the
# regression F-test. f_classif is meant for class labels and would treat every
# distinct price as its own class.
# NOTE(review): re-check the hard-coded feature list used further down against
# the ranking these scores produce.
from sklearn.feature_selection import f_regression

bestf = SelectKBest(score_func=f_regression, k=9)
fit = bestf.fit(x, y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(x.columns)

In [None]:
# Pair each feature name with its score and list the seven highest-scoring features.
feat_score = pd.DataFrame({'Feature_Name': df_columns[0],
                           'score': df_scores[0]})
print(feat_score.nlargest(7,'score'))

In [None]:
# Final Case 2 design matrix (seven features, in this order) and a one-column target frame.
price_features = ['type', '4046', 'Total Volume', '4770',
                  '4225', 'Total Bags', 'Small Bags']
x = df.loc[:, price_features]
y = df.loc[:, ['AveragePrice']]

In [None]:
# Standardise the Case 2 features. This rebinds `scaler`, replacing the scaler
# object that was fitted for Case 1.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Hold out 12% of the rows for testing; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.12,random_state=44)

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline fitted on the training split.
reg=LinearRegression()
reg.fit(x_train,y_train)

In [None]:
# Score of the linear model on the training split.
reg.score(x_train,y_train)

In [None]:
# Score of the linear model on the held-out split.
reg.score(x_test,y_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Lasso,Ridge,RidgeCV,LassoCV

In [None]:
# Cross-validated search for the Lasso penalty strength.
# The `normalize` argument was removed from scikit-learn's linear models in 1.2
# and makes this constructor raise TypeError there. The inputs are already
# standardised by StandardScaler. `alphas` is left at its default, which is what
# alphas=None selected.
lasscv = LassoCV(max_iter=200)
lasscv.fit(x_train, y_train)

In [None]:
# Penalty strength selected by the cross-validated Lasso search.
alpha=lasscv.alpha_
alpha

In [None]:
# Refit a plain Lasso with the selected penalty on the training split.
lasso_reg=Lasso(alpha)
lasso_reg.fit(x_train,y_train)

In [None]:
# Score of the Lasso model on the held-out split.
lasso_reg.score(x_test,y_test)

In [None]:
# Cross-validated search for the Ridge penalty over 0.002, 0.022, ..., 0.082.
# The `normalize` argument was removed from scikit-learn's linear models in 1.2
# and makes this constructor raise TypeError there. The inputs are already
# standardised by StandardScaler.
ridgecv = RidgeCV(alphas=np.arange(0.002, 0.1, 0.02))
ridgecv.fit(x_train, y_train)

In [None]:
# Penalty strength selected by the cross-validated Ridge search.
ridgecv.alpha_

In [None]:
# Refit a plain Ridge model with the selected penalty on the training split.
ridge_model=Ridge(alpha=ridgecv.alpha_)
ridge_model.fit(x_train,y_train)

In [None]:
# Score of the Ridge model on the held-out split.
ridge_model.score(x_test,y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the search configured in the next cell.
# NOTE(review): these keys (criterion 'gini'/'entropy', max_depth, min_samples_*,
# max_leaf_nodes) look like decision-tree classifier settings — confirm which
# estimator this grid is meant for.
grid_param={'criterion':['gini','entropy'],
           'max_depth':range(10,15),
           'min_samples_leaf':range(2,6),
           'min_samples_split':range(3,8),
           'max_leaf_nodes':range(5,10)}

In [None]:
# 5-fold grid search over `grid_param`, using all available cores (n_jobs=-1).
# NOTE(review): `reg` is the LinearRegression fitted earlier, and the keys in
# `grid_param` do not look like LinearRegression parameters, so fitting this
# search is expected to fail. It is never fitted in the visible cells — confirm
# the intended estimator or remove the search.
grid_search = GridSearchCV(estimator=reg,
                           param_grid=grid_param,
                           cv=5,
                           n_jobs=-1)

In [None]:
# Persist the fitted linear regression model. The `with` block guarantees the
# file handle is flushed and closed; the bare open(...) call left it open.
with open('Avocado', 'wb') as model_file:
    pickle.dump(reg, model_file)