# **Fruit Classification Modelling**

In [None]:
# Setup

import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import plotly.figure_factory as ff


In [18]:
fruit_data = pd.read_csv('../data/fruit_classification_dataset.csv')

fruit_data.head()

# This is quite a word heavy dataset, not as ideal because of the lack of numerical features.
display(fruit_data.info())
display(fruit_data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   size (cm)      10000 non-null  float64
 1   shape          10000 non-null  object 
 2   weight (g)     10000 non-null  float64
 3   avg_price (₹)  10000 non-null  float64
 4   color          10000 non-null  object 
 5   taste          10000 non-null  object 
 6   fruit_name     10000 non-null  object 
dtypes: float64(3), object(4)
memory usage: 547.0+ KB


None

size (cm)        0
shape            0
weight (g)       0
avg_price (₹)    0
color            0
taste            0
fruit_name       0
dtype: int64

In [None]:
# Lets get into visualization ASAP: Starting with the distributions of numerical features

for col in ['size (cm)', 'weight (g)', 'avg_price (₹)']:
    fig = px.histogram(fruit_data, x=col, nbins=30, color='fruit_name', barmode='overlay',
                       title=f"Distribution of {col}")
    fig.show()


In [21]:
# Correlations heatmap, to see how numerical features relate to each other

corr = fruit_data[['size (cm)', 'weight (g)', 'avg_price (₹)']].corr()
fig = px.imshow(corr, text_auto=True, color_continuous_scale='blues',
                title='Correlation Heatmap')
fig.show()

In [22]:
# Average Price per Fruit

fig = px.bar(fruit_data, x='fruit_name', y='avg_price (₹)', color='fruit_name',
             title='Average Price per Fruit', 
             hover_data=['size (cm)', 'weight (g)'])
fig.show()

In [23]:
# Scatter Plot (Size vs Weight)
fig = px.scatter(fruit_data, x='size (cm)', y='weight (g)', color='fruit_name',
                 size='avg_price (₹)', hover_data=['color', 'taste'],
                 title='Size vs Weight (bubble = price)')
fig.show()

In [24]:
# Taste vs Color Distribution
fig = px.histogram(fruit_data, x='taste', color='color', barmode='group',
                   title='Taste Distribution by Color')
fig.show()

In [28]:

# Lets try to encode the categorical features and build a simple model. and because of the broadness of the names, i'll just drop it down to label encoding.
encoded_fruits_data = fruit_data.copy()

X = encoded_fruits_data.drop(columns=['fruit_name'])
y = encoded_fruits_data['fruit_name']

le = LabelEncoder()
le_y = LabelEncoder()

y_encoded = le_y.fit_transform(y)

X_encoded = pd.get_dummies(X, columns=['shape', 'color', 'taste'], drop_first=True)


X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=69)

In [29]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))

Random Forest Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        96
           2       1.00      1.00      1.00       104
           3       1.00      1.00      1.00        74
           4       1.00      1.00      1.00       104
           5       1.00      1.00      1.00        91
           6       1.00      1.00      1.00       104
           7       1.00      1.00      1.00       101
           8       1.00      1.00      1.00        98
           9       1.00      1.00      1.00       103
          10       1.00      1.00      1.00        93
          11       1.00      1.00      1.00       123
          12       1.00      1.00      1.00        97
          13       1.00      1.00      1.00       100
          14       1.00      1.00      1.00       111
          15       1.00      1.00      1.00        98
          16       1.00      1.00      1.00       107
    

**Damn that's accurate** <br>
huh........ the dataset has a very weighted determinant, let see if it holds up tho

In [33]:
# Let's get a Confusion Matrix out there
cm = confusion_matrix(y_test, y_pred_rf)
fig = ff.create_annotated_heatmap(
    z=cm, 
    x=le_y.classes_.tolist(), 
    y=le_y.classes_.tolist(),
    colorscale='Blues', showscale=True
)
fig.update_layout(title="Random Forest Confusion Matrix")
fig.show()


_damn huh....._

### **In conclusion this is a strange dataset to say the least and maybe its my encoding that created some data ranking issues.**