### Features of the dataset
A_id - Unique identifier for each fruit \
Size - Size of the fruit \
Weight - Weight of the fruit \
Sweetness - Sweetness of the fruit \
Crunchiness - Texture indicating the crunchiness of the fruit \
Juiciness - Level of juiciness of the fruit \
Ripeness - Stage of ripeness of the fruit \
Acidity - Acidity level of the fruit \
Quality - Overall quality of the fruit

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
# Load the dataset and keep only the measurement columns plus the target
# (A_id is a row identifier, so it is not carried forward).
apple = pd.read_csv("data/apple_quality.csv")
apple = apple[['Size', 'Weight', 'Sweetness', 'Crunchiness',
               'Juiciness', 'Ripeness', 'Acidity', 'Quality']].copy()

# Encode the target: bad -> 0, good -> 1.
# `map` produces a numeric column directly instead of relying on `replace`
# silently downcasting an object column (behaviour pandas has deprecated), so the
# dtype of Quality no longer depends on the installed pandas version.
# Any value other than "bad"/"good" (including missing ones) becomes NaN.
apple["Quality"] = apple["Quality"].map({"bad": 0, "good": 1})

# Later cells cast Acidity with astype(float), which indicates the column is read
# as text -- presumably because of the author-credit row dropped further down
# (TODO confirm against the CSV). Coerce it once here so every cell sees numbers;
# non-numeric entries become NaN and are removed by the later dropna().
apple["Acidity"] = pd.to_numeric(apple["Acidity"], errors="coerce")


In [None]:
colors = ["darkred", "red"]

# Count plot of the two Quality classes, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=apple, x='Quality', hue='Quality', palette=colors, ax=ax)

# Label every bar with its count. bar_label works per bar container and formats
# the heights with "%g", so counts read "2004" rather than "2004.0", and only
# actual bars (not every patch attached to the Axes) are labelled.
for container in ax.containers:
    ax.bar_label(container, padding=3)

ax.set_title("Quality Distribution")
ax.set_xlabel("Quality")
ax.set_ylabel("Count")
plt.show()


In [None]:
colors = ["darkred", "red"]

# One histogram (with KDE overlay) per column, split by Quality.
# The grid is sized from the number of columns instead of a fixed 4x4 layout,
# so no large empty area is left in the figure.
n_grid_cols = 4
n_grid_rows = (len(apple.columns) + n_grid_cols - 1) // n_grid_cols  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 10), squeeze=False)
hist_axes = axes.ravel()

for hist_ax, col in zip(hist_axes, apple.columns):
    sns.histplot(data=apple, x=col, kde=True, hue="Quality", palette=colors, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {col} Data")

# Hide grid cells that have no column to show
for unused_ax in hist_axes[len(apple.columns):]:
    unused_ax.set_visible(False)

# Lay out once, after all panels exist (previously recomputed on every iteration)
fig.tight_layout()
plt.show()

In [None]:
features = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']

# Row 4000 contains the dataset author's info rather than a measurement, so drop it.
# errors="ignore" keeps the cell re-runnable: a second execution would otherwise
# raise KeyError because the label is already gone.
apple = apple.drop(4000, errors="ignore")

# Numeric view of the feature columns. Acidity may still be stored as text at this
# point (later cells cast it with astype(float)); coercing here makes sure the
# scatter plots use a continuous axis instead of one tick per distinct string.
feature_values = apple[features].apply(pd.to_numeric, errors="coerce")

# Pairwise Pearson correlations between the features
correlation_matrix = feature_values.corr()

# One scatter plot for every unordered pair of features
for i, x_feature in enumerate(features):
    for y_feature in features[i + 1:]:
        plt.figure(figsize=(12, 10))
        plt.scatter(x=feature_values[x_feature], y=feature_values[y_feature], color='red')
        plt.xlabel(x_feature)
        plt.ylabel(y_feature)
        plt.title(f"{x_feature} vs {y_feature}")
        plt.tight_layout()
        plt.show()

In [None]:
# Pearson correlations across all columns (Quality included), shown as a heatmap
# of the lower triangle only.
correlation_matrix_df = apple.corr(method='pearson')
fig, heatmap_ax = plt.subplots(figsize=(12, 6))
# True on and above the diagonal -> those cells are hidden by the heatmap mask
mask = np.triu(np.ones(correlation_matrix_df.shape, dtype=bool))
sns.heatmap(correlation_matrix_df, mask=mask, cmap='Reds', annot=True, ax=heatmap_ax)

In [None]:
# Pairwise relationships between the columns, coloured by Quality.
# corner=True draws only the lower triangle of the grid.
pair_grid = sns.pairplot(data=apple, hue='Quality', corner=True, palette='Reds')
plt.show()

Summary:

Size, Sweetness and Juiciness are relatively strongly positively correlated with good quality. The correlation coefficients may look small because Quality is a binary variable.\
Ripeness is relatively strongly negatively correlated with good quality.

### Classification using GaussianNB

In [None]:
# Use Gaussian Naive Bayes for datasets with quantitative variables
clf = GaussianNB()

# Rows with missing values cannot be used for fitting, so remove them first.
apple = apple.dropna()

feature_columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness',
                   'Juiciness', 'Ripeness', 'Acidity']
X_train, X_test, y_train, y_test = train_test_split(
    apple[feature_columns].values,
    apple.Quality,
    test_size=0.25,
    random_state=42,  # fixed seed: the metrics below are identical on every run
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F-score / support, ordered as labels=[1, 0]
p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[1, 0])

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# scikit-learn's confusion_matrix puts the ACTUAL classes on the rows and the
# PREDICTED classes on the columns, so the row index must carry the "actual"
# labels (they were previously swapped with the "predicted" ones).
# labels=[0, 1] pins the row/column order to match the label lists below.
mgnb = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_rows = ['actual 0', 'actual 1']
cm_columns = ['predicted 0', 'predicted 1']
df_nb = pd.DataFrame(mgnb, index=cm_rows, columns=cm_columns)
sns.heatmap(df_nb, annot=True, fmt='d', cmap='Reds')

### Classification using KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k = 5, fitted on the train/test split that an earlier
# cell left in X_train / X_test / y_train / y_test.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision, recall, F-score and support, ordered as labels=[1, 0]
knn_metrics = precision_recall_fscore_support(y_test, y_pred, labels=[1, 0])
p, r, f, s = knn_metrics
print(*knn_metrics)
print(classification_report(y_pred=y_pred, y_true=y_test))

In [None]:
# Confusion matrix for the predictions currently held in y_pred.
# scikit-learn's confusion_matrix puts the ACTUAL classes on the rows and the
# PREDICTED classes on the columns, so the row index must carry the "actual"
# labels (they were previously swapped with the "predicted" ones).
# labels=[0, 1] pins the row/column order to match the label lists below.
# NOTE: the names mgnb / df_nb are kept from the Naive Bayes cell for compatibility.
mgnb = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_rows = ['actual 0', 'actual 1']
cm_columns = ['predicted 0', 'predicted 1']
df_nb = pd.DataFrame(mgnb, index=cm_rows, columns=cm_columns)
sns.heatmap(df_nb, annot=True, fmt='d', cmap='Reds')

### Finding out which attribute performs worst (on average) in terms of precision with a Naive Bayes classifier


In [None]:
attributes = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
precision_scores = {}
times_to_sample = 10

# How well does predicting with a single attribute perform?
# precision_scores maps each attribute to a list with one entry per repetition;
# every entry is the precision array ordered as [label 0, label 1]
# (the order is fixed by labels=[0, 1] below).
for attribute in attributes:
    scores = []
    for split_seed in range(times_to_sample):
        classifier = GaussianNB()
        # Seeding with the repetition number makes the experiment reproducible and
        # evaluates every attribute on the same set of train/test splits.
        X_train, X_test, y_train, y_test = train_test_split(
            apple[[attribute]].values, apple.Quality,
            test_size=0.25, random_state=split_seed)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])
        scores.append(p)
    precision_scores[attribute] = scores

# Average over the repetitions: axis=0 collapses the runs and leaves one mean
# per label, i.e. [mean precision label 0, mean precision label 1].
mean_precisions = [np.mean(precision_scores[attribute], axis=0) for attribute in attributes]
precision_averages_label_zero = [float(means[0]) for means in mean_precisions]
precision_averages_label_one = [float(means[1]) for means in mean_precisions]

# Position (within `attributes`) of the lowest average precision for each label
min_index_label_zero = int(np.argmin(precision_averages_label_zero))
min_index_label_one = int(np.argmin(precision_averages_label_one))

print(f"The worst precision for label zero was: {attributes[min_index_label_zero]}")
print(f"The worst precision for label one was: {attributes[min_index_label_one]}")


In [None]:
# precision_scores[attr] holds one entry per resampling run, and each entry is the
# pair [precision for label 0, precision for label 1]. For the boxplots we need,
# per attribute, ONE label's precision across ALL runs -- so index the label inside
# each run (indexing precision_scores[attr][0] would pick the first run's pair instead).
precision_lists_label_zero = [[run[0] for run in precision_scores[attr]] for attr in attributes]
precision_lists_label_one = [[run[1] for run in precision_scores[attr]] for attr in attributes]

# Create boxplots for precision scores of label zero and label one
for label_name, precision_lists in (("Zero", precision_lists_label_zero),
                                    ("One", precision_lists_label_one)):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(precision_lists, boxprops=dict(color='red'))
    # Boxes are drawn at positions 1..N; name the ticks explicitly rather than via
    # boxplot's `labels` argument, which newer matplotlib releases deprecate.
    ax.set_xticks(range(1, len(attributes) + 1))
    ax.set_xticklabels(attributes, rotation=45)
    ax.set_title(f'Precision Scores for Label {label_name} by Attribute')
    ax.set_xlabel('Attribute')
    ax.set_ylabel('Precision')
    plt.show()


### Acidity was typically the worst when used with the Naive Bayes Classifier. Does our precision increase if we predict without acidity?

In [None]:
# Now let's try with a few (but not all) attributes
clf = GaussianNB()

# with acidity (using random_state = 42 for consistency between samples)
X_train, X_test, y_train, y_test = train_test_split(apple[['Size', 'Weight','Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness']].values,
                                                    apple.Quality,test_size=0.25, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
p,r,f,s = precision_recall_fscore_support(y_test, y_pred, labels=[1,0])
print(f"With Acidity \t | Precision: {p} Recall: {r} F-Score: {f} Support: {s}\n")

# without acidity (using random_state = 42 for consistency between samples)
X_train, X_test, y_train, y_test = train_test_split(apple[['Size', 'Weight','Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']].values,
                                                    apple.Quality,test_size=0.25, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
p,r,f,s = precision_recall_fscore_support(y_test, y_pred, labels=[1,0])
print(f"W/out Acidity \t | Precision: {p} Recall: {r} F-Score: {f} Support: {s}")

### How does Ripeness correlate with the other attributes?


In [None]:
from scipy import stats

# Ripeness vs. Sweetness: fit the regression line on a 25% sample and plot a fifth
# of those rows (5% of the data) as points to keep the figure readable.
# The samples are seeded so the figure is reproducible, and the plotted points are
# a subset of the rows the line was fitted on (previously two unrelated random
# samples were overlaid, and regplot drew its own points on top).
regression_sample = apple.sample(frac=0.25, random_state=42)
scatter_sample = regression_sample.sample(frac=0.2, random_state=42)

fig, ax = plt.subplots(figsize=(10, 4))
sns.scatterplot(data=scatter_sample, x="Ripeness", y="Sweetness", color='red', ax=ax)
sns.regplot(data=regression_sample, x='Ripeness', y='Sweetness', color='red',
            scatter=False, ax=ax)
ax.set_title("Ripeness vs. Sweetness")
plt.show()

# correlation coefficients of ripeness compared to other attributes
complete_rows = apple.dropna()  # computed once instead of once per argument
compared_attributes = ['Sweetness', 'Juiciness', 'Crunchiness', 'Size', 'Weight', 'Acidity']

correlation_records = []
for attribute in compared_attributes:
    # astype(float) matters for Acidity, which may be stored as text; it is a no-op
    # for columns that are already floating point.
    r_value, p_value = stats.pearsonr(complete_rows.Ripeness,
                                      complete_rows[attribute].astype(float))
    correlation_records.append({"pearson_r": r_value, "p_value": p_value})

# One labelled table instead of six anonymous result objects
ripeness_correlations = pd.DataFrame(correlation_records, index=compared_attributes)
display(ripeness_correlations)

### Bad apples vs. good apples by above-average values for each attribute

In [None]:
# Subsets of apples whose value for the given attribute is above zero.
# "large" requires both Weight and Size to be above zero.
large = apple[(apple.Weight > 0) & (apple.Size > 0)]
sweet_apples = apple[apple.Sweetness > 0]
crunchy_apples = apple[apple.Crunchiness > 0]
juicy_apples = apple[apple.Juiciness > 0]
ripe_apples = apple[apple.Ripeness > 0]
acidic_apples = apple[apple.Acidity.astype('float') > 0]

# Draw one bad-vs-good count plot per subset, in the order listed here.
subsets_with_titles = [
    (large, 'Count of Bad vs. Good Apples (Large/Heavy)'),
    (sweet_apples, 'Count of Bad vs. Good Apples (Sweet Apples)'),
    (crunchy_apples, 'Count of Bad vs. Good Apples (Crunchy Apples)'),
    (juicy_apples, 'Count of Bad vs. Good Apples (Juicy Apples)'),
    (ripe_apples, 'Count of Bad vs. Good Apples (Ripe Apples)'),
    (acidic_apples, 'Count of Bad vs. Good Apples (Acidic Apples)'),
]
for subset, plot_title in subsets_with_titles:
    sns.countplot(x='Quality', data=subset, hue='Quality', palette='Reds')
    plt.title(plot_title)
    plt.show()

