In [1]:
# Importing all needed modules.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Loading the data set.
df = pd.read_csv("B1_banana_quality.csv", sep=";")
df

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,-1.924968,0.468078,3.077832,-1.472177,0.294799,2.435570,0.271290,Good
1,-2.409751,0.486870,0.346921,-2.495099,-0.892213,2.067549,0.307325,Good
2,-0.357607,1.483176,1.568452,-2.645145,-0.647267,3.090643,1.427322,Good
3,-0.868524,1.566201,1.889605,-1.273761,-1.006278,1.873001,0.477862,Good
4,0.651825,1.319199,-0.022459,-1.209709,-1.430692,1.078345,2.812442,Good
...,...,...,...,...,...,...,...,...
7995,-6.414403,0.723565,1.134953,2.952763,0.297928,-0.156946,2.398091,Bad
7996,0.851143,-2.217875,-2.812175,0.489249,-1.323410,-2.316883,2.113136,Bad
7997,1.422722,-1.907665,-2.532364,0.964976,-0.562375,-1.834765,0.697361,Bad
7998,-2.131904,-2.742600,-1.008029,2.126946,-0.802632,-3.580266,0.423569,Bad


In [3]:
# Testing is the data set has missing values.
df.isna().sum()

Size           1
Weight         1
Sweetness      3
Softness       2
HarvestTime    2
Ripeness       1
Acidity        1
Quality        1
dtype: int64

In [4]:
# The data set has several missing values. Because the number of the missing values is very small relative to the total number of
# sample in the data set we can drop the rows that are having missing values.

In [5]:
df = df.dropna()

In [6]:
df

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,-1.924968,0.468078,3.077832,-1.472177,0.294799,2.435570,0.271290,Good
1,-2.409751,0.486870,0.346921,-2.495099,-0.892213,2.067549,0.307325,Good
2,-0.357607,1.483176,1.568452,-2.645145,-0.647267,3.090643,1.427322,Good
3,-0.868524,1.566201,1.889605,-1.273761,-1.006278,1.873001,0.477862,Good
4,0.651825,1.319199,-0.022459,-1.209709,-1.430692,1.078345,2.812442,Good
...,...,...,...,...,...,...,...,...
7995,-6.414403,0.723565,1.134953,2.952763,0.297928,-0.156946,2.398091,Bad
7996,0.851143,-2.217875,-2.812175,0.489249,-1.323410,-2.316883,2.113136,Bad
7997,1.422722,-1.907665,-2.532364,0.964976,-0.562375,-1.834765,0.697361,Bad
7998,-2.131904,-2.742600,-1.008029,2.126946,-0.802632,-3.580266,0.423569,Bad


In [7]:
# As we can see wwe lost only 11 rows with is an acceptable lost.

In [8]:
# Next I want to explore the distribution of classes in the data set.
df['Quality'].value_counts()

Good    3995
Bad     3993
Name: Quality, dtype: int64

In [9]:
# The disctribution of classes in the target column is near equal, so we can count it as a balanced data set.

In [10]:
# To better understand the distribution of the variables I will compute the basic statistics on this data set.

In [11]:
describ = df.describe()
describ.loc["skew"] = df.iloc[:, :-1].skew().values
describ.loc["kurtosis"] = df.iloc[:, :-1].kurtosis().values
describ

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity
count,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0,7988.0
mean,-0.746942,-0.764751,-0.730504,-0.008473,-0.750805,0.779346,0.006944
std,2.136756,2.014797,4.305708,2.119499,1.997833,2.11439,2.294252
min,-7.998074,-8.283002,-6.434022,-14.693974,-7.570008,-7.423155,-8.226977
25%,-2.276735,-2.228365,-2.108123,-1.588367,-2.120838,-0.575289,-1.630664
50%,-0.896477,-0.872182,-1.021734,0.205098,-0.934192,0.963899,0.09409
75%,0.65719,0.768668,0.302133,1.548266,0.511743,2.260935,1.680768
max,7.9708,5.679692,342.46957,40.17425,6.29328,7.249034,7.411633
skew,0.265798,0.046207,63.471837,0.627724,0.277654,-0.311801,-0.151561
kurtosis,-0.161692,-0.523459,5054.846358,15.827336,-0.08353,-0.087592,-0.306771


In [12]:
# From the table above we can conclude the following:
# 1 - All variables are having different ranges.
# 2 - Moltiple columns are highly skewed.
# 3 - We will have to normalize data before passing it to an algotithm.

In [13]:
# Next I would like to create the correlation matrix, but before I will have to create a temporary numerical column for the Quality column.

In [14]:
df["quality_num_column"] = df["Quality"].map({"Bad" : 0, "Good" : 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["quality_num_column"] = df["Quality"].map({"Bad" : 0, "Good" : 1})


In [15]:
df

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality,quality_num_column
0,-1.924968,0.468078,3.077832,-1.472177,0.294799,2.435570,0.271290,Good,1
1,-2.409751,0.486870,0.346921,-2.495099,-0.892213,2.067549,0.307325,Good,1
2,-0.357607,1.483176,1.568452,-2.645145,-0.647267,3.090643,1.427322,Good,1
3,-0.868524,1.566201,1.889605,-1.273761,-1.006278,1.873001,0.477862,Good,1
4,0.651825,1.319199,-0.022459,-1.209709,-1.430692,1.078345,2.812442,Good,1
...,...,...,...,...,...,...,...,...,...
7995,-6.414403,0.723565,1.134953,2.952763,0.297928,-0.156946,2.398091,Bad,0
7996,0.851143,-2.217875,-2.812175,0.489249,-1.323410,-2.316883,2.113136,Bad,0
7997,1.422722,-1.907665,-2.532364,0.964976,-0.562375,-1.834765,0.697361,Bad,0
7998,-2.131904,-2.742600,-1.008029,2.126946,-0.802632,-3.580266,0.423569,Bad,0


In [16]:
df.drop(["Quality"], axis=1).corr()

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,quality_num_column
Size,1.0,-0.181695,-0.119518,0.161756,0.58207,0.042856,-0.140649,0.353773
Weight,-0.181695,1.0,0.202464,-0.180343,-0.079406,-0.036229,0.44355,0.388175
Sweetness,-0.119518,0.202464,1.0,-0.043703,-0.09866,0.077603,0.080757,0.180123
Softness,0.161756,-0.180343,-0.043703,1.0,0.189806,-0.248649,-0.140784,0.000795
HarvestTime,0.58207,-0.079406,-0.09866,0.189806,1.0,0.107216,-0.090867,0.37731
Ripeness,0.042856,-0.036229,0.077603,-0.248649,0.107216,1.0,-0.352497,0.350545
Acidity,-0.140649,0.44355,0.080757,-0.140784,-0.090867,-0.352497,1.0,-0.001654
quality_num_column,0.353773,0.388175,0.180123,0.000795,0.37731,0.350545,-0.001654,1.0


In [17]:
# The correlation table evidentiates only some possible relationships, like the possitiove correalation between "Size" and "HarvestTime".
# But in rest the correaltions are preety low, so the is no multicolinearity.

In [18]:
# Also I see 0 correlation between the "Softness" columne and the "Quality" one. We can remove it from the data set.

In [19]:
df = df.drop(["Softness"], axis=1)

In [20]:
# Now we can proceed to training ML models.
# But before we have to convert our data into numpy array format, split it into train and test and normalize it.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [21]:
X = df.drop(["Quality", "quality_num_column"], axis=1).values
y = df["quality_num_column"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [23]:
scaler = StandardScaler()
X_train_tr = scaler.fit_transform(X_train)
X_test_tr = scaler.transform(X_test)

In [24]:
# Now we can train a couple of models.
# Importing models.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Now we will have to import some metrics from classifition.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

def try_model(X_train, X_test, y_train, y_test, model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    roc_auc = roc_auc_score(y_pred, y_test)

    print(model_name)
    print(f"Accuracy: {accuracy}")
    print(f"ROC AUC: {roc_auc}")
    print(classification_report(y_pred, y_test))
    return accuracy, roc_auc

In [38]:
# Creating the models:
logit = LogisticRegression()
svm = SVC()
gauss = GaussianNB()
bernoulli = BernoulliNB()
decision_tree = DecisionTreeClassifier()
extra_tree = ExtraTreeClassifier()
forest = RandomForestClassifier()

In [40]:
# Defining the models dictionary:
all_models = [
    {"name" : "logit", "model" : logit},
    {"name" : "SVM", "model" : svm},
    {"name" : "Gauss", "model" : gauss},
    {"name" : "bernoulli", "model" : bernoulli},
    {"name" : "decision_tree", "model" : decision_tree},
    {"name" : "extra_tree", "model" : extra_tree},
    {"name" : "forest", "model" : forest}
]

In [41]:
for i in range(len(all_models)):
    accuracy, auc_score = try_model(X_train_tr, X_test_tr, y_train, y_test, all_models[i]["model"], all_models[i]["name"])
    all_models[i]["accuracy"] = accuracy
    all_models[i]["auc_score"] = auc_score

logit
Accuracy: 0.8693039559339009
ROC AUC: 0.8692251329920706
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       972
           1       0.87      0.87      0.87      1025

    accuracy                           0.87      1997
   macro avg       0.87      0.87      0.87      1997
weighted avg       0.87      0.87      0.87      1997

SVM
Accuracy: 0.9414121181772659
ROC AUC: 0.9416687768037653
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       956
           1       0.95      0.94      0.94      1041

    accuracy                           0.94      1997
   macro avg       0.94      0.94      0.94      1997
weighted avg       0.94      0.94      0.94      1997

Gauss
Accuracy: 0.8302453680520782
ROC AUC: 0.8430887122982564
              precision    recall  f1-score   support

           0       0.93      0.77      0.84      1174
           1       0.74      0.92      0.82       

In [42]:
all_models

[{'name': 'logit',
  'model': LogisticRegression(),
  'accuracy': 0.8693039559339009,
  'auc_score': 0.8692251329920706},
 {'name': 'SVM',
  'model': SVC(),
  'accuracy': 0.9414121181772659,
  'auc_score': 0.9416687768037653},
 {'name': 'Gauss',
  'model': GaussianNB(),
  'accuracy': 0.8302453680520782,
  'auc_score': 0.8430887122982564},
 {'name': 'bernoulli',
  'model': BernoulliNB(),
  'accuracy': 0.7636454682023035,
  'auc_score': 0.7645987088114675},
 {'name': 'decision_tree',
  'model': DecisionTreeClassifier(),
  'accuracy': 0.8918377566349525,
  'auc_score': 0.8918442490586779},
 {'name': 'extra_tree',
  'model': ExtraTreeClassifier(),
  'accuracy': 0.872308462694041,
  'auc_score': 0.8722584760808119},
 {'name': 'forest',
  'model': RandomForestClassifier(),
  'accuracy': 0.9334001001502253,
  'auc_score': 0.9334436806814828}]

In [43]:
df = pd.DataFrame.from_records(all_models, columns=["name", "accuracy", "auc_score"])

In [45]:
df

Unnamed: 0,name,accuracy,auc_score
0,logit,0.869304,0.869225
1,SVM,0.941412,0.941669
2,Gauss,0.830245,0.843089
3,bernoulli,0.763645,0.764599
4,decision_tree,0.891838,0.891844
5,extra_tree,0.872308,0.872258
6,forest,0.9334,0.933444


In [46]:
from sklearn.pipeline import Pipeline

In [47]:
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("SVM", SVC())
    ]
)

In [49]:
pipe.fit(X_train, y_train)

In [50]:
y_pred = pipe.predict(X_test)

In [51]:
accuracy_score(y_pred, y_test)

0.9414121181772659

In [52]:
import pickle

In [53]:
pickle.dump(pipe, open("pipe.pkl", "wb"))