In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sb
from scipy.io import arff
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data, meta = arff.loadarff('dryBean.arff')

In [3]:
df = pd.DataFrame(data)
df['Class'].value_counts()

Class
b'DERMASON'    3546
b'SIRA'        2636
b'SEKER'       2027
b'HOROZ'       1928
b'CALI'        1630
b'BARBUNYA'    1322
b'BOMBAY'       522
Name: count, dtype: int64

In [4]:
x = df['Class'].str.decode("utf-8")
x.convert_dtypes(convert_string=True)

0           SEKER
1           SEKER
2           SEKER
3           SEKER
4           SEKER
           ...   
13606    DERMASON
13607    DERMASON
13608    DERMASON
13609    DERMASON
13610    DERMASON
Name: Class, Length: 13611, dtype: string

In [5]:
df['Class'] = x

In [6]:
df['Class'] = df['Class'].convert_dtypes(convert_string=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13611 non-null  float64
 1   Perimeter        13611 non-null  float64
 2   MajorAxisLength  13611 non-null  float64
 3   MinorAxisLength  13611 non-null  float64
 4   AspectRation     13611 non-null  float64
 5   Eccentricity     13611 non-null  float64
 6   ConvexArea       13611 non-null  float64
 7   EquivDiameter    13611 non-null  float64
 8   Extent           13611 non-null  float64
 9   Solidity         13611 non-null  float64
 10  roundness        13611 non-null  float64
 11  Compactness      13611 non-null  float64
 12  ShapeFactor1     13611 non-null  float64
 13  ShapeFactor2     13611 non-null  float64
 14  ShapeFactor3     13611 non-null  float64
 15  ShapeFactor4     13611 non-null  float64
 16  Class            13611 non-null  string 
dtypes: float64(1

In [7]:
dct = {}
for c in df.columns:
    dct[c] = c.lower()

df = df.rename(columns=dct)

dummied = pd.get_dummies(df)

In [8]:
dummied

Unnamed: 0,area,perimeter,majoraxislength,minoraxislength,aspectration,eccentricity,convexarea,equivdiameter,extent,solidity,...,shapefactor2,shapefactor3,shapefactor4,class_BARBUNYA,class_BOMBAY,class_CALI,class_DERMASON,class_HOROZ,class_SEKER,class_SIRA
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,...,0.003147,0.834222,0.998724,False,False,False,False,False,True,False
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272750,0.783968,0.984986,...,0.003564,0.909851,0.998430,False,False,False,False,False,True,False
2,29380.0,624.110,212.826130,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,...,0.003048,0.825871,0.999066,False,False,False,False,False,True,False
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,...,0.003215,0.861794,0.994199,False,False,False,False,False,True,False
4,30140.0,620.134,201.847882,190.279279,1.060798,0.333680,30417.0,195.896503,0.773098,0.990893,...,0.003665,0.941900,0.999166,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,...,0.001749,0.642988,0.998385,False,False,False,True,False,False,False
13607,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,231.526798,0.799943,0.990752,...,0.001886,0.676099,0.998219,False,False,False,True,False,False,False
13608,42139.0,759.321,281.539928,191.187979,1.472582,0.734065,42569.0,231.631261,0.729932,0.989899,...,0.001888,0.676884,0.996767,False,False,False,True,False,False,False
13609,42147.0,763.779,283.382636,190.275731,1.489326,0.741055,42667.0,231.653248,0.705389,0.987813,...,0.001852,0.668237,0.995222,False,False,False,True,False,False,False


In [9]:
dummied.columns

Index(['area', 'perimeter', 'majoraxislength', 'minoraxislength',
       'aspectration', 'eccentricity', 'convexarea', 'equivdiameter', 'extent',
       'solidity', 'roundness', 'compactness', 'shapefactor1', 'shapefactor2',
       'shapefactor3', 'shapefactor4', 'class_BARBUNYA', 'class_BOMBAY',
       'class_CALI', 'class_DERMASON', 'class_HOROZ', 'class_SEKER',
       'class_SIRA'],
      dtype='object')

In [10]:
Xcols = ['area', 'perimeter', 'majoraxislength', 'minoraxislength',
       'aspectration', 'eccentricity', 'convexarea', 'equivdiameter', 'extent',
       'solidity', 'roundness', 'compactness', 'shapefactor1', 'shapefactor2',
       'shapefactor3', 'shapefactor4']
Ycols = ['class_BARBUNYA', 'class_BOMBAY',
       'class_CALI', 'class_DERMASON', 'class_HOROZ', 'class_SEKER',
       'class_SIRA']

In [11]:
X = dummied[Xcols]
Y = dummied[Ycols].astype(int)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [13]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, Y_train)

In [14]:
preds = model.predict(X_test)

In [15]:
currACC = accuracy_score(Y_test, preds)

In [16]:
# THIS DISPLAYS THE ACCURACY USING KNN
print(f"The current accuracy is {currACC:.4f} or {(currACC*100):.2f}%")

The current accuracy is 0.7101 or 71.01%


In [17]:
params = {
    'n_estimators': [1, 10, 100],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 2, 4]
}
rfModel = RandomForestClassifier()
gridModel = GridSearchCV(estimator=rfModel, param_grid=params)

In [18]:
gridModel.fit(X_train, Y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [19]:
best = gridModel.best_estimator_

In [20]:
preds = best.predict(X_test)

In [21]:
# THIS DISPLAYS THE ACCURACY USING RANDOM FOREST
print(classification_report(Y_test, preds))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       395
           1       1.00      1.00      1.00       161
           2       0.90      0.90      0.90       479
           3       0.91      0.92      0.91      1043
           4       0.98      0.93      0.95       588
           5       0.96      0.92      0.94       619
           6       0.87      0.87      0.87       799

   micro avg       0.92      0.90      0.91      4084
   macro avg       0.93      0.91      0.92      4084
weighted avg       0.92      0.90      0.91      4084
 samples avg       0.90      0.90      0.90      4084



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
