In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
# Load the almond dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv("Almond.csv")

In [47]:
# Remove the "Unnamed: 0" column (the name pandas assigns to a header-less
# CSV column — presumably a saved row index; verify against the file).
# Rebinding `data` instead of using inplace=True, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [48]:
# Summary statistics for the numeric columns. The `count` row differs between
# columns, which already shows that several measurements are missing.
data.describe()

Unnamed: 0,Length (major axis),Width (minor axis),Thickness (depth),Area,Perimeter,Roundness,Solidity,Compactness,Aspect Ratio,Eccentricity,Extent,Convex hull(convex area)
count,1946.0,1861.0,1799.0,2803.0,2803.0,1946.0,2803.0,2803.0,1004.0,1004.0,2803.0,2803.0
mean,290.609274,171.025915,109.705378,26511.117374,743.86377,0.470466,0.955828,1.825233,1.753216,0.813114,0.724587,27696.218159
std,62.719433,29.916529,18.940597,13782.561344,230.632076,0.118673,0.039596,0.794058,0.206616,0.041312,0.047474,14237.34761
min,151.335266,88.050529,59.494278,6037.0,311.563489,0.173748,0.718772,1.164469,1.400082,0.699897,0.454538,6355.0
25%,245.966293,149.453659,97.091682,16211.5,571.730009,0.38481,0.944579,1.357398,1.61249,0.784476,0.701673,17088.5
50%,279.879883,170.168365,110.280136,23440.5,707.487369,0.472718,0.970422,1.576412,1.705716,0.81012,0.73372,24589.0
75%,330.508575,190.640427,121.392773,33451.0,878.89653,0.577553,0.981484,1.965953,1.833339,0.838141,0.757551,34863.25
max,515.352478,258.569794,181.8452,89282.0,1864.947387,0.697293,0.992889,9.660057,2.731251,0.930563,0.845813,90642.5


In [49]:
# Preview the first five rows, including the "Type" label column.
data.head()

Unnamed: 0,Length (major axis),Width (minor axis),Thickness (depth),Area,Perimeter,Roundness,Solidity,Compactness,Aspect Ratio,Eccentricity,Extent,Convex hull(convex area),Type
0,,227.940628,127.759132,22619.0,643.813269,,0.973384,1.458265,,,0.681193,23237.5,MAMRA
1,,234.188126,128.199509,23038.0,680.984841,,0.957304,1.601844,,,0.656353,24065.5,MAMRA
2,,229.41861,125.796547,22386.5,646.943212,,0.96727,1.487772,,,0.68362,23144.0,MAMRA
3,,232.763153,125.918808,22578.5,661.227483,,0.965512,1.540979,,,0.68536,23385.0,MAMRA
4,,230.150742,107.253448,19068.0,624.842706,,0.95145,1.629395,,,0.7148,20041.0,MAMRA


In [50]:
# Number of missing values in each column before any imputation.
data.isna().sum()

Length (major axis)          857
Width (minor axis)           942
Thickness (depth)           1004
Area                           0
Perimeter                      0
Roundness                    857
Solidity                       0
Compactness                    0
Aspect Ratio                1799
Eccentricity                1799
Extent                         0
Convex hull(convex area)       0
Type                           0
dtype: int64

In [51]:
# Distinct class labels present in the "Type" column.
data["Type"].unique()

array(['MAMRA', 'SANORA', 'REGULAR'], dtype=object)

In [53]:
# All column labels of the frame; narrowed to the non-label columns in a
# later cell before being used for imputation.
cols = data.columns

In [54]:
# Distinct almond types, in order of first appearance in the data.
al_types = data["Type"].unique()

In [56]:
# Feature columns = every column except the "Type" label.
# Derived from data.columns directly (not from the previous value of `cols`)
# so that re-running this cell does not raise KeyError on an index that has
# already had "Type" removed.
cols = data.columns.drop(["Type"])

In [57]:
# Fill each missing measurement with the mean of that column within the row's
# own almond type. groupby(...).transform("mean") returns a frame aligned to
# `data` that holds the per-type column means (NaNs are skipped when
# averaging), so one fillna replaces the column-by-type Python loops and the
# chained `data[mask][col]` indexing.
#
# NOTE(review): the fill value is selected by "Type", which is the prediction
# target used further down, and it is computed over all rows before the
# train/test split. Imputed features therefore carry label information
# (target leakage) and the reported scores are likely optimistic. Consider
# splitting first and imputing with label-free statistics fitted on the
# training rows only.
feature_cols = list(cols)
type_means = data.groupby("Type")[feature_cols].transform("mean")
data[feature_cols] = data[feature_cols].fillna(type_means)

In [58]:
# Re-count missing values to confirm the imputation left no gaps.
data.isna().sum()

Length (major axis)         0
Width (minor axis)          0
Thickness (depth)           0
Area                        0
Perimeter                   0
Roundness                   0
Solidity                    0
Compactness                 0
Aspect Ratio                0
Eccentricity                0
Extent                      0
Convex hull(convex area)    0
Type                        0
dtype: int64

In [112]:
# Class balance of the "Type" column.
# NOTE(review): the saved output lists integer codes (0/1/2) although the
# label-encoding cell appears further down, and this cell's execution count
# (112) is out of sequence — it was evidently run after "Type" had been
# encoded. On a fresh top-to-bottom run this cell would show the string
# labels instead.
data["Type"].value_counts()

Type
2    943
0    933
1    927
Name: count, dtype: int64

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder

In [70]:
# Fit an encoder on the class labels so they can be mapped to integer codes.
le = LabelEncoder()
le.fit(data["Type"])

In [74]:
# Replace the string class labels in "Type" with their integer codes.
# The dtype guard makes the cell idempotent: once "Type" is numeric, running
# the cell again would pass already-encoded integers to le.transform, which
# fails because they are not among the labels the encoder was fitted on.
if not pd.api.types.is_numeric_dtype(data["Type"]):
    data["Type"] = le.transform(data["Type"])

In [82]:
# Rank columns by the absolute value of their correlation with the encoded
# label (DataFrame.corr defaults to Pearson).
# NOTE(review): "Type" holds integer codes for unordered classes, so these
# coefficients depend on which code each class happened to receive; treat the
# ranking as a rough screen only. The two top-ranked columns are also the ones
# with the most values imputed from per-type means, which likely inflates
# their association with the label.
data.corr()["Type"].abs().sort_values(ascending=False)

Type                        1.000000
Aspect Ratio                0.741026
Eccentricity                0.739871
Roundness                   0.384091
Thickness (depth)           0.380520
Length (major axis)         0.292310
Solidity                    0.277235
Compactness                 0.209883
Extent                      0.191447
Width (minor axis)          0.142113
Perimeter                   0.123801
Convex hull(convex area)    0.034985
Area                        0.013641
Name: Type, dtype: float64

In [89]:
# Features: the two columns that rank highest in the correlation listing.
# NOTE(review): these are also the two columns with the most missing values
# (1799 each), filled earlier from per-type means — see the leakage concern
# that implies for the scores below.
X = data[["Aspect Ratio", "Eccentricity"]]
# Target: the integer-encoded almond type.
y = data["Type"]

In [90]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
# Decision tree limited to depth 8, seeded for repeatable results.
tree_cl = DecisionTreeClassifier(max_depth=8, random_state=42)
tree_cl.fit(X_train, y_train)

In [115]:
# Evaluate the decision tree on the held-out rows.
# `y_pred` is reused by the later evaluation cells, so it always holds the
# predictions of whichever model was evaluated last.
y_pred = tree_cl.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       189
           1       0.83      0.84      0.83       187
           2       0.81      0.91      0.85       185

    accuracy                           0.86       561
   macro avg       0.87      0.86      0.86       561
weighted avg       0.87      0.86      0.86       561



In [107]:
# Random forest of 500 depth-limited trees on the same two features.
# NOTE(review): an integer max_samples is an absolute row count, so each tree
# is fitted on a bootstrap sample of just 10 training rows — confirm this is
# intended rather than a fraction such as 0.1.
rf_cl = RandomForestClassifier(
    n_estimators=500,
    criterion="gini",
    max_depth=8,
    max_samples=10,
    max_features="sqrt",
    random_state=42,
)
rf_cl.fit(X_train, y_train)

In [108]:
# Evaluate the random forest on the held-out rows (overwrites `y_pred`).
y_pred = rf_cl.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.85      0.89       189
           1       0.76      0.76      0.76       187
           2       0.79      0.85      0.82       185

    accuracy                           0.82       561
   macro avg       0.83      0.82      0.82       561
weighted avg       0.83      0.82      0.82       561



In [113]:
# Logistic regression baseline; max_iter is raised to 1000 to give the solver
# more iterations to converge.
lr_cl = LogisticRegression(max_iter=1000, random_state=42)
lr_cl.fit(X_train, y_train)

In [114]:
# Evaluate the logistic regression on the held-out rows (overwrites `y_pred`).
y_pred = lr_cl.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88       189
           1       0.77      0.72      0.75       187
           2       0.78      0.87      0.82       185

    accuracy                           0.82       561
   macro avg       0.82      0.82      0.82       561
weighted avg       0.82      0.82      0.82       561

