In [1]:
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): this suppresses every warning category for the whole notebook
# (deprecations, solver convergence messages, ...). Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Column names for the headerless data file; the last column is the class label.
header=['area','perimeter','compactness','length_of_kernel','width_of_kernel',
        'asymmetry_coefficient','length_of_kernel_groove','Wheat_Kernel']

# Raw string so the Windows backslashes are taken literally; without the r-prefix
# sequences such as "\E", "\P" and "\s" are invalid escapes (SyntaxWarning on
# Python 3.12+). The resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook.
DATA_PATH = r'D:\Environments\Projects\Wheat-Kernel-Classification\experiments\seeds_dataset.txt'

# NOTE(review): if rows come out with shifted columns or NaNs, the file may contain
# repeated tab separators; sep=r'\s+' would handle that — confirm against the file.
wheat=pd.read_csv(DATA_PATH,sep='\t',
                  header=None,names=header)
wheat.head()

#### Data was read using the tab delimiter.

In [None]:
# Summary statistics for every numeric column of the loaded frame.
wheat.describe()

In [None]:
# Column dtypes and non-null counts (used below to check for missing values).
wheat.info()

#### There are no null values.

In [None]:
# One boxplot per feature column (every column except the trailing class label).
features = wheat.iloc[:, :-1]

# Explicit figure/axes interface instead of the pyplot state machine.
fig, axes = plt.subplots(2, 4, figsize=(12, 6))
axes = axes.ravel()

for ax, column in zip(axes, features.columns):
    # Pass the values as `x=` explicitly: a positional argument is interpreted as
    # `data` by current seaborn, which can flip the box orientation and leave the
    # x-axis label describing the wrong axis.
    sns.boxplot(x=features[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_xlabel(column)

# Drop the grid slots that have no feature to show, so no empty frame is drawn.
for unused_ax in axes[len(features.columns):]:
    unused_ax.remove()

fig.tight_layout()
plt.show()

#### There do not appear to be any outliers in the attributes, apart from a few in 'compactness'.

In [None]:
# Number of rows for each value of the class label.
wheat.Wheat_Kernel.value_counts()

In [None]:
# Summary statistics of every feature, computed separately per class label.
wheat.groupby('Wheat_Kernel').describe()

#### The classes are evenly distributed, so class imbalance is not an issue.

In [4]:
# Feature matrix: every column except the class label.
x = wheat.drop(columns='Wheat_Kernel')
# Target vector: the class label column.
y = wheat['Wheat_Kernel']

In [None]:
# Supervised projection of the features onto two linear discriminants.
lda = LDA(n_components=2)
# Fit on the full data set, then project the same samples.
x_lda = lda.fit(x, y).transform(x)

In [None]:
# Scatter plot of the two discriminant components, one marker/colour per class.
plt.figure(figsize=(8, 6))
markers = ['o', 's', '^']
colors = ['red', 'blue', 'green']

# Styling shared by every class.
common_style = dict(alpha=0.7, s=100, edgecolor='k')

for idx, label in enumerate(np.unique(y)):
    in_class = y == label  # boolean selector for the rows of this class
    ld1 = x_lda[in_class, 0]
    ld2 = x_lda[in_class, 1]
    plt.scatter(
        ld1,
        ld2,
        label=f"Class {label}",
        marker=markers[idx],
        color=colors[idx],
        **common_style,
    )

plt.title("LDA: Reduced to 2 Dimensions")
plt.xlabel("LD1")
plt.ylabel("LD2")
plt.legend()
plt.grid(True)
plt.show()

#### Linear Discriminant Analysis (LDA) was performed to check whether the classes are separable; LDA projects the data onto the axes that maximize the ratio of between-class to within-class variance. Classes 1 and 2 are clearly separable, while there is some overlap between classes 1 and 3.

In [None]:
# Hold out 20% of the samples as a test set. `stratify=y` keeps the class
# proportions equal in both splits; the fixed seed makes the split repeatable.
# (`train_test_split` is imported in the notebook's top import cell.)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,stratify=y,random_state=42)

In [6]:
def find_best_model(X_train, y_train, cv=5, scoring='accuracy', random_state=42):
    """Grid-search several classifiers and return the best one by CV score.

    Each candidate (logistic regression, linear SVM, decision tree, random
    forest) is tuned with ``GridSearchCV`` over its own parameter grid; the
    candidate with the highest mean cross-validation score wins. Ties keep the
    earlier candidate.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    cv : int, default 5
        Number of cross-validation folds passed to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Scoring metric passed to ``GridSearchCV``.
    random_state : int or None, default 42
        Seed for the estimators that accept one, so that repeated runs select
        the same model and parameters.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_score)``: the refitted best
        estimator, its parameter dict and its mean CV score.
    """
    models = {
        # A higher max_iter gives the solver room to converge; the default limit
        # can be hit silently when warnings are suppressed.
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=random_state),
        "SVM (Linear)": SVC(kernel='linear'),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
        "Random Forest": RandomForestClassifier(random_state=random_state)
    }

    param_grid = {
        "Logistic Regression": {
            'C': [0.01, 0.1, 1, 10]
        },
        "SVM (Linear)": {
            'C': [0.01, 0.1, 1, 10],
            'kernel': ['linear']
        },
        "Decision Tree": {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        },
        "Random Forest": {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
    }

    best_model = None
    best_params = None
    # Start below any attainable score so the first fitted candidate is always
    # accepted, whatever the sign convention of the chosen metric.
    best_score = float('-inf')

    for model_name, model in models.items():
        print(f"Tuning {model_name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid[model_name],
                                   cv=cv, scoring=scoring)
        grid_search.fit(X_train, y_train)
        # Report every candidate so the comparison is visible, not only the winner.
        print(f"  best CV score: {grid_search.best_score_:.4f} with {grid_search.best_params_}")
        if grid_search.best_score_ > best_score:
            best_score = grid_search.best_score_
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_

    print(f"Best Model: {best_model}")
    print(f"Best Parameters: {best_params}")
    print(f"Best Cross-Validation Score: {best_score}")
    return best_model, best_params, best_score

In [None]:
# Run the model search on the training split only; the test split is not passed in.
best_model, best_params, best_score = find_best_model(x_train, y_train)

In [None]:
# Linear SVM trained on the original (untransformed) training features.
# NOTE(review): hyper-parameters are hard-coded — presumably taken from the grid
# search output; confirm they still match it.
svc = SVC(C=10,kernel='linear')
svc.fit(x_train, y_train)

y_pred_svc = svc.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_svc)
# The model is fitted on the raw features, not on the LDA projection, so the
# message must not claim otherwise.
print(f"SVC test accuracy: {accuracy:.2f}")
# classification_report expects (y_true, y_pred); swapping them exchanges the
# precision and recall columns.
print(classification_report(y_test, y_pred_svc))

In [None]:
# Random forest trained on the original (untransformed) training features.
# NOTE(review): hyper-parameters are hard-coded — presumably taken from the grid
# search output; confirm they still match it.
clf = RandomForestClassifier(random_state=42,max_depth=None, min_samples_split=2,n_estimators=50)
clf.fit(x_train, y_train)

y_pred_rf = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_rf)
# The model is fitted on the raw features, not on the LDA projection, so the
# message must not claim otherwise.
print(f"Random Forest test accuracy: {accuracy:.2f}")
# classification_report expects (y_true, y_pred); swapping them exchanges the
# precision and recall columns.
print(classification_report(y_test, y_pred_rf))

In [None]:
# Use one label array for both the matrix and the tick labels. Taking the union of
# true and predicted labels keeps rows/columns and ticks aligned even if a class
# shows up only in the predictions.
class_labels = np.union1d(y_test, y_pred_svc)
cm = confusion_matrix(y_test, y_pred_svc, labels=class_labels)

# Rows are true classes, columns are predicted classes.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)

plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

#### As the LDA plot also suggested, there are a few misclassifications in which class 1 ('Kama') is predicted as 'Canadian'. Apart from that, there are no misclassifications, and the model appears to work well. Reducing the test size might increase the measured accuracy, but it risks overfitting and makes the test estimate less reliable.

In [12]:
# Serialize the fitted SVC to disk in the current working directory.
with open('svc.pkl', mode='wb') as model_file:
    pickle.dump(svc, model_file)

In [None]:
# Serialize the fitted random forest to disk in the current working directory.
with open('clf.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)