In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Dataset file is expected in the working directory; adjust the name if yours differs.
df = pd.read_csv("milknew.csv")

# Quick sanity checks, printed in order: first rows, missing values per
# column, and how many samples fall in each grade.
for summary in (df.head(), df.isnull().sum(), df["Grade"].value_counts()):
    print(summary)


    pH  Temperature  Taste  Odor  Fat  Turbidity  Colour   Grade
0  6.6           35      1     0    1          0     254    high
1  6.6           36      0     1    0          1     253    high
2  8.5           70      1     1    1          1     246     low
3  9.5           34      1     1    0          1     255     low
4  6.6           37      0     0    0          0     255  medium
pH             0
Temperature    0
Taste          0
Odor           0
Fat            0
Turbidity      0
Colour         0
Grade          0
dtype: int64
Grade
low       429
medium    374
high      256
Name: count, dtype: int64


In [3]:
# Encoder used below to turn the string values of the Grade column into integer class ids.
le = LabelEncoder()

In [4]:
# Encode the string grades as integer class ids; le.classes_ keeps the
# original label names so reports can show them later.
# The guard makes this cell safe to re-run: refitting on the already-encoded
# column would replace le.classes_ with the integers and lose the label names.
# (If the encoder itself is re-created, reload the data before encoding again.)
if not pd.api.types.is_integer_dtype(df["Grade"]):
    df["Grade"] = le.fit_transform(df["Grade"])
df["Grade"]

0       0
1       0
2       1
3       1
4       2
       ..
1054    2
1055    0
1056    1
1057    0
1058    1
Name: Grade, Length: 1059, dtype: int32

In [5]:
# Features are every column except the target; the target is the Grade column.
target_col = "Grade"
X = df.drop(columns=[target_col])
y = df[target_col]

In [6]:
# Hold out 20% of the rows for testing. Splitting is stratified on y and uses
# a fixed seed so the same partition is produced on every run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Baseline model: a random forest with default hyperparameters and a fixed seed,
# trained on the standardized training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train_scaled, y=y_train)


In [9]:
# Predict on the held-out split, then summarize the errors two ways.
y_pred = clf.predict(X_test_scaled)

cm = confusion_matrix(y_test, y_pred)
# target_names maps the integer class ids back to the original grade labels.
report = classification_report(y_test, y_pred, target_names=le.classes_)

print("Confusion Matrix:", cm, sep="\n")
print("\nClassification Report:", report, sep="\n")


Confusion Matrix:
[[50  1  0]
 [ 0 86  0]
 [ 0  0 75]]

Classification Report:
              precision    recall  f1-score   support

        high       1.00      0.98      0.99        51
         low       0.99      1.00      0.99        86
      medium       1.00      1.00      1.00        75

    accuracy                           1.00       212
   macro avg       1.00      0.99      0.99       212
weighted avg       1.00      1.00      1.00       212



In [10]:
# Compare accuracy on the data the model was fit on against the held-out data;
# a large gap between the two would point to overfitting.
train_acc = clf.score(X=X_train_scaled, y=y_train)
test_acc = clf.score(X=X_test_scaled, y=y_test)

print(
    f"Train Accuracy: {train_acc:.4f}",
    f"Test Accuracy : {test_acc:.4f}",
    sep="\n",
)


Train Accuracy: 1.0000
Test Accuracy : 0.9953


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

# Baseline comparison: fit several classifier families on the same scaled
# training split and print each one's accuracy on the held-out split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM (RBF Kernel)": SVC(kernel='rbf'),
    "Naive Bayes": GaussianNB(),
    # Seeded like the random forest so the reported score is reproducible.
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    acc = model.score(X_test_scaled, y_test)
    print(f"{name}: Test Accuracy = {acc:.4f}")


Logistic Regression: Test Accuracy = 0.8302
SVM (RBF Kernel): Test Accuracy = 0.9057
Naive Bayes: Test Accuracy = 0.8538
Gradient Boosting: Test Accuracy = 0.9953


In [12]:
# Regularized random forest: limiting tree depth and requiring more samples per
# split/leaf constrains how closely each tree can fit the training data.
# RandomForestClassifier is already imported in the first cell, so it is not
# re-imported here.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

clf.fit(X_train_scaled, y_train)

# Same train-vs-test comparison as for the default forest.
train_acc = clf.score(X_train_scaled, y_train)
test_acc = clf.score(X_test_scaled, y_test)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy : {test_acc:.4f}")



Train Accuracy: 0.9976
Test Accuracy : 0.9906


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate the scaler and classifier together: every other model in this
# notebook is trained on standardized features, so the CV estimate should be
# too. Putting the scaler inside the pipeline means it is refit on each fold's
# training portion only. cross_val_score works on clones, so clf is not modified.
cv_model = make_pipeline(StandardScaler(), clf)
scores = cross_val_score(cv_model, X, y, cv=5)
print(f"Cross-Validated Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")


Cross-Validated Accuracy: 0.9962 ± 0.0046
