# Building Rice Type Classification Model


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
data_df = pd.read_csv("rice_classification/riceClassification.csv")
data_df.head()

Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,1,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.76451,1.440796,1
1,2,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,1
2,3,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.46595,1
3,4,3073,77.033628,51.928487,0.738639,3157,62.5513,0.783529,210.657,0.870203,1.483456,1
4,5,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.51,1


In [4]:
for column in data_df.copy().columns:
    data_df[column] = data_df[column] / data_df[column].abs().max()

data_df = data_df.drop(["id"], axis = 1)

data_df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,0.444368,0.503404,0.775435,0.744658,0.424873,0.66661,0.741661,0.537029,0.844997,0.368316,1.0
1,0.281293,0.407681,0.622653,0.750489,0.273892,0.53037,0.80423,0.409661,0.919215,0.371471,1.0
2,0.298531,0.416421,0.630442,0.756341,0.28452,0.54638,0.856278,0.412994,0.959862,0.374747,1.0
3,0.300979,0.420463,0.629049,0.764024,0.286791,0.548616,0.883772,0.414262,0.961818,0.379222,1.0
4,0.361704,0.464626,0.682901,0.775033,0.345385,0.601418,0.867808,0.452954,0.966836,0.386007,1.0


## Prepare The Data

In [5]:
x = np.array(data_df.iloc[:, :-1]) # take all the rows and all the column except the last one
y = np.array(data_df.iloc[:, -1])  # take all the rows and last column only

x, y

(array([[0.44436827, 0.50340371, 0.77543522, ..., 0.5370287 , 0.844997  ,
         0.36831616],
        [0.28129285, 0.40768133, 0.62265269, ..., 0.40966075, 0.91921498,
         0.37147093],
        [0.29853085, 0.41642141, 0.63044229, ..., 0.41299402, 0.95986205,
         0.37474651],
        ...,
        [0.62340842, 0.84480035, 0.64091576, ..., 0.67304935, 0.75472018,
         0.74783024],
        [0.58374143, 0.8263563 , 0.62355087, ..., 0.67524793, 0.70210346,
         0.75187447],
        [0.60078355, 0.83554818, 0.62495614, ..., 0.6658912 , 0.74305096,
         0.7585284 ]]),
 array([1., 1., 1., ..., 0., 0., 0.]))

### Split The Dataset

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size = 0.5)

In [7]:
print(x_train.shape)
print(x_test.shape)
print(x_val.shape)

(12729, 10)
(2728, 10)
(2728, 10)


## Creating a Custom Dataset

### What’s the point?

Your raw data — CSV, images, audio, whatever — is just **dumb files**. PyTorch doesn’t magically know:
- Where the features are.
- Where the labels are.
- How to get **one** sample at a time.
- How many samples total.

So you wrap your raw data in a **Dataset class** that does 2 things:
1. `__getitem__()` → How to get **one** piece.
2. `__len__()` → How many pieces total.

### What problem does it solve?

- Makes your data *iterable* → `for batch in DataLoader`.
- Handles *indexing* → PyTorch knows how to grab data in batches.
- Works with `DataLoader` to shuffle, batch, sample, and run multi-threaded loading.

###  What if you don’t?

Without a `Dataset`, you’d have to:
- Manually slice batches.
- Shuffle by hand.
- Track indices yourself.
- Write all the loading code.

Nobody wants that. PyTorch’s `Dataset` + `DataLoader` makes life easy.

### So your Dataset is just a blueprint

- *Where’s the data?*
- *How to slice out one row?*
- *What label goes with it?*
- *How many samples you got?*

Boom. Done.

### Yes, this is real life

- For images → `ImageFolder` or custom Dataset.
- For tabular → custom Dataset.
- For text → same idea.
- For audio, medical, video → same idea.

Every serious PyTorch pipeline uses `Dataset` + `DataLoader`. It’s the backbone of the whole show.

### Bottom line

You define a custom Dataset because:
- PyTorch needs to know how to read your custom data.
- You want to batch & shuffle without headaches.
- You don’t want to reinvent the wheel every epoch.

So you build it, pass it to the `DataLoader` — your model trains happily while you sit back like a rice king. 


In [8]:
class RiceDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype = torch.float32).to(device)
        self.y = torch.tensor(y, dtype = torch.float32).to(device)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [9]:
training_data = RiceDataset(x_train, y_train)
testing_data = RiceDataset(x_test, y_test)
validation_data = RiceDataset(x_val, y_val)

## Create DataLoader

In [10]:
BATCH_SIZE = 8

training_dataloader = DataLoader(training_data, batch_size = BATCH_SIZE, shuffle = True)
testing_dataloader = DataLoader(testing_data, batch_size = BATCH_SIZE, shuffle = False)
validation_dataloader = DataLoader(validation_data, batch_size = BATCH_SIZE, shuffle = True)

In [11]:
# print a sample of the training data
x_sample, y_sample = next(iter(training_dataloader))
print(x_sample)
print("----------")
print(y_sample)

tensor([[0.6341, 0.7761, 0.7121, 0.9418, 0.6019, 0.7963, 0.6250, 0.6542, 0.8125,
         0.6183],
        [0.9042, 0.8844, 0.8872, 0.9227, 0.8599, 0.9509, 0.8617, 0.7666, 0.8438,
         0.5655],
        [0.9093, 0.8909, 0.8868, 0.9245, 0.8631, 0.9536, 0.6619, 0.7638, 0.8548,
         0.5700],
        [0.5655, 0.7925, 0.6292, 0.9659, 0.5384, 0.7520, 0.5566, 0.6492, 0.7358,
         0.7145],
        [0.6010, 0.8552, 0.6168, 0.9782, 0.5707, 0.7752, 0.7936, 0.6813, 0.7101,
         0.7867],
        [0.6373, 0.8157, 0.6827, 0.9580, 0.6096, 0.7983, 0.7332, 0.6732, 0.7712,
         0.6779],
        [0.6049, 0.8869, 0.6061, 0.9841, 0.5878, 0.7778, 0.5361, 0.7067, 0.6642,
         0.8301],
        [0.9383, 0.8961, 0.9140, 0.9187, 0.8928, 0.9687, 0.7354, 0.7825, 0.8404,
         0.5563]], device='cuda:0')
----------
tensor([0., 0., 0., 1., 1., 1., 1., 0.], device='cuda:0')


## Create The Model of `RiceClassificationModel`

In [12]:
class RiceClassificationModel(nn.Module):
    def __init__(self, input_shape: int,
                 hidden_units: int,
                 output_shape: int):
        
        super().__init__()
    
        self.layer_block = nn.Sequential(
            nn.Linear(in_features = input_shape, out_features = hidden_units),
            nn.Linear(in_features = hidden_units, out_features = output_shape),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layer_block(x)

In [13]:
# create an instance of above model class
torch.manual_seed(42)

model_0 = RiceClassificationModel(input_shape = x_sample.shape[1],
                                  hidden_units = 10,
                                  output_shape = 1).to(device)

model_0

RiceClassificationModel(
  (layer_block): Sequential(
    (0): Linear(in_features=10, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=1, bias=True)
    (2): Sigmoid()
  )
)

### Define Loss Function & Optimizer

In [14]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params = model_0.parameters(), lr = 0.001)

### Create Training Loop

In [15]:
from tqdm.auto import tqdm

epochs = 10

for epoch in tqdm(range(epochs)):
    total_train_acc = 0
    total_train_loss = 0
    total_val_loss = 0
    total_val_acc = 0

    for batch, (x, y) in enumerate(training_dataloader):
        model_0.train()

        y_pred = model_0(x).squeeze(1)
        loss = criterion(y_pred, y)
        total_train_loss = total_train_loss + loss.item()
        total_train_acc = total_train_acc + (y_pred.round() == y).sum().item() / len(y_pred)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model_0.eval()

    with torch.inference_mode():
        for batch, (x, y) in enumerate(validation_dataloader):
            y_pred_val = model_0(x).squeeze(1)

            val_loss = criterion(y_pred_val, y)
            total_val_loss = total_val_loss + loss.item()

            total_val_acc = total_val_acc + (y_pred_val.round() == y).sum().item() / len(y_pred_val)
    
    total_train_acc = total_train_acc / len(training_dataloader) * 100
    total_train_loss = total_train_loss / len(training_dataloader)
    total_val_acc = total_val_acc / len(validation_dataloader) * 100
    total_val_loss = total_val_loss / len(validation_dataloader)


    print(f"Epoch: {epoch} | Train Loss: {total_train_loss} | Train Acc: {total_train_acc} | Val Loss: {total_val_loss} | Val Acc: {total_val_acc}")

  from .autonotebook import tqdm as notebook_tqdm
 10%|█         | 1/10 [00:06<00:58,  6.49s/it]

Epoch: 0 | Train Loss: 0.3938588751994785 | Train Acc: 91.73994974874373 | Val Loss: 0.10214871913194656 | Val Acc: 98.75366568914956


 20%|██        | 2/10 [00:12<00:49,  6.19s/it]

Epoch: 1 | Train Loss: 0.0889015492642984 | Train Acc: 98.4375 | Val Loss: 0.0037377255503088236 | Val Acc: 98.71700879765396


 30%|███       | 3/10 [00:18<00:41,  5.97s/it]

Epoch: 2 | Train Loss: 0.05536768982520657 | Train Acc: 98.50816582914574 | Val Loss: 0.007483759429305792 | Val Acc: 98.75366568914956


 40%|████      | 4/10 [00:23<00:34,  5.81s/it]

Epoch: 3 | Train Loss: 0.04798024798044057 | Train Acc: 98.53172110552764 | Val Loss: 0.1311948299407959 | Val Acc: 98.71700879765396


 50%|█████     | 5/10 [00:29<00:29,  5.81s/it]

Epoch: 4 | Train Loss: 0.04509250677851261 | Train Acc: 98.56312814070351 | Val Loss: 0.00177427486050874 | Val Acc: 98.86363636363636


 60%|██████    | 6/10 [00:34<00:21,  5.40s/it]

Epoch: 5 | Train Loss: 0.043904485493986435 | Train Acc: 98.5160175879397 | Val Loss: 0.020047910511493683 | Val Acc: 98.82697947214076


 70%|███████   | 7/10 [00:39<00:16,  5.42s/it]

Epoch: 6 | Train Loss: 0.04326269539227833 | Train Acc: 98.61023869346734 | Val Loss: 2.5749537599040195e-05 | Val Acc: 98.90029325513197


 80%|████████  | 8/10 [00:44<00:10,  5.39s/it]

Epoch: 7 | Train Loss: 0.04343661444665403 | Train Acc: 98.50816582914574 | Val Loss: 0.00013566056441050023 | Val Acc: 98.86363636363636


 90%|█████████ | 9/10 [00:50<00:05,  5.29s/it]

Epoch: 8 | Train Loss: 0.04305145366274061 | Train Acc: 98.54742462311557 | Val Loss: 5.832789611304179e-07 | Val Acc: 98.90029325513197


100%|██████████| 10/10 [00:57<00:00,  5.74s/it]

Epoch: 9 | Train Loss: 0.04320090463559508 | Train Acc: 98.53957286432161 | Val Loss: 0.0018116642022505403 | Val Acc: 98.75366568914956



