In [1]:
!pip install opendatasets --quiet
import opendatasets as od
od.download("https://www.kaggle.com/datasets/mssmartypants/rice-type-classification")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: tanmay01bhatt
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/mssmartypants/rice-type-classification
Downloading rice-type-classification.zip to ./rice-type-classification


100%|██████████| 888k/888k [00:00<00:00, 341MB/s]







In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("/content/rice-type-classification/riceClassification.csv")
df.head()

Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,1,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.76451,1.440796,1
1,2,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,1
2,3,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.46595,1
3,4,3073,77.033628,51.928487,0.738639,3157,62.5513,0.783529,210.657,0.870203,1.483456,1
4,5,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.51,1


# **Null Values**

In [4]:
df.isnull().sum()

Unnamed: 0,0
id,0
Area,0
MajorAxisLength,0
MinorAxisLength,0
Eccentricity,0
ConvexArea,0
EquivDiameter,0
Extent,0
Perimeter,0
Roundness,0


In [5]:
df.drop("id",axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.76451,1.440796,1
1,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,1
2,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.46595,1
3,3073,77.033628,51.928487,0.738639,3157,62.5513,0.783529,210.657,0.870203,1.483456,1
4,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.51,1


total number of classes

In [7]:
df['Class'].unique()

array([1, 0])

# Normalizing The values

There are 3 ways to normalize values:

1.   Max-Abs Normaliation(MaxAbsScaler) = used manually in this (instead of using sklearn )
2.   Min-Max Normalization(MinMaxScaler)
3.   Z-Score Standardization (StandardScaler)


In [8]:
# saving the org dataset before normalizing the values wthin a given range : [-1,1] in this case
org_data = df.copy()

In [9]:
y = df['Area'].abs().max()
print(y)

10210


In [10]:
for col in df.columns :
  df[col] = df[col]/df[col].abs().max()

In [11]:
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,0.444368,0.503404,0.775435,0.744658,0.424873,0.66661,0.741661,0.537029,0.844997,0.368316,1.0
1,0.281293,0.407681,0.622653,0.750489,0.273892,0.53037,0.80423,0.409661,0.919215,0.371471,1.0
2,0.298531,0.416421,0.630442,0.756341,0.28452,0.54638,0.856278,0.412994,0.959862,0.374747,1.0
3,0.300979,0.420463,0.629049,0.764024,0.286791,0.548616,0.883772,0.414262,0.961818,0.379222,1.0
4,0.361704,0.464626,0.682901,0.775033,0.345385,0.601418,0.867808,0.452954,0.966836,0.386007,1.0


In [12]:
df['Area'].max()

1.0

# Splitting The Data

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [15]:
# Step 1: Split off 30% for temp (val + test), keep 70% for training
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 2: Split temp (30%) into 15% val and 15% test → so split 50% of temp for each
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [16]:
print("Training set is: ", X_train.shape[0], " rows which is ", round(X_train.shape[0]/df.shape[0],4)*100, "%") # Print training shape
print("Validation set is: ",X_val.shape[0], " rows which is ", round(X_val.shape[0]/df.shape[0],4)*100, "%") # Print validation shape
print("Testing set is: ",X_test.shape[0], " rows which is ", round(X_test.shape[0]/df.shape[0],4)*100, "%")

Training set is:  12729  rows which is  70.0 %
Validation set is:  2728  rows which is  15.0 %
Testing set is:  2728  rows which is  15.0 %


# Dataset Object

In [17]:
import torch
import torch.nn as nn

In [18]:
from torch.utils.data import Dataset, DataLoader

Dataset / Dataloader = manage and load data efficiently

In [19]:
class CustomDataset(Dataset):
  def __init__(self,X,y):
    self.X = torch.tensor(X,dtype=torch.float32)  # numpy to torch obj
    self.y = torch.tensor(y,dtype=torch.float32)  # numpy to torch obj

  def __len__(self):  # len of input
    return len(self.X)

  def __getitem__(self,index):  # full row = inp + out of 1 row
    return self.X[index],self.y[index]

create pytorch objects for train,val,test

In [20]:
train_data = CustomDataset(X_train,y_train)
val_data = CustomDataset(X_val,y_val)
test_data = CustomDataset(X_test,y_test)

In [21]:
train_data[0]

(tensor([0.8953, 0.8896, 0.8711, 0.9282, 0.8403, 0.9462, 0.6924, 0.7492, 0.8749,
         0.5794]),
 tensor(0.))

# DataLoader

loads data into batches which allows us to loop through the batches easily during the training

In [40]:
train_dataloader = DataLoader(train_data,batch_size=32,shuffle=True)
val_dataloader = DataLoader(val_data,batch_size=32,shuffle=True)
test_dataloader = DataLoader(test_data,batch_size=32,shuffle=True)

In [41]:
for x,y in train_dataloader:
  print(x,y)
  break
# batch size=32 means it loads 32 rows in 1 batch

tensor([[0.7465, 0.8618, 0.7543, 0.9505, 0.7094, 0.8640, 0.5886, 0.7204, 0.7889,
         0.6482],
        [0.6957, 0.9069, 0.6682, 0.9757, 0.6628, 0.8341, 0.5759, 0.7261, 0.7237,
         0.7700],
        [0.8192, 0.8512, 0.8403, 0.9264, 0.7806, 0.9051, 0.7067, 0.7287, 0.8461,
         0.5747],
        [0.8911, 0.8647, 0.8957, 0.9148, 0.8477, 0.9440, 0.6557, 0.7464, 0.8771,
         0.5477],
        [0.4182, 0.7147, 0.5085, 0.9798, 0.4010, 0.6467, 0.7944, 0.5688, 0.7089,
         0.7974],
        [0.8377, 0.8810, 0.8289, 0.9368, 0.8015, 0.9153, 0.8199, 0.7529, 0.8104,
         0.6030],
        [0.5896, 0.8514, 0.6022, 0.9804, 0.5594, 0.7679, 0.5148, 0.6760, 0.7076,
         0.8021],
        [0.6330, 0.8266, 0.6709, 0.9627, 0.6044, 0.7956, 0.5978, 0.6816, 0.7472,
         0.6990],
        [0.6466, 0.8317, 0.6821, 0.9612, 0.6154, 0.8041, 0.8710, 0.6866, 0.7522,
         0.6918],
        [0.5593, 0.7663, 0.6358, 0.9594, 0.5315, 0.7478, 0.8939, 0.6245, 0.7863,
         0.6838],
        [0

# **Model**

mymodel inherits all the methods and properties from nn.Module (super = call to parent class constructor)

In [42]:
class MyModel(nn.Module):
  def __init__(self):         # define the different model layers
    super(MyModel,self).__init__()

    self.input_layer = nn.Linear(X.shape[1],10)  # dense layer and X.shape[1] = no of inp features(no of inp cols)
    self.linear = nn.Linear(10,1)
    self.sigmoid = nn.Sigmoid()  # or use nn.functional as F and define in forward function

  def forward(self,x):    # build the model architecture
    x = self.input_layer(x)
    x = self.linear(x)
    x = self.sigmoid(x)
    return x


In [43]:
model = MyModel()

In [44]:
from torchsummary import summary
summary(model,(X.shape[1],))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 10]             110
            Linear-2                    [-1, 1]              11
           Sigmoid-3                    [-1, 1]               0
Total params: 121
Trainable params: 121
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


# Loss And Optimizer

In [45]:
from torch.optim import Adam
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr= 1e-3)

# **Model Training**

In [46]:
# 10 Epochs
for epoch in range(10):
  total_acc_train = 0
  total_loss_train = 0
  total_acc_val = 0
  total_loss_val = 0

  for data in train_dataloader:
    inputs,labels = data  # 1 batch = 32 rows of data

    pred = model(inputs).squeeze(1) # [32,1] is depreciated instead use batchsize = [32]

    batch_loss = criterion(pred,labels)  # loss calculation
    total_loss_train += batch_loss.item() # loss after every epoch

    acc = ((pred).round()==labels).sum().item()  #pred are decimal values so we round them(0 or 1)
                         # sum() how many are true   item() = extracts scalar value from tensor
    total_acc_train += acc

    # forward propagation done

    batch_loss.backward()   # gradients are calc w.r.t wts and biases
    optimizer.step()        # updates the model parameter using the gradients
    optimizer.zero_grad()   # resets the prev gradients before stating the next batch

    # backpropagation done

    # Validation

  with torch.no_grad():
      for data in val_dataloader:
        inputs,labels = data
        pred = model(inputs).squeeze(1)
        batch_loss = criterion(pred,labels)
        total_loss_val += batch_loss.item()

        acc = ((pred).round()==labels).sum().item()
        total_acc_val += acc

  print(f'''Epoch no. {epoch + 1} Train Loss: {total_loss_train/1000:.4f} Train Accuracy: {(total_acc_train/(train_data.__len__())*100):.4f} Validation Loss: {total_loss_val/1000:.4f} Validation Accuracy: {(total_acc_val/(val_data.__len__())*100):.4f}''')
  print("="*50)

Epoch no. 1 Train Loss: 0.2350 Train Accuracy: 85.3327 Validation Loss: 0.0385 Validation Accuracy: 97.1774
Epoch no. 2 Train Loss: 0.1193 Train Accuracy: 98.1852 Validation Loss: 0.0157 Validation Accuracy: 98.4604
Epoch no. 3 Train Loss: 0.0554 Train Accuracy: 98.3502 Validation Loss: 0.0086 Validation Accuracy: 98.9370
Epoch no. 4 Train Loss: 0.0356 Train Accuracy: 98.4838 Validation Loss: 0.0059 Validation Accuracy: 98.9003
Epoch no. 5 Train Loss: 0.0276 Train Accuracy: 98.5388 Validation Loss: 0.0047 Validation Accuracy: 98.6804
Epoch no. 6 Train Loss: 0.0236 Train Accuracy: 98.5231 Validation Loss: 0.0040 Validation Accuracy: 98.9003
Epoch no. 7 Train Loss: 0.0213 Train Accuracy: 98.6488 Validation Loss: 0.0037 Validation Accuracy: 98.7537
Epoch no. 8 Train Loss: 0.0199 Train Accuracy: 98.4995 Validation Loss: 0.0033 Validation Accuracy: 98.9003
Epoch no. 9 Train Loss: 0.0191 Train Accuracy: 98.5702 Validation Loss: 0.0031 Validation Accuracy: 99.0103
Epoch no. 10 Train Loss: 0.0

# Testing

In [48]:
with torch.no_grad():
  total_loss_test = 0
  total_acc_test = 0
  for data in test_dataloader:
    inputs, labels = data

    prediction = model(inputs).squeeze(1)

    batch_loss_test = criterion((prediction), labels)
    total_loss_test += batch_loss_test.item()
    acc = ((prediction).round() == labels).sum().item()
    total_acc_test += acc

print(f"Accuracy Score is: {round((total_acc_test/X_test.shape[0])*100, 2)}%")

Accuracy Score is: 98.64%
