# [Raveling Detection Challenge](https://www.kaggle.com/c/raveling-detection-ce784a-2022/data)
## Author : Shivam Pandey [@ShivamPR21](https://shivampr21.github.io/)

### Import modules for data manipulation
1. OS : Operating-system interface (path handling, directory listing)
2. Numpy : Matrix manipulation library
3. Pandas : Tabular dataset manipulation, read, and write
4. Pillow.Image : Image read/write module
5. Scipy.stats : Statistical properties of data
6. scikit-image : Image manipulation and feature extraction
7. matplotlib.pyplot : data plots

In [8]:
import os
import numpy as np
import pandas as pd
from skimage.feature import graycomatrix, graycoprops
from PIL import Image
from scipy.stats import skew, kurtosis, entropy
from skimage.color import rgb2gray
from skimage import img_as_ubyte
from matplotlib import pyplot as plt
from tqdm import tqdm

### Import Pytorch
- Useful for deep learning model creation, training, and dataset handling

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

try:
    from torchsummary import summary
except:
    print("Installing Torchsummary..........")
    ! pip install torchsummary
    from torchsummary import summary

import torch.optim as optim

### Import dataset preprocessing and split modality

In [10]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

### Create `RavelingTrainDataset` class
1. Handles image manipulation for the train and test datasets
2. Splits the training dataset into validation and training sets.
3. Normalizes the datasets with the `RobustScaler` method
4. Creates the tabular feature dataset __Required for ANNs__

In [75]:
class RavelingTrainDataset():
    """
    Builds tabular feature datasets from the raveling image folders.

    On construction the class lists the training images of every class,
    extracts a fixed-length feature vector per image (per-channel colour
    statistics plus GLCM texture properties), splits the result into
    stratified train / validation parts, scales the features with a
    `RobustScaler`, and finally loads and scales the test images.

    Parameters
    ----------
    root : dataset root directory containing the train and test folders
    train_test_dirs : names of the [train, test] sub-directories of `root`
    classes : class folder names; the index in this sequence is the label
    val_fraction : fraction of the labelled data held out for validation
    random_state : seed forwarded to `train_test_split`
    """

    def __init__(self, 
                 root = '/kaggle/input/raveling-detection-ce784a-2022/mod_ravelling_dataset',
                 train_test_dirs = ['train', 'test'],
                 classes = ['Non_raveling', 'Raveling'],
                 val_fraction = 0.2,
                 random_state = 42):
        self.root = os.path.join(root, train_test_dirs[0]) # Root Dir
        self.test_root = os.path.join(root, train_test_dirs[1]) # Test Root dir
        # Copy so the (shared, mutable) default / caller list is never aliased
        self.classes = list(classes) # classes name list
        self.data_file_path = {} # class name -> list of image paths
        self.test_data_path = [] # [path, numeric file stem] pairs, sorted by stem
        self.data_x = [] # Complete data features
        self.data_y = [] # Complete data label
        self.data_train_x = [] # Train data features
        self.data_train_y = [] # Train data label
        self.scaler = RobustScaler() # Robust scaler instance to scale the data

        self.data_test = [] # Test data
        self.data_val_x = [] # validation data features
        self.data_val_y = [] # validation data label

        self.val_fraction = val_fraction # Validation fraction
        self.random_state = random_state # Random state to regenerate the samples

        self.generate_data_file_path() # Generate the data file path
        self.generate_tabular_data() # Extract features and create the tabular form of data
        self.load_test_data() # Load the test dataset and create tabular form

    def generate_data_file_path(self):
        """
        Fills `self.data_file_path` with the image paths of every class.

        The directory listing is sorted: `os.listdir` returns entries in
        arbitrary order, and the sample order has to be stable for
        `random_state` to reproduce the same train / validation split.
        """
        for c in self.classes:
            class_dir = os.path.join(self.root, c)
            self.data_file_path[c] = [os.path.join(class_dir, x)
                                      for x in sorted(os.listdir(class_dir))]

    def generate_features(self, file_path):
        """
        Given file path the function returns extracted features list.

        The list holds, for each of the three colour channels: mean, standard
        deviation, skewness, kurtosis and value range (15 values), followed by
        six GLCM properties evaluated at distance 1 for four angles (24 values).
        """
        features = []

        # Close the file handle deterministically and force a 3-channel image,
        # so grayscale / RGBA inputs do not break the channel indexing below.
        with Image.open(file_path) as handle:
            img = np.asarray(handle.convert('RGB'))

        for i in range(3):
            tmp = img[:, :, i].flatten()
            features.extend([np.mean(tmp), 
                             np.std(tmp), 
                             skew(tmp), 
                             kurtosis(tmp), 
                             np.max(tmp) - np.min(tmp)])

        img_gray = img_as_ubyte(rgb2gray(img))
        glcm = graycomatrix(img_gray, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4], 256, 
                            symmetric=True, normed=True)

        # graycoprops returns one row per distance; only distance 1 is used,
        # so row 0 holds the values for the four angles.
        for prop in ('dissimilarity', 'correlation', 'homogeneity',
                     'energy', 'contrast', 'ASM'):
            features.extend(graycoprops(glcm, prop)[0])

        return features

    def generate_tabular_data(self):
        """
        Creates the tabular dataset, applies the stratified train_test_split,
        and normalizes the dataset.

        The scaler is fitted on the training split only and then applied to
        the validation split and to the complete dataset, so no validation
        statistics leak into the training features.
        """
        for i, key in enumerate(self.classes):
            for p in self.data_file_path[key]:
                self.data_x.append(self.generate_features(p))
                self.data_y.append(i)

        raw_x = np.array(self.data_x, dtype=np.float32)
        self.data_y = np.array(self.data_y, dtype=np.float32)

        train_x, val_x, self.data_train_y, self.data_val_y = train_test_split(
            raw_x, self.data_y,
            train_size = 1.-self.val_fraction,
            random_state = self.random_state,
            stratify = self.data_y)

        self.data_train_x = self.scaler.fit_transform(train_x)
        self.data_val_x = self.scaler.transform(val_x)
        self.data_x = self.scaler.transform(raw_x)

    def load_test_data(self):
        """
        Loads the test data and port it to tabular form.

        Test files are ordered by the number before the first '.' in their
        file name; every file in the test directory is therefore expected to
        have a numeric stem. Features are scaled with the already fitted scaler.
        """
        paths = [[os.path.join(self.test_root, x), float(x.split('.')[0])] for x in os.listdir(self.test_root)]
        paths.sort(key = lambda x : x[1])

        self.test_data_path = paths

        for p in paths:
            self.data_test.append(self.generate_features(p[0]))

        self.data_test = self.scaler.transform(self.data_test)
        self.data_test = np.array(self.data_test, dtype = np.float32)

    def __train__(self):
        """Returns Training Dataset"""
        return self.data_train_x, self.data_train_y

    def __val__(self):
        """Returns Validation Dataset"""
        return self.data_val_x, self.data_val_y

    def __test__(self):
        """Returns Test Features"""
        return self.data_test

    def __pandas__(self):
        """Creates pandas dataframe for better visualization"""
        header = [f'F{i}' for i in range(self.data_x.shape[1])]
        header.append("Class")
        # Labels become the last column, next to the scaled features
        local_data = np.concatenate((self.data_x, np.expand_dims(self.data_y, axis = 1)), axis=1)
        return pd.DataFrame(local_data, columns=header)

### Create `TabularDataset` class
1. Handles the tabular datasets
2. Subclass of `torch.utils.data.Dataset`
3. Implements indexable dataset for pytorch pipeline
4. Handles data transforms internally

In [12]:
class TabularDataset(Dataset):
    """
    Indexable torch dataset over a feature matrix and a label vector.

    `x` is indexed as `x[idx, :]` and `y` as `y[idx]`; the optional
    `transform` / `target_transform` callables are applied to the feature
    row and to the one-element float32 label array before both are turned
    into tensors.
    """

    def __init__(self, x, y, transform = None, target_transform = None):
        self.x, self.y = x, y
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of samples (rows of `x`)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the `(feature, target)` tensor pair stored at `idx`."""
        row = self.x[idx, :]
        # Wrap the scalar label so the target tensor has shape (1,)
        label = np.array([self.y[idx]], dtype = np.float32)

        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)

        return torch.from_numpy(row), torch.from_numpy(label)

### Define the global training loop characteristics

In [76]:
# Global Variables
BATCH_SIZE = 30 # Batch size used by every DataLoader in this notebook
EPOCHS = 50 # Number of epochs (# complete data iteration); NOTE: reassigned to 60 right before the training loop
ITRS = 11 # Not used (calculated on the go)

### Initiate the `RavelingTrainDataset` class instance
> Note: This step will take time based on the system speed

In [163]:
# Build the dataset: feature extraction for all train and test images runs
# inside the constructor; 20% of the labelled data is held out for validation.
dataset = RavelingTrainDataset(val_fraction=0.2, random_state=51)

### Convert data to pandas dataframe and view

In [164]:
# Display the scaled feature table (columns F0..Fn plus "Class") as the cell output
dataset.__pandas__()

### Get the train dataset, and create the dataset loader

In [165]:
# Training split (features, labels) wrapped as an indexable torch dataset
TrainX, TrainY = dataset.__train__()
TrainDataset = TabularDataset(TrainX, TrainY)

# Create torch dataloader for train dataset
# TrainDataloader drives the optimisation; TrainTestDataloader iterates the
# same data and is used by get_acc to measure training accuracy.
TrainDataloader = DataLoader(TrainDataset, batch_size = BATCH_SIZE, shuffle = True)
TrainTestDataloader = DataLoader(TrainDataset, batch_size = BATCH_SIZE, shuffle = True)
print(f'Train data : feature shape : {TrainX.shape} ; Labels shape : {TrainY.shape}')

### Get the validation dataset, and create the loader utility

In [166]:
# Validation split (features, labels) wrapped as an indexable torch dataset
ValX, ValY = dataset.__val__()
ValDataset = TabularDataset(ValX, ValY)

# Create torch dataloader for validation dataset
ValDataloader = DataLoader(ValDataset, batch_size = BATCH_SIZE, shuffle = True)
print(f'Validation data : feature shape : {ValX.shape} ; Labels shape : {ValY.shape}')

### Get the test dataset along with its data loader

In [167]:
# Test features have no labels; zeros are passed as placeholder targets so the
# same TabularDataset class can be reused.
TestX = dataset.__test__()
TestDataset = TabularDataset(TestX, np.zeros((len(TestX), )))

# Create torch dataloader for test dataset
# shuffle = False keeps predictions aligned with dataset.test_data_path
TestDataloader = DataLoader(TestDataset, batch_size = BATCH_SIZE, shuffle = False)
print(f'Test data shape : {TestX.shape}')

In [168]:
# Let's see the frequency of the train dataset classes
# (labels are the class indices 0 and 1 stored in TrainY)
plt.hist(TrainY, color = 'g', label = "Train Dataset")
plt.gca().set(title='Frequency Histogram of Labels in Train Dataset', ylabel='Frequency')
plt.legend()
plt.show()

In [169]:
# Let's see the frequency of the validation dataset classes
# (labels are the class indices 0 and 1 stored in ValY)
plt.hist(ValY, color = 'orange', label = "Validation Dataset")
plt.gca().set(title='Frequency Histogram of Labels in Validation Dataset', ylabel='Frequency')
plt.legend()
plt.show()

### Define the model
1. Using `torch.nn.Module` to define the model
2. Using `nn.Sigmoid` classifier : _Binary classification_
3. We will be using `L2` regularizer or more specifically an [equivalent known as `Weight Decay` regularization](https://towardsdatascience.com/weight-decay-l2-regularization-90a9e17713cd).
4. For the activation function throughout the neural network, we use `SELU`, an upgraded version of `RELU`, to avoid the data loss while still providing non-linearity
> Note: The regularization will be enforced in the optimizer directly

In [170]:
# Create Model subclass to define the network
class Model(nn.Module):
    """
    Fully connected binary classifier.

    Nine linear layers (`fc1` .. `fc9`) narrow the input from `in_features`
    down to a single output; SELU follows each of the first eight layers and
    a sigmoid turns the last layer's output into a value in [0, 1].
    """

    def __init__(self, in_features = 100):
        super().__init__()

        # Layer widths from input to output; consecutive pairs define fc1..fc9.
        # Layers are created in order so attribute names and state_dict keys
        # stay `fc1` .. `fc9`.
        widths = [in_features, 150, 90, 70, 50, 30, 20, 10, 5, 1]
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f'fc{idx}', nn.Linear(n_in, n_out))

        # Define activations, classifier layer, 
        # and if required then regularizations
        self.activation = nn.SELU() # Activations
        self.classifier = nn.Sigmoid() # Classifier
#         self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        """
        Run the forward pass and return the sigmoid output of shape (..., 1).

        During training this runs with gradients enabled for backprop; for
        inference it is meant to be called under torch.no_grad().
        """
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4,
                         self.fc5, self.fc6, self.fc7, self.fc8)
        for layer in hidden_layers:
            x = self.activation(layer(x))

        return self.classifier(self.fc9(x))

### Check for device

In [171]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'{device} device will be used.')

### Create the model instance and move it to the device memory

In [172]:
# Transfer the model parameters and properties to selected device
# The input width is taken from the number of feature columns in TrainX.
model = Model(TrainX.shape[1]).to(torch.device(device))

### Have a look at the model intrinsics

In [173]:
# Print the torchsummary layer report for an input with TrainX.shape[1] features
summary(model, (TrainX.shape[1],))

### Define the helper functions for easy access to accuracy computations

In [174]:
def get_acc(dl, net=None, dev=None):
    """
    Return the classification accuracy (in percent) of a network on a loader.

    Parameters
    ----------
    dl : iterable yielding `(features, labels)` batches of tensors
    net : network to evaluate; defaults to the global `model`
    dev : device the batches are moved to; defaults to the global `device`

    Returns
    -------
    Percentage of samples whose thresholded output equals the label, or 0.0
    when the loader yields no samples.
    """
    net = model if net is None else net
    dev = device if dev is None else dev

    correct = 0
    total = 0

    # Loop-invariant tensors: outputs strictly above 0.5 map to class 1,
    # everything else (including exactly 0.5) maps to class 0.
    pivot = torch.tensor([0.5]).to(dev)
    value = torch.tensor([0.0]).to(dev)

    # Evaluate in eval mode and restore the previous mode afterwards, because
    # this helper is also called from inside the training loop.
    was_training = net.training
    net.eval()
    try:
        # since we're not training, we don't need to calculate the gradients for our outputs
        with torch.no_grad():
            for features, labels in dl:
                features = features.to(dev)
                labels = labels.to(dev)

                # calculate outputs by running the features through the network
                outputs = net(features)
                predicted = torch.heaviside(outputs - pivot, value)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        net.train(was_training)

    if total == 0:
        # Nothing to evaluate; avoid dividing by zero
        return 0.0

    return 100*correct/total

### Initiate the model weights

In [181]:
def init_weights(layer):
    """
    Re-initialise the weight matrix of a linear layer in place.

    Meant to be passed to `Module.apply`: modules that are not `nn.Linear`
    are left untouched, and biases keep their existing values.
    """
    if not isinstance(layer, nn.Linear):
        return

    # Xavier normal initialization for all the layers
    nn.init.xavier_normal_(layer.weight.data)

# Run init_weights on the model and each of its sub-modules
model.apply(init_weights)

### If `load_prev` is `True`, the previously saved model `basemodel` will be loaded

In [182]:
# Set to True to resume from the weights stored in ./basemodel
load_prev = False
if load_prev:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load('./basemodel', map_location=device))
    print(model.eval())

### Define the loss function and optimizer

In [183]:
criterion = nn.BCELoss() # Loss function (binary cross-entropy on the sigmoid outputs)
params_list = model.parameters() # model parameters

## We can apply custom learning rate or any other parameters to each layer, use the following:
## NOTE(review): the example below names layers such as fc1_1 that the Model
## class in this notebook does not define - adapt the names before enabling it.
# params_list = [
#     {'params': model.fc1.parameters(), 'lr': 0.01},
#     {'params': model.fc1_1.parameters(), 'lr': 0.01},
#     {'params': model.fc2.parameters(), 'lr': 0.005},
#     {'params': model.fc2_2.parameters(), 'lr': 0.005},
#     {'params': model.fc3.parameters(), 'lr': 0.001},
#     {'params': model.fc3_3.parameters(), 'lr': 0.001},
#     {'params': model.fc4.parameters(), 'lr': 0.005},
#     {'params': model.fc4_4.parameters(), 'lr': 0.005},
#     {'params': model.fc5.parameters(), 'lr': 0.001},
#     {'params': model.fc5_5.parameters(), 'lr': 0.001},
# ]
# AdamW applies the weight-decay regularization inside the optimizer step
optimizer = optim.AdamW(params_list, lr=0.0007, weight_decay=0.01) # Optimizer

### Run the training loop

In [184]:
# Overrides the EPOCHS value set with the other global variables
EPOCHS = 60

In [185]:
# Number of mini-batches between two log entries; also the number of batch
# losses accumulated in running_loss, hence the divisor of the logged mean.
LOG_EVERY = 10

# One row per log entry: [epoch, batch, mean loss, train accuracy, val accuracy]
accuracies = []
for epoch in range(EPOCHS):  # loop over the dataset multiple times

    running_loss = 0.0
    # Wrap the loader itself (not the enumerate object) so tqdm knows the
    # number of batches and can show real progress.
    data_itr = tqdm(TrainDataloader)
    for i, data in enumerate(data_itr):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # log statistics every LOG_EVERY mini-batches
        running_loss += loss.item()
        if i % LOG_EVERY == LOG_EVERY - 1:
            accuracies.append([epoch + 1, i + 1, running_loss / LOG_EVERY,
                               get_acc(TrainTestDataloader), get_acc(ValDataloader)])
            data_itr.set_description('[%d, %5d] loss: %.3f, train accuracy: %.3f, val accuracy: %.3f' %
                  (accuracies[-1][0], accuracies[-1][1], accuracies[-1][2], accuracies[-1][3], accuracies[-1][4]))
            running_loss = 0.0

In [186]:
# Plot the training and validation accuracies throughout the training process
# Columns of each logged row: 0 = epoch, 3 = train accuracy, 4 = validation accuracy
accuracies = np.asarray(accuracies, dtype=np.float32)
plt.plot(accuracies[:, 0], accuracies[:, 3], 'r-', label="Train Accuracies")
plt.plot(accuracies[:, 0], accuracies[:, 4], 'g-', label="Val Accuracies")
plt.legend()
plt.grid()
plt.title('Training vs Validation accuracies')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()

In [156]:
# Report the final accuracy (in percent) on the validation and training data
print(' Validation accuracy of the network: %f %%' % (
    get_acc(ValDataloader)))
print(' Train accuracy of the network: %f %%' % (
    get_acc(TrainDataloader)))

### Save the final model

In [123]:
# Persist only the parameters (state_dict); this is the file read back when load_prev is True
torch.save(model.state_dict(), "./basemodel")

### Generate the results

In [124]:
# Predicted class index (0.0 or 1.0) for every test sample, in loader order
result = []

# Loop-invariant threshold tensors: outputs strictly above 0.5 become class 1
pivot = torch.tensor([0.5]).to(device)
value = torch.tensor([0.0]).to(device)

# Make sure the network is in inference mode before predicting
model.eval()
with torch.no_grad():
    for data in TestDataloader:
        # data[1] only holds the placeholder targets and is ignored
        features = data[0].to(device)

        # calculate outputs by running the features through the network
        outputs = model(features)

        # threshold the sigmoid output to obtain the predicted class
        predicted = torch.heaviside(outputs - pivot, value)

        result.extend(predicted.cpu().numpy().flatten())

In [125]:
# Convert numeric classes to named one
# Each test_data_path entry is [path, numeric file stem]; the stem is turned
# back into the '<number>.jpg' file name expected in the submission.
result_class = [[str(int(y[1]))+'.jpg', "Raveling" if (x == 1) else "Non_raveling"] for x, y in zip(result, dataset.test_data_path)]

#### Generate the results dataframe

In [126]:
# Two-column submission table: file name and predicted class name
df_result = pd.DataFrame(result_class, columns = ['filename', 'class'])
df_result.head(5)

In [127]:
## Save to csv file, and submit
# index=False keeps the DataFrame row index out of the CSV
df_result.to_csv("submission.csv", index=False)