## Welcome to the AI-ML-Jupyter-Notebooks repository! 
### This guide will help you navigate and learn CNN Image Classification using TensorFlow and OpenCV.

----

#### Make sure to install necessary dependencies by running this command :
pip install -r requirements.txt

---

#### TensorFlow is Required to run this codebase.
Install using the Official Guide [HERE](https://www.tensorflow.org/install/pip).

---

### Understanding PreProcessing
- Load Dataset from 'DATA_DIR' Directory
- Creates DataFrame containing Full Paths of Images and their Class Labels
- (Change as per Requirement) Rescale Images to Computationally Efficient Resolution
- (Optional but Recommended) Extracts Largest Object from Image using 'image_processing' Function
    - Leverages Parallel Processing for Faster Results
- Compares Original and Rescaled+Processed Image Side-By-Side to make necessary changes
- Converts Processed Images to NumPy Array and Exports as Pickle File
    - Verifies If Exported Pickle File is Appropriate through 10 Random Samples
- (Optional) Merge Certain Class Labels Together
- Split Data for Training, Testing, Validation with Stratify to ensure data balancing
    - Verify if Split is Appropriate through 2 random samples
- (Optional) Perform Random Oversampling on Training Data to reduce Bias caused by Class Imbalance
    - Verify if Oversampling is Appropriate through 2 random samples
- Perform One-Hot-Encoding of Class Labels
- Training, Testing, Validation Data and One-Hot-Encoding is Exported as Pickle Files

---

In [None]:
'''
Working Directories
'''
import os

# Directory of Original Dataset
# (get_file_paths uses the name of each Sub-Directory as the Class Label of the Files inside it)
DATA_DIR = '../Dataset'

# Directory where Pickle Files will be Stored (Folder Will be Created by Code)
# NOTE : Keep the trailing '/' - File Names are appended to this String directly, and
#        os.path.dirname() below relies on it to return the Folder itself rather than its Parent.
PICKLE_DIR = '../PickleFiles/'
os.makedirs(os.path.dirname(PICKLE_DIR), exist_ok=True)  # exist_ok : No Error if the Folder is Already Present

In [None]:
'''
Importing Necessary Libraries and Packages
'''

# Helpers
import random
import pickle
import concurrent.futures
from HelperFunctions import images_on_side, image_processing

# Data Handling and Visualization
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# Image Processing
import cv2
from skimage.io import imread as sk_imread

# Model Pipelining 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
'''
Creating a DataFrame containing 
    - Paths of All Source Image Files and their respective Class Label
'''

def get_file_paths(main_dir):
    '''
    Collect the Path and Class Label of every File found below 'main_dir'.

    Every Sub-Directory of 'main_dir' is walked recursively. The name of the
    Directory that directly contains a File is used as its Class Label.
    Files placed directly inside 'main_dir' are skipped.

    Directories and Files are visited in Sorted Order, so the Row Order of the
    result does not depend on the File System. Seeded Splits made later from
    this DataFrame are therefore Reproducible across Machines.

    Parameters
        main_dir : str or os.PathLike - Root Directory of the Dataset

    Returns
        pandas.DataFrame with Columns 'File Path' and 'Class Label'
        (Empty, with the same Columns, when no Files are found)
    '''
    # os.walk reports roots as plain Strings, so normalise before comparing
    top = os.fspath(main_dir)
    file_paths = []
    class_labels = []
    for root, dirnames, filenames in os.walk(top):
        # Sorting in-place fixes the Order in which os.walk descends
        dirnames.sort()
        if root == top:
            continue
        label = os.path.basename(root)
        for filename in sorted(filenames):
            file_paths.append(os.path.join(root, filename))
            class_labels.append(label)
    return pd.DataFrame({'File Path': file_paths, 'Class Label': class_labels})

# The DataFrame : One Row per File found under DATA_DIR, with Columns 'File Path' and 'Class Label'
df = get_file_paths(DATA_DIR)

In [None]:
# Overview of the DataFrame built from the Dataset Directory

# Overall Size
print('Shape of Created DataFrame : ', df.shape)
print('Total Number of Sample Images : ',df.shape[0])
print('Total Number of Class Label : ',len(set(df['Class Label'])))

# Per-Class Sample Count
# The first 12 Characters of the Text Table ('Class Label' + Line Break) are sliced off before Printing
class_counts = df.groupby('Class Label')['File Path'].count()
print('\nNumber of Samples for Each Class Label :\n',class_counts.to_string()[len('Class Label\n'):])
del(class_counts) # Clear RAM

In [None]:
'''
Verify if there is any Redundancy in DataFrame 
    - Ideally, There should be No Redundancy
Verify if all File Paths exist in Mentioned Directory
    - Ideally, All Paths should Exist
Export Entire DataFrame as a Pickle File
    - Only If there is No Redundancy and All Paths Exist
'''

# Check 1 : No File Path may appear more than once
repeated_rows = df[df['File Path'].duplicated(keep=False)]
no_redundancy = len(repeated_rows) == 0
if no_redundancy:
    print('There is No Redundancy.')
else:
    print("Redundancy Identified As :")
    print(repeated_rows['File Path'])
del(repeated_rows) # Clear RAM

# Check 2 : Every File Path must exist on Disk
missing_paths = [file_path for file_path in df['File Path'] if not os.path.exists(file_path)]
all_paths_exist = len(missing_paths) == 0
if all_paths_exist:
    print('All Paths Exist.')
else:
    print("Non-Existent Paths Identified As :")
    print(missing_paths)
del(missing_paths) # Clear RAM

# Export the DataFrame as a Pickle File only when both Checks passed
if no_redundancy and all_paths_exist:
    df.to_pickle(PICKLE_DIR+"FilePathsAndClassLabels.pkl")
    print('DataFrame Successfully Exported as Pickle File.')
else:
    print('Cannot Export DataFrame, Check for Redundancy and Validity of File Paths')
del(no_redundancy) # Clear RAM
del(all_paths_exist) # Clear RAM

In [None]:
'''
    ** Verification Only **
Getting Random Sample (Image) from Dataset
Checking Dimensions of Obtained Random Sample
Extracting Largest Object from Image (Optional but Recommended)
Resize Random Sample to 600X400 (Optional but Recommended)
Visualizing Original and Resize Sample Side-by-Side
    - Ideally, Images Displayed Side-By-Side should be Similar
'''

# Target Resolution and Channel Count used for every Processed Image
IMG_HEIGHT, IMG_WIDTH = 400, 600
IMG_CHANNELS = 3 # Assuming RGB Image

# Pick one Image at Random from the Dataset and report its Original Size
random_path = random.choice(df['File Path'])
original_img = sk_imread(random_path)
print('Original Dimentions of Random Sample : ',original_img.shape)

# Extract Largest Object from Image (Optional), then Rescale
# NOTE : cv2.resize takes the Target Size as (width, height)
extracted_img = image_processing(random_path)
resized_img = cv2.resize(extracted_img, (IMG_WIDTH, IMG_HEIGHT))
print('Dimentions of ProcessedRandom Sample : ',resized_img.shape)

# Show Original and Processed Image Side-by-Side
images_on_side(original_img,'Original Random Sample',resized_img,'Processed Random Sample')

del(random_path, original_img, extracted_img, resized_img) # Clear RAM

In [None]:
'''
*Using Parallel Processing for Faster Results*
Extracting Largest Object from Image and Change Background to Black
Resize All Processed Images to 600X400 (Optional but Recommended)
Convert Images to Numpy Arrays
    - Later referred to as ImageArray
Export the Merged Image Array as a single Pickle File 
    - Images are Processed in Chunks of 'export_after' Paths; Export happens once, after all Chunks are Merged
'''

# Number of File Paths handed to a Worker in one Chunk
export_after = 1000

def process_images_chunk(chunk, start_index):
    '''
    Process one Chunk of File Paths into a single uint8 Image Array.

    Each Path is passed through 'image_processing' and the result is resized
    to (IMG_HEIGHT, IMG_WIDTH) before being stored at its position in the Chunk.

    Parameters
        chunk       : list of File Paths
        start_index : position of the first Path of this Chunk in the full Dataset

    Returns
        (start_index, array of shape (len(chunk), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))
        'start_index' is handed back unchanged so the Caller knows where the Chunk belongs.
    '''
    target_size = (IMG_WIDTH, IMG_HEIGHT)  # cv2.resize takes (width, height)
    batch = np.zeros((len(chunk), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), np.uint8)
    for row, image_path in enumerate(chunk):
        batch[row] = cv2.resize(image_processing(image_path), target_size)
    return start_index, batch

# Cut the File Paths into consecutive Chunks of at most 'export_after' Paths
all_paths = df['File Path'].tolist()
chunk_starts = range(0, len(all_paths), export_after)
chunks = [all_paths[start:start+export_after] for start in chunk_starts]

# Destination Array for every Processed Image, in DataFrame Order
X = np.zeros((len(df), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), np.uint8)

with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = [executor.submit(process_images_chunk, chunk, start) for start, chunk in zip(chunk_starts, chunks)]
    # Chunks finish in any order; the returned Offset says where each one belongs in X
    for finished in concurrent.futures.as_completed(futures):
        offset, chunk_images = finished.result()
        X[offset:offset+len(chunk_images)] = chunk_images
        print(f'Processed chunk starting at index: {offset}')
del(all_paths, chunk_starts) # Clear RAM

# Save the merged result
with open(PICKLE_DIR + 'ImageArrays.pkl', 'wb') as f:
    pickle.dump(X, f)
print('Image Array Export Completed.')

In [None]:
'''
    ** Verification Only **
Index 'i' of DataFrame should reflect the DamageClass for Image at Index 'i' in ImageArray 
    - This is done as a Verification Step before using Data for Model Training/Testing.
    - Verification Done using 10 Random Samples.
    - NOTE : Ideally, All Images should match the Labels
'''

# Draw 10 Random Positions (upper bound of np.random.randint is exclusive)
picks = np.random.randint(0, len(df), 10)
labels = df['Class Label']

# Show the Images in Pairs, each titled with the Class Label stored at the same Position
for left, right in zip(picks[0::2], picks[1::2]):
    images_on_side(X[left],labels.iloc[left],X[right],labels.iloc[right])

del(picks) # Clear RAM
del(labels) # Clear RAM

In [None]:
'''
    ** Optional Step **
Merging Class Labels
    - By Default, Merging of Damaged Classes is Disabled
'''

# Set 'merge' to True if Merging of Damaged Classes is Needed
merge = False

if merge:
    # List of Class Labels to Merge
    to_merge = ['ClassLabel1', 'ClassLabel2', 'ClassLabel3', 'ClassLabel4']
    # Class Label to Merge Into
    merge_into = 'CombinedClassLabel'
    # Assign the result back to the Column instead of calling replace(..., inplace=True)
    # on df['Class Label']: under pandas Copy-on-Write the selected Column is a temporary
    # object, so an in-place replace on it never reaches 'df'.
    df['Class Label'] = df['Class Label'].replace(to_merge, merge_into)
    print('Merging Of Class Label is Successful.')
    del(to_merge) # Clear RAM
    del(merge_into) # Clear RAM
else:
    print('Merging Of Class Label is Disabled. \n')

# Per-Class Sample Count, printed in both cases
print('Number of Samples for Each Class Label : \n',df.groupby('Class Label')['File Path'].count().to_string()[12:])

In [None]:
'''
Split the Data for Training, Testing, Validation
  - SEED ensures that Generated Splits are Reproducible.
  - Both DataFrame and ImageArrays are Split for Training and Testing
  -'stratify' ensures that Proportion of Items in Splits is Same as that in the Parameter
'''

# Split Configuration
SEED = 50
test_size = 0.15
val_size = 0.15
holdout_size = test_size + val_size  # share of the Data kept aside for Testing + Validation
temp_size = test_size / holdout_size  # share of the kept-aside Data that becomes the Test Set

# Step 1 : Training vs. Temporary (Temporary is split again below)
# train_test_split returns (first-part, second-part) for each Input, in Input Order
ClassLabels_train, ClassLabels_temp, ImgArray_train, ImgArray_temp = train_test_split(
    df,
    X,
    test_size=holdout_size,
    stratify=df['Class Label'],
    shuffle=True,
    random_state=SEED,
)
del(df, X) # Clear RAM

# Step 2 : Temporary -> Validation vs. Testing
ClassLabels_validation, ClassLabels_test, ImgArray_validation, ImgArray_test = train_test_split(
    ClassLabels_temp,
    ImgArray_temp,
    test_size=temp_size,
    stratify=ClassLabels_temp['Class Label'],
    shuffle=True,
    random_state=SEED,
)
del(ClassLabels_temp, ImgArray_temp) # Clear RAM
del(SEED, test_size, val_size, temp_size, holdout_size) # Clear RAM

# Getting Overview of Split Data
for item_name, shape_label, split_part in (
    ('ClassLabels_train','        Shape:',ClassLabels_train),
    ('ClassLabels_test','         Shape:',ClassLabels_test),
    ('ClassLabels_validation','   Shape:',ClassLabels_validation),
    ('ImgArray_train','           Shape:',ImgArray_train),
    ('ImgArray_test','            Shape:',ImgArray_test),
    ('ImgArray_validation','      Shape:',ImgArray_validation),
):
    print('Item:',item_name,shape_label,split_part.shape)
del(item_name, shape_label, split_part) # Clear RAM

In [None]:
'''
    ** Verification Only **
Verify that Data Split is Successful.
  - Index 'i' of ImgArray_train should have exactly same image as Index 'i' of ClassLabels_train
  - Index 'i' of ImgArray_test should have exactly same image as Index 'i' of ClassLabels_test
  - NOTE : Ideally, Images on Left and Right should show the Same Subject (Left is the Processed and Resized Image, Right is the Original File)
  
'''

# Index 'i' of ImgArray_train should correspond to the File at Index 'i' of ClassLabels_train
# NOTE : random.randrange(n) returns 0 .. n-1. random.randint(0, n) also returns n itself,
#        which is one past the last valid position and raises IndexError.
ind = random.randrange(len(ClassLabels_train))
images_on_side(ImgArray_train[ind],'Sample from ImgArray_train',sk_imread(ClassLabels_train['File Path'].iloc[ind]),'Sample from ClassLabels_train')
del(ind) # Clear RAM

# Index 'i' of ImgArray_test should correspond to the File at Index 'i' of ClassLabels_test
ind = random.randrange(len(ClassLabels_test))
images_on_side(ImgArray_test[ind],'Sample from ImgArray_test',sk_imread(ClassLabels_test['File Path'].iloc[ind]),'Sample from ClassLabels_test')
del(ind) # Clear RAM

In [None]:
'''
    ** Optional Step **
Over-Sampling Training Data
    - By Default, Over-Sampling Training Data is Disabled
'''

# Set 'oversample' to True if Over-Sampling Training Data is Needed
oversample = False

def _oversample_to_balance(labels_df, images):
    '''
    Top up every Minority Class with Randomly Repeated Samples until all Classes
    have as many Samples as the Largest Class.

    Rows are matched by Position: Row 'i' of 'labels_df' must describe 'images[i]'.
    Extra Samples are drawn WITH replacement, so a Class may need more Extras than
    it has Members.

    Parameters
        labels_df : pandas.DataFrame with a 'Class Label' Column
        images    : numpy array whose first Axis lines up with the Rows of 'labels_df'

    Returns
        (labels_df, images) with the Extra Samples appended at the end.
        The Inputs are returned unchanged when the Classes are already balanced.
    '''
    counts = labels_df['Class Label'].value_counts()
    if counts.empty:
        return labels_df, images
    labels = labels_df['Class Label'].tolist()
    target_len = int(counts.max())
    picked = []
    for damage, class_len in counts.items():
        extra = target_len - int(class_len)
        if extra > 0:
            positions = [pos for pos, label in enumerate(labels) if label == damage]
            picked.extend(random.choices(positions, k=extra))
    if not picked:
        return labels_df, images
    balanced_df = pd.concat([labels_df, labels_df.iloc[picked]]).reset_index(drop=True)
    balanced_images = np.vstack([images, images[picked]])
    return balanced_df, balanced_images

# Data Properties BEFORE Over Sampling
print('  BEFORE Over Sampling : \n')
print('Number of Training Samples : \n',ClassLabels_train['Class Label'].value_counts().to_string()[12:])
print('Shape of Training ImageArray : ',ImgArray_train.shape)
print('Number of Training Samples : ',len(ClassLabels_train))

# Reset Previous Index to avoid Indexing Issues
ClassLabels_train.reset_index(drop=True, inplace=True)
ClassLabels_test.reset_index(drop=True, inplace=True)
ClassLabels_validation.reset_index(drop=True, inplace=True)

# Perform Over-Sampling
if oversample == True:
    len_before = len(ClassLabels_train)
    ClassLabels_train, ImgArray_train = _oversample_to_balance(ClassLabels_train, ImgArray_train)
    # 'y_train' is only (re)defined when Samples were actually added
    if len(ClassLabels_train) != len_before:
        y_train = ClassLabels_train['Class Label']
    del(len_before) # Clear RAM
    del(oversample) # Clear RAM

# Data Properties AFTER Over-Sampling
print('\n  AFTER Over Sampling : \n')
print('Number of Training Samples : \n',ClassLabels_train['Class Label'].value_counts().to_string()[12:])
print('Shape of Training ImageArray : ',ImgArray_train.shape)
print('Number of Training Samples : ',len(ClassLabels_train))

In [None]:
'''
    ** Verification Only **
Verify that Random Sampling is Successful.
  - Index 'i' of ImgArray_train should have exactly same image as Index 'i' of ClassLabels_train
  - Verification is done Twice
  - NOTE : Ideally, Images on Left and Right should show the Same Subject (Left is the Processed and Resized Image, Right is the Original File)

'''

for _ in range(2):
    # random.randrange(n) returns 0 .. n-1. random.randint(0, n) also returns n itself,
    # which is one past the last valid position and raises IndexError.
    ind = random.randrange(len(ClassLabels_train))
    images_on_side(ImgArray_train[ind],'Sample from ImgArray_train',sk_imread(ClassLabels_train['File Path'].iloc[ind]),'Sample from ClassLabels_train')
    del(ind) # Clear RAM

In [None]:
'''
One-Hot-Encoding for 
    - Training Class Labels
    - Testing Class Labels
    - Validation Class Labels
'''

# Defining One-Hot-Encoding
OHE = OneHotEncoder(sparse_output=False)

# The Encoder is fitted ONCE, on the Training Labels. Testing and Validation Labels are
# only transformed, so all three Splits share the same Column Order, and the Encoder
# object used later (e.g. for Export) is the one that was fitted on the Training Data.
# With the default settings, a Label that never occurred in Training raises a ValueError
# in transform() instead of silently producing differently shaped Output.

# One-Hot-Encoding of Training Class Labels
ClassLabels_train = np.array(ClassLabels_train['Class Label']).reshape(-1,1)
ClassLabels_train = OHE.fit_transform(ClassLabels_train)

# One-Hot-Encoding of Testing Class Labels
ClassLabels_test = np.array(ClassLabels_test['Class Label']).reshape(-1,1)
ClassLabels_test = OHE.transform(ClassLabels_test)

# One-Hot-Encoding of Validation Class Labels
ClassLabels_validation = np.array(ClassLabels_validation['Class Label']).reshape(-1,1)
ClassLabels_validation = OHE.transform(ClassLabels_validation)

# Column 'k' of every Encoded Array corresponds to OHE_classes[k]
OHE_classes = OHE.categories_[0]
print('Class Labeles for One-Hot-Encoding Are : \n',OHE_classes)

In [None]:
'''
Exporting the following One-Hot-Encoded Data as Pickle Files
    - ClassLabels_train
    - ClassLabels_test
    - ClassLabels_validation
Exporting the following as Pickle Files
    - ImgArray_train
    - ImgArray_test
    - ImgArray_validation
Exporting One-Hot-Encoding
'''

# Every Export below follows the same pattern:
#   - try    : only the File Write itself
#   - except : report WHY the Export failed (bare 'except:' hid the reason and also
#              swallowed KeyboardInterrupt / SystemExit); the Notebook keeps running
#   - else   : confirm the Export and release the Object from RAM

# Exporting ClassLabels_train
try:
    with open(PICKLE_DIR+'ClassLabels_train.pkl', 'wb') as f:
        pickle.dump(ClassLabels_train, f)
except Exception as err:
    print('Export Unsuccessful for : ClassLabels_train.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : ClassLabels_train.pkl ')
    del(ClassLabels_train) # Clear RAM

# Exporting ClassLabels_test
try:
    with open(PICKLE_DIR+'ClassLabels_test.pkl', 'wb') as f:
        pickle.dump(ClassLabels_test, f)
except Exception as err:
    print('Export Unsuccessful for : ClassLabels_test.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : ClassLabels_test.pkl ')
    del(ClassLabels_test) # Clear RAM

# Exporting ClassLabels_validation
try:
    with open(PICKLE_DIR+'ClassLabels_validation.pkl', 'wb') as f:
        pickle.dump(ClassLabels_validation, f)
except Exception as err:
    print('Export Unsuccessful for : ClassLabels_validation.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : ClassLabels_validation.pkl ')
    del(ClassLabels_validation) # Clear RAM


# Exporting ImgArray_train
try:
    with open(PICKLE_DIR+'ImgArray_train.pkl', 'wb') as f:
        pickle.dump(ImgArray_train, f)
except Exception as err:
    print('Export Unsuccessful for : ImgArray_train.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : ImgArray_train.pkl')
    del(ImgArray_train) # Clear RAM

# Exporting ImgArray_test
try:
    with open(PICKLE_DIR+'ImgArray_test.pkl', 'wb') as f:
        pickle.dump(ImgArray_test, f)
except Exception as err:
    print('Export Unsuccessful for : ImgArray_test.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : ImgArray_test.pkl')
    del(ImgArray_test) # Clear RAM

# Exporting ImgArray_validation
try:
    with open(PICKLE_DIR+'ImgArray_validation.pkl', 'wb') as f:
        pickle.dump(ImgArray_validation, f)
except Exception as err:
    print('Export Unsuccessful for : ImgArray_validation.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : ImgArray_validation.pkl')
    del(ImgArray_validation) # Clear RAM


# Exporting One-Hot-Encoding
try:
    with open(PICKLE_DIR+'OHE.pkl', 'wb') as f:
        pickle.dump(OHE, f)
except Exception as err:
    print('Export Unsuccessful for : OHE.pkl ', f'({type(err).__name__}: {err})')
else:
    print('Export Successful for : OHE.pkl')
    del(OHE) # Clear RAM

More At : https://github.com/iSiddharth20/DeepLearning-ImageClassification-Toolkit