# **REPLY** - Image Classification

### **Overview**
This project is part of the coursework for Machine Learning, focusing on the application of machine learning techniques for the classification of scanned document images into predefined categories. Our objective is to develop a model capable of accurately identifying the type of document from a set of scanned images. This involves a comprehensive process starting from data analysis to model evaluation, aiming to achieve high accuracy and robustness in classification tasks.

### **Project Description**
The task involves constructing a machine learning model to classify scanned documents into four distinct categories: resumes, advertisements, emails, and handwritten documents. The dataset comprises 2000 images in TIFF format, requiring thorough preprocessing and analysis to ensure optimal model performance. Key stages of the project include data cleaning, model selection, hyperparameter tuning, and performance evaluation.

### **Dataset**
The dataset features 2000 scanned document images in .tif format, categorized into four classes:
* Resumes
* Advertisements
* Emails
* Handwritten documents

## This project is carried out by Group 5
- Elisa Dobici elisa.dobici@studenti.luiss.it 785171
- Joshua Brauner joshua.brauner@studenti.luiss.it 778931
- Paoloemilio Grande paoloemilio.grande@studenti.luiss.it 788661

### EDA
Perform an in-depth analysis of the dataset to understand its structure, quality, and any preprocessing needs.

In [106]:
#importing dependencies
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import cv2
import imghdr #non penso serva
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
from skimage import io

In [107]:
# Directory that holds the scanned document files
data_dir = 'Data'

# Report how many entries the directory contains
entry_count = len(os.listdir(data_dir))
print("In this directory there are: ", entry_count, " images.")

In this directory there are:  2442  images.


In [108]:
# Report directory entries that do not look like TIFF images
valid_ext = 'tiff'  # value imghdr.what() must return for a file to be accepted

for image in os.listdir(data_dir):
    image_path = os.path.join(data_dir, image)
    try:
        # An entry is accepted only if it has a .tif suffix AND imghdr agrees;
        # the content check is skipped when the suffix already disqualifies it.
        looks_like_tif = image.lower().endswith('.tif')
        if not (looks_like_tif and imghdr.what(image_path) == valid_ext):
            print(f'Image not in .tif format: {image_path}')
            # Nothing is deleted here: offending files are only reported.
    except Exception as e:
        print(f'Issue with image {image_path}: {e}')


In [109]:
# Re-count the directory entries after the format check above
remaining_entries = os.listdir(data_dir)
print("In this directory there are: ", len(remaining_entries), " images.")

In this directory there are:  2442  images.


### ANALYZING IMAGE SIZE

In [None]:
def analyze_image_sizes(data_dir):
    """Collect the pixel dimensions of every ``.tif`` file in a directory.

    Args:
        data_dir: Path of the directory to scan (not recursive). Only entries
            whose name ends in ``.tif`` (case-insensitive) are opened.

    Returns:
        A list of ``(width, height)`` tuples, one per image that could be
        opened, in the order produced by ``os.listdir``.

    Files that cannot be opened are reported on stdout and skipped, so a
    single unreadable file does not abort the whole scan. This mirrors the
    error handling of ``analyze_brightness_levels``.
    """
    sizes = []
    for image_name in os.listdir(data_dir):
        if not image_name.lower().endswith('.tif'):
            continue
        image_path = os.path.join(data_dir, image_name)
        try:
            with Image.open(image_path) as img:
                sizes.append(img.size)  # size is in (width, height) format
        except Exception as e:
            # Best effort: report the unreadable file and keep scanning.
            print(f"Cannot process image {image_path}: {e}")
    return sizes

### DETERMINE IF BLACK AND WHITE OR RGB

In [None]:
#Determine if image is grayscale or RGB
def analyze_image_colors(data_dir):
    """Count the ``.tif`` files in a directory by PIL colour mode.

    Args:
        data_dir: Path of the directory to scan (not recursive). Only entries
            whose name ends in ``.tif`` (case-insensitive) are opened.

    Returns:
        A dict with the keys ``'L'`` (grayscale), ``'RGB'`` and ``'Other'``
        mapping to the number of images found in that mode. Any mode that is
        neither ``'L'`` nor ``'RGB'`` is counted under ``'Other'``.

    Files that cannot be opened are reported on stdout and left out of the
    counts, so a single unreadable file does not abort the whole scan.
    """
    color_modes = {'L': 0, 'RGB': 0, 'Other': 0}
    for image_name in os.listdir(data_dir):
        if not image_name.lower().endswith('.tif'):
            continue
        image_path = os.path.join(data_dir, image_name)
        try:
            with Image.open(image_path) as img:
                mode = img.mode
        except Exception as e:
            # Best effort: report the unreadable file and keep scanning.
            print(f"Cannot process image {image_path}: {e}")
            continue
        # Modes without a dedicated counter are grouped under 'Other'.
        bucket = mode if mode in color_modes else 'Other'
        color_modes[bucket] += 1
    return color_modes

### COMPUTING BRIGHTNESS LEVELS

In [None]:
#Calculating brightness
def calculate_brightness(image):
    """Return the mean grayscale intensity of an image.

    The image is converted to mode ``'L'`` (grayscale) and the average of
    all resulting pixel values is returned.
    """
    pixels = np.array(image.convert('L'))
    return pixels.mean()

def analyze_brightness_levels(data_dir):
    """Compute the brightness of every readable ``.tif`` file in a directory.

    Each image is opened and passed to ``calculate_brightness``. Files that
    fail to open or process are reported on stdout and skipped.

    Returns:
        A list with one brightness value per successfully processed image,
        in the order produced by ``os.listdir``.
    """
    levels = []
    tif_names = (
        entry for entry in os.listdir(data_dir)
        if entry.lower().endswith('.tif')
    )
    for name in tif_names:
        path = os.path.join(data_dir, name)
        try:
            with Image.open(path) as picture:
                levels.append(calculate_brightness(picture))
        except Exception as err:
            print(f"Cannot process image {path}: {err}")
    return levels

### IDENTIFYING CORRUPTED IMAGES

In [None]:
def analyze_corrupted_images(data_dir):
    """List the ``.tif`` files in a directory that look corrupted.

    A file is considered corrupted when it cannot be opened or decoded, or
    when every pixel is black or every pixel is white.

    Args:
        data_dir: Path of the directory to scan (not recursive). Only entries
            whose name ends in ``.tif`` (case-insensitive) are examined.

    Returns:
        A list of file names (not full paths) flagged as corrupted, in the
        order produced by ``os.listdir``. Files that fail to open are also
        reported on stdout.
    """
    corrupted = []
    for image_name in os.listdir(data_dir):
        if not image_name.lower().endswith('.tif'):
            continue
        image_path = os.path.join(data_dir, image_name)
        try:
            with Image.open(image_path) as img:
                # Convert once and reuse the array for both checks.
                pixels = np.array(img)
                # Bilevel (mode '1') images convert to boolean arrays, where
                # white is True rather than 255.
                white = True if pixels.dtype == np.bool_ else 255
                # Checking if the image is completely black or white
                if np.all(pixels == 0) or np.all(pixels == white):
                    corrupted.append(image_name)
        except Exception as e:
            print(f"Cannot process image {image_path}: {e}")
            corrupted.append(image_name)  # Adding the name of the image that failed to open
    return corrupted

# FROM HERE ON, THE CODE PROBABLY DOES NOT WORK

In [15]:
# NOTE: this whole cell is a single bare string literal, so the TIFF-to-JPEG
# conversion below is disabled and never runs; evaluating the cell only
# yields the string itself.
"""#Changing format to images, we should decide if it's better to use .tif or .jpg

# Directory containing the .tif files
tiff_directory = 'Data'
# Directory where you want to save the .jpg files
jpg_directory = 'Data jpg'

# Create the jpg directory if it doesn't exist
if not os.path.exists(jpg_directory):
    os.makedirs(jpg_directory)

# Loop through all the .tif files in the directory
for tiff_file in os.listdir(tiff_directory):
    if tiff_file.endswith('.tif'):
        # Construct the full file path
        tiff_file_path = os.path.join(tiff_directory, tiff_file)
        # Open the .tif file
        with Image.open(tiff_file_path) as image:
            # Remove the file extension and add .jpg
            jpg_file_name = os.path.splitext(tiff_file)[0] + '.jpg'
            # Construct the full path for the .jpg file
            jpg_file_path = os.path.join(jpg_directory, jpg_file_name)
            # Convert the image to RGB mode (JPEG doesn't support alpha channel as in RGBA)
            rgb_image = image.convert('RGB')
            # Save the image in JPEG format
            rgb_image.save(jpg_file_path, 'JPEG')

print("Conversion from TIFF to JPEG completed!")
"""

'#Changing format to images, we should decide if it\'s better to use .tif or .jpg\n\n# Directory containing the .tif files\ntiff_directory = \'Data\'\n# Directory where you want to save the .jpg files\njpg_directory = \'Data jpg\'\n\n# Create the jpg directory if it doesn\'t exist\nif not os.path.exists(jpg_directory):\n    os.makedirs(jpg_directory)\n\n# Loop through all the .tif files in the directory\nfor tiff_file in os.listdir(tiff_directory):\n    if tiff_file.endswith(\'.tif\'):\n        # Construct the full file path\n        tiff_file_path = os.path.join(tiff_directory, tiff_file)\n        # Open the .tif file\n        with Image.open(tiff_file_path) as image:\n            # Remove the file extension and add .jpg\n            jpg_file_name = os.path.splitext(tiff_file)[0] + \'.jpg\'\n            # Construct the full path for the .jpg file\n            jpg_file_path = os.path.join(jpg_directory, jpg_file_name)\n            # Convert the image to RGB mode (JPEG doesn\'t support 

In [100]:
# NOTE: as written, this whole cell is a single bare string literal, so the
# ImageDataGenerator setup below does not execute.
# NOTE(review): the recorded cell output suggests an earlier version of this
# cell ran the generator setup - confirm which version is current.
"""data_dir = 'Data'

# Initialize the data generator with no augmentation, just rescaling
datagen = ImageDataGenerator(rescale=1./255)

# Create a generator that will read the images found at the directory, and indefinitely generate
# batches of image data (in this case, it will be batches of a single image)
data_gen = datagen.flow_from_directory(
    directory=data_dir,
    classes=['.'],  # Treat current directory as containing all images of a single class
    batch_size=4,  # You can set this to any size you want depending on how many images you want to load at once
    target_size=(256, 256),  # Resize images on the fly to a specified size
    class_mode=None,  # Because we know we don't have labeled data
    shuffle=False  # Since order might be important, it's turned off
)

# You can then use data_gen as an iterator to get your images
# For example, to get the first image:
image = next(data_gen)
"""

Found 2442 images belonging to 1 classes.


'\n# You can then use data_gen as an iterator to get your images\n# For example, to get the first image:\nimage = next(data_gen)\n'