<a href="https://colab.research.google.com/github/FarrelAD/Computer-Vision-Case-Method/blob/main/notebooks/Starter_Notebook.ipynb" target="_parent">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import os
import random
import shutil
import csv
import cv2
from PIL import Image, ExifTags
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Hello world!


# 1. Data Cleaning and Transformation

In [None]:
# Normalise every readable image in RAW_DIR to a fixed-size RGB JPEG in CLEAN_DIR.
RAW_DIR = "raw_data"
CLEAN_DIR = "clean_data"
os.makedirs(CLEAN_DIR, exist_ok=True)

target_size = (224, 224)

for filename in os.listdir(RAW_DIR):
    try:
        bgr = cv2.imread(os.path.join(RAW_DIR, filename))

        # cv2.imread signals failure by returning None instead of raising.
        if bgr is None:
            print(f"❌ Corrupt or unreadable: {filename}")
            continue

        # Resize first, then swap channels: cv2 loads BGR, while the array is
        # handed to PIL below, which interprets it as RGB.
        rgb = cv2.cvtColor(cv2.resize(bgr, target_size), cv2.COLOR_BGR2RGB)

        # Output keeps the original stem but always uses a .jpg extension.
        stem, _ = os.path.splitext(filename)
        Image.fromarray(rgb).save(
            os.path.join(CLEAN_DIR, stem + ".jpg"), "JPEG", quality=95
        )
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"⚠️ Error processing {filename}: {e}")


# 2. Data Labelling

## 2.1 Method 1

In [None]:
clean_dir = "clean_data"
auto_labels = {}


def guess_label(filepath):
    """Guess whether the image at *filepath* is a screenshot ("fake") or not.

    Two heuristics are applied in order:

    1. Any EXIF value whose string form contains "Screenshot" -> "fake".
    2. A width/height ratio strictly between 1.5 and 2.0 -> "fake".

    Anything else is labelled "real". If the file cannot be opened or
    inspected, "unknown" is returned instead of raising.

    NOTE: both heuristics depend on the file's original metadata and
    dimensions. Images produced by the cleaning step are resized to a fixed
    224x224 and re-encoded, so they carry neither signal.

    Args:
        filepath: Path of the image file to inspect.

    Returns:
        One of "fake", "real" or "unknown".
    """
    try:
        # The context manager closes the underlying file handle even when
        # one of the checks below raises.
        with Image.open(filepath) as image:
            exif = image.getexif()
            # Check EXIF metadata
            if any("Screenshot" in str(v) for v in exif.values()):
                return "fake"

            # Check aspect ratio (screenshots are usually wide)
            w, h = image.size
            ratio = w / h
            return "fake" if 1.5 < ratio < 2.0 else "real"
    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return "unknown"

# The cleaned copies are all resized to 224x224 and re-encoded without EXIF,
# so running the heuristic on them would label every image "real". Inspect
# the original file instead and record the result under the cleaned name
# (same stem, ".jpg" extension) that the rest of the pipeline uses.
raw_dir = "raw_data"

for raw_name in os.listdir(raw_dir):
    clean_name = os.path.splitext(raw_name)[0] + ".jpg"
    # Only label images that actually made it through cleaning.
    if not os.path.isfile(os.path.join(clean_dir, clean_name)):
        continue
    auto_labels[clean_name] = guess_label(os.path.join(raw_dir, raw_name))

# Cleaned files with no matching original still get a row, marked "unknown".
for file in os.listdir(clean_dir):
    auto_labels.setdefault(file, "unknown")

# Save auto-labeled results
pd.DataFrame(
    list(auto_labels.items()), columns=["filename", "auto_label"]
).to_csv("auto_labels.csv", index=False)


## 2.2 Method 2

In [None]:
# Interactive labelling: show each cleaned image and record the key pressed.
# NOTE(review): cv2.imshow needs a local GUI session; hosted notebooks
# (e.g. Colab) typically do not support it -- confirm the target environment.
DATA_DIR = "clean_data"
LABEL_CSV = "labels.csv"

# Key codes accepted by the prompt, mapped to the label written to the CSV.
KEY_TO_LABEL = {ord('r'): "real", ord('f'): "fake"}

try:
    # UTF-8 is written explicitly because the file is read back later with
    # pandas, whose default encoding is UTF-8.
    with open(LABEL_CSV, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["filename", "label"])

        for file in os.listdir(DATA_DIR):
            path = os.path.join(DATA_DIR, file)
            img = cv2.imread(path)
            # cv2.imread returns None for unreadable files; imshow would
            # raise on it and abort the whole labelling session.
            if img is None:
                print(f"Skipping unreadable file: {file}")
                continue

            cv2.imshow("Image", img)
            print(f"Label this image: {file}")
            print("[r] Real, [f] Fake, [s] Skip")

            # Keep only the low byte so modifier/platform bits in the
            # returned key code do not defeat the comparisons below.
            key = cv2.waitKey(0) & 0xFF
            if key in KEY_TO_LABEL:
                writer.writerow([file, KEY_TO_LABEL[key]])
            elif key == ord('s'):
                continue
            else:
                # Any other key ends the session early.
                break
finally:
    # Close the preview window even if labelling stopped with an error.
    cv2.destroyAllWindows()


# 3. Split Dataset

In [None]:
# Split the labelled images 80/10/10 into train/val/test (stratified by label)
# and copy them into dataset/<subset>/<label>/.

# A fixed seed makes the split reproducible between runs.
SPLIT_SEED = 42

df = pd.read_csv("labels.csv")
train_df, holdout_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=SPLIT_SEED
)
# The 20% hold-out is halved into validation and test sets.
val_df, test_df = train_test_split(
    holdout_df, test_size=0.5, stratify=holdout_df['label'],
    random_state=SPLIT_SEED,
)

for subset_name, subset_df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    # Remove copies left over from a previous run; otherwise an image that
    # was assigned to a different subset earlier would remain there and leak
    # between train/val/test.
    shutil.rmtree(f"dataset/{subset_name}", ignore_errors=True)

    for label in subset_df['label'].unique():
        os.makedirs(f"dataset/{subset_name}/{label}", exist_ok=True)

    for filename, label in zip(subset_df["filename"], subset_df["label"]):
        src = os.path.join("clean_data", filename)
        dst = os.path.join(f"dataset/{subset_name}/{label}", filename)
        shutil.copy(src, dst)

Preview four randomly chosen images from the training split.

In [None]:
# Show a 2x2 grid of randomly picked training images, titled by class folder.
folders = ["dataset/train/real", "dataset/train/fake"]
plt.figure(figsize=(10, 5))

for slot in range(1, 5):
    # Pick a class folder first, then a file inside it.
    folder = random.choice(folders)
    picked = random.choice(os.listdir(folder))

    # cv2 loads BGR; convert so matplotlib shows the colours correctly.
    rgb = cv2.cvtColor(cv2.imread(os.path.join(folder, picked)), cv2.COLOR_BGR2RGB)

    plt.subplot(2, 2, slot)
    plt.imshow(rgb)
    plt.title(folder.split('/')[-1])  # last path component is the class name
    plt.axis('off')

plt.show()


# 3. Data Preprocessing

In [None]:
def preprocess_basic(img_path, target_size=(224, 224)):
    """Load an image as a resized grayscale array scaled by 1/255.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.
        target_size: Size passed to ``cv2.resize``.

    Returns:
        The grayscale image divided by 255.0, or ``None`` when
        ``cv2.imread`` could not read the file.
    """
    bgr = cv2.imread(img_path)
    if bgr is not None:
        # Grayscale first, then resize so every image has the same shape.
        resized = cv2.resize(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), target_size)
        # Scale pixel values down by 255.
        return resized / 255.0
    return None

def enhance_text_region(gray):
    """Return a binarised, lightly de-noised version of a grayscale image.

    Args:
        gray: Grayscale array; it is multiplied by 255 and cast to uint8,
            so it is presumably expected to hold values in the 0-1 range.

    Returns:
        The uint8 image after histogram equalisation, Gaussian adaptive
        thresholding and a morphological open.
    """
    as_uint8 = (gray * 255).astype(np.uint8)

    # Boost contrast before thresholding.
    equalized = cv2.equalizeHist(as_uint8)

    # Binarise with a Gaussian-weighted local threshold (block size 15, C=8).
    thresholded = cv2.adaptiveThreshold(
        equalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 8
    )

    # A 2x2 opening removes small speckles.
    return cv2.morphologyEx(thresholded, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8))

def extract_receipt_features(gray_img):
    """Compute three summary features of a grayscale image.

    Args:
        gray_img: Grayscale array; it is multiplied by 255 and cast to uint8
            for edge detection, so it is presumably expected to hold values
            in the 0-1 range.

    Returns:
        A list ``[edge_density, mean_intensity, std_intensity]`` where
        edge_density is the fraction of non-zero Canny edge pixels and the
        other two are the mean and standard deviation of ``gray_img``.
    """
    as_uint8 = (gray_img * 255).astype(np.uint8)
    edge_map = cv2.Canny(as_uint8, 100, 200)

    # Fraction of pixels that Canny marked as an edge.
    edge_fraction = (edge_map > 0).sum() / edge_map.size

    return [edge_fraction, np.mean(gray_img), np.std(gray_img)]


# 4. Model Training

# 5. Model Evaluation

# 6. Model Deployment