<a href="https://colab.research.google.com/github/SunbalAzizLCWU/BSSE-DS-Project/blob/main/SunbalW6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Dataset Setup as always
# -----------------------------------------------------------------------
print("--- Part 1: Setting up Kaggle and Downloading Dataset ---")

# 1. Install Kaggle library
!pip install kaggle

# 2. Set up the Kaggle API token
# (Ensure kaggle.json is uploaded to your Colab environment)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("Kaggle API token installed.")

# 3. Download the dataset
# This is the dataset for project 9: "Image-based Waste Classification"
!kaggle datasets download -d asdasdasasdas/garbage-classification
print("\nDataset downloaded. Unzipping...")

# 4. Unzip the file
!unzip -q garbage-classification.zip
print("Dataset unzipped. Ready for preprocessing.")

--- Part 1: Setting up Kaggle and Downloading Dataset ---
Kaggle API token installed.
Dataset URL: https://www.kaggle.com/datasets/asdasdasasdas/garbage-classification
License(s): copyright-authors
Downloading garbage-classification.zip to /content
  0% 0.00/82.0M [00:00<?, ?B/s]
100% 82.0M/82.0M [00:00<00:00, 1.43GB/s]

Dataset downloaded. Unzipping...
Dataset unzipped. Ready for preprocessing.


In [2]:
# Imports and Data Preprocessing
print("\n--- Part 2: Importing Libraries and Preprocessing Data ---")

# Import necessary libraries
import os
import numpy as np
import pandas as pd
from PIL import Image
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# --- Import Week 6 Models ---
from sklearn.linear_model import LogisticRegression # Our W5 baseline
from sklearn.tree import DecisionTreeClassifier # Class Task
from sklearn.ensemble import RandomForestClassifier # Class Task & Assignment

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Preprocessing (Identical to Week 5) ---

# Define constants
data_dir = 'Garbage classification/Garbage classification'
classes = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
IMG_SIZE = 64 # Must be same as W5 (64x64) for a fair comparison

# Lists to hold our data
X_data = [] # This will hold the flattened image data
y_data = [] # This will hold the labels

def preprocess_images():
    """
    Loops through all images, resizes them, flattens them,
    and returns two numpy arrays: X (data) and y (labels).
    """
    print(f"\nStarting image preprocessing from {data_dir}...")
    for class_name in classes:
        class_dir_path = os.path.join(data_dir, class_name)
        class_label = classes.index(class_name)

        if not os.path.isdir(class_dir_path):
            print(f"Warning: Directory not found {class_dir_path}")
            continue

        for image_file in os.listdir(class_dir_path):
            image_path = os.path.join(class_dir_path, image_file)
            try:
                # Open image, convert to grayscale, and resize
                img = Image.open(image_path).convert('L') # 'L' = grayscale
                img_resized = img.resize((IMG_SIZE, IMG_SIZE))

                # Flatten the 64x64 image into a 1D vector of 4096 pixels
                img_vector = np.array(img_resized).flatten()

                X_data.append(img_vector)
                y_data.append(class_label)

            except Exception as e:
                # Skip corrupted files [cite: 216-217]
                print(f"Skipping corrupted file: {image_path} | Error: {e}")

    print("Image preprocessing complete.")
    return np.array(X_data), np.array(y_data)

# Run the function
X, y = preprocess_images()

print(f"\nData shape (X): {X.shape}")
print(f"Labels shape (y): {y.shape}")

# Scale the data
print("\nScaling data (StandardScaler)...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data scaling complete.")

# --- Train/Test Split (Identical to Week 5) ---
print("\nSplitting data into 80% train and 20% test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training images: {X_train.shape[0]}")
print(f"Testing images: {X_test.shape[0]}")


--- Part 2: Importing Libraries and Preprocessing Data ---

Starting image preprocessing from Garbage classification/Garbage classification...
Image preprocessing complete.

Data shape (X): (2527, 4096)
Labels shape (y): (2527,)

Scaling data (StandardScaler)...
Data scaling complete.

Splitting data into 80% train and 20% test sets...
Training images: 2021
Testing images: 506


In [3]:
# Week 6 Model Training & Comparison
print("\n--- Part 3: Training and Evaluating Week 6 Models ---")

# --- Model 1: Logistic Regression (Baseline from W5) ---
print("Training Model 1: Logistic Regression...")
model_logreg = LogisticRegression(max_iter=1000, random_state=42)
model_logreg.fit(X_train, y_train)
y_pred_logreg = model_logreg.predict(X_test)
acc_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression training complete.")

# --- Model 2: Decision Tree (Class Task) ---
print("Training Model 2: Decision Tree...")
model_tree = DecisionTreeClassifier(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)
acc_tree = accuracy_score(y_test, y_pred_tree)
print("Decision Tree training complete.")

# --- Model 3: Random Forest (Class Task & Assignment) ---
print("Training Model 3: Random Forest... (This may take a minute)")
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest training complete.")


--- Part 3: Training and Evaluating Week 6 Models ---
Training Model 1: Logistic Regression...
Logistic Regression training complete.
Training Model 2: Decision Tree...
Decision Tree training complete.
Training Model 3: Random Forest... (This may take a minute)
Random Forest training complete.


In [4]:
# Final Assignment 6 Report

print("\n=======================================================")
print("      Week 6 Assignment: Model Comparison Report")
print("=======================================================")
print("Comparing accuracy of all trained models on the test set:\n")
print(f"1. Logistic Regression (Baseline): {acc_logreg * 100:.2f}%")
print(f"2. Decision Tree:                  {acc_tree * 100:.2f}%")
print(f"3. Random Forest:                  {acc_rf * 100:.2f}%")
print("\n--- Week 6 Tasks Complete ---")


      Week 6 Assignment: Model Comparison Report
Comparing accuracy of all trained models on the test set:

1. Logistic Regression (Baseline): 31.42%
2. Decision Tree:                  41.90%
3. Random Forest:                  63.64%

--- Week 6 Tasks Complete ---
