<a href="https://colab.research.google.com/github/SunbalAzizLCWU/BSSE-DS-Project/blob/main/SunbalW5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

# Setup

print("--- Part 1: Setting up Kaggle and Downloading Dataset ---")


!pip install kaggle

# (Ensure kaggle.json is uploaded to your Colab environment)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("Kaggle API token installed.")

!kaggle datasets download -d asdasdasasdas/garbage-classification
print("\nDataset downloaded. Unzipping...")

!unzip -q garbage-classification.zip
print("Dataset unzipped. Ready for preprocessing.")

--- Part 1: Setting up Kaggle and Downloading Dataset ---
Kaggle API token installed.
Dataset URL: https://www.kaggle.com/datasets/asdasdasasdas/garbage-classification
License(s): copyright-authors
Downloading garbage-classification.zip to /content
 73% 60.0M/82.0M [00:00<00:00, 596MB/s]
100% 82.0M/82.0M [00:00<00:00, 634MB/s]

Dataset downloaded. Unzipping...
Dataset unzipped. Ready for preprocessing.


In [3]:
# Linear Regression

print("\n--- Part 2: Executing Week 5 Class Task (Linear Regression Demo) ---")

# Import libraries for the class task
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 1. Loading the data
diabetes = load_diabetes()
X_diabetes = diabetes.data
y_diabetes = diabetes.target

# 2. Spliting the data
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_diabetes, y_diabetes, test_size=0.2, random_state=42)

# 3. Creating and train the model
model_lr = LinearRegression()
model_lr.fit(X_train_lr, y_train_lr)

# 4. Making predictions
y_pred_lr = model_lr.predict(X_test_lr)

# 5. Evaluating the model
mae = mean_absolute_error(y_test_lr, y_pred_lr)
rmse = np.sqrt(mean_squared_error(y_test_lr, y_pred_lr))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print("Class Task (Linear Regression Demo) complete.")


--- Part 2: Executing Week 5 Class Task (Linear Regression Demo) ---
Mean Absolute Error (MAE): 42.79
Root Mean Squared Error (RMSE): 53.85
Class Task (Linear Regression Demo) complete.


In [4]:
# Adapted Baseline Model

print("\n--- Part 3: Executing Week 5 Project Assignment (Baseline Classification Model) ---")

# Import libraries for the project assignment
import os
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Defining Constants and Preprocessing Function ---

# Defining constants
data_dir = 'Garbage classification/Garbage classification'
classes = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
IMG_SIZE = 64 # Resize images to 64x64 for a fast baseline

# Lists to hold our data
X_data = [] # This will hold the flattened image data
y_data = [] # This will hold the labels

def preprocess_images():
    """
    Loops through all images, resizes them, flattens them,
    and returns two numpy arrays: X (data) and y (labels).
    """
    print(f"\nStarting image preprocessing from {data_dir}...")
    for class_name in classes:
        class_dir_path = os.path.join(data_dir, class_name)
        # Convert class name (e.g., 'paper') to a number (e.g., 3)
        class_label = classes.index(class_name)

        if not os.path.isdir(class_dir_path):
            print(f"Warning: Directory not found {class_dir_path}")
            continue

        for image_file in os.listdir(class_dir_path):
            image_path = os.path.join(class_dir_path, image_file)
            try:
                # Open image, convert to grayscale, and resize
                img = Image.open(image_path).convert('L') # 'L' = grayscale
                img_resized = img.resize((IMG_SIZE, IMG_SIZE))

                # Flatten the 64x64 image into a 1D vector of 4096 pixels
                img_vector = np.array(img_resized).flatten()

                X_data.append(img_vector)
                y_data.append(class_label)

            except Exception as e:
                # Skip corrupted files [cite: 216-217, 621-622, 1081-1083]
                print(f"Skipping corrupted file: {image_path} | Error: {e}")

    print("Image preprocessing complete.")
    return np.array(X_data), np.array(y_data)

# Loading, Preprocessing, and Scaling Data ---

# Run the function
X, y = preprocess_images()

print(f"\nData shape (X): {X.shape}") # Should be (2527, 4096)
print(f"Labels shape (y): {y.shape}") # Should be (2527,)

# Scale the data (very important for Logistic Regression)
print("\nScaling data (StandardScaler)...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data scaling complete.")

# Train/Test Split ---
print("\nSplitting data into 80% train and 20% test sets...")
# We use stratify=y to ensure the class imbalance is preserved in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training images: {X_train.shape[0]}")
print(f"Testing images: {X_test.shape[0]}")

# Train Baseline Model (Logistic Regression) ---
print("\nTraining baseline model (Logistic Regression)...")
# We increase max_iter because 4096 features is a lot
# This is our adapted model for the baseline *classification* task [cite: 582-583]
baseline_model = LogisticRegression(max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate Baseline Model ---
print("\nEvaluating baseline model on the test set...")
y_pred = baseline_model.predict(X_test)

# Calculate Accuracy (the correct metric for classification)
accuracy = accuracy_score(y_test, y_pred)

print("\n=======================================================")
print("     Week 5 Assignment: Baseline Model Output")
print("=======================================================")
print(f"Model: Logistic Regression (on {IMG_SIZE}x{IMG_SIZE} flattened pixels)")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print a detailed report
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=classes))

print("\n--- Week 5 Tasks Complete ---")


--- Part 3: Executing Week 5 Project Assignment (Baseline Classification Model) ---

Starting image preprocessing from Garbage classification/Garbage classification...
Image preprocessing complete.

Data shape (X): (2527, 4096)
Labels shape (y): (2527,)

Scaling data (StandardScaler)...
Data scaling complete.

Splitting data into 80% train and 20% test sets...
Training images: 2021
Testing images: 506

Training baseline model (Logistic Regression)...
Model training complete.

Evaluating baseline model on the test set...

     Week 5 Assignment: Baseline Model Output
Model: Logistic Regression (on 64x64 flattened pixels)
Test Accuracy: 31.42%

Detailed Classification Report:
              precision    recall  f1-score   support

   cardboard       0.35      0.30      0.32        81
       glass       0.26      0.26      0.26       100
       metal       0.19      0.18      0.19        82
       paper       0.50      0.46      0.48       119
     plastic       0.30      0.33      0.31  