# 1. Data Preprocessing

In [None]:
# Record the Python interpreter version used for this run
!python --version

Python 3.10.11


In [None]:
import sklearn
# Record the scikit-learn version used for this run
sklearn.__version__

'1.2.2'

In [None]:
import cv2
# Record the OpenCV version used for this run
cv2.__version__

'4.7.0'

In [None]:
import numpy
# Record the NumPy version used for this run
numpy.__version__

'1.22.4'

In [None]:
import pandas
# Record the pandas version used for this run
pandas.__version__

'1.5.3'

## 1.1 Access Data

In [None]:
from google.colab import drive
import pandas as pd
import os


# Mount Google Drive into the Colab runtime so the dataset folder is reachable
drive.mount('/content/drive')
# Directory whose files are listed and read by image_construction()
img_data = '/content/drive/MyDrive/GasDetectionImages'

Mounted at /content/drive


## 1.2 Preprocessing Functions

In [None]:
from sklearn.preprocessing import MinMaxScaler
import cv2

def preprocess_image(image):
    """Turn a BGR image into a 64x64 single-channel edge map.

    Steps, in order: grayscale conversion, 3x3 Gaussian smoothing,
    3x3 Laplacian filtering with 8-bit output, resize to 64x64.

    Args:
        image: Image array in OpenCV BGR channel order.

    Returns:
        The filtered image resized to 64x64.
    """
    # Collapse the colour channels into a single gray channel
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Smooth with a 3x3 Gaussian kernel to reduce noise before edge detection
    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    # Enhance edges with a 3x3 Laplacian.
    # NOTE(review): with an 8-bit unsigned output depth, negative Laplacian
    # responses presumably saturate to 0 - confirm this is intended.
    edges = cv2.Laplacian(smoothed, cv2.CV_8U, ksize=3)
    # Bring every image to the same 64x64 size
    return cv2.resize(edges, (64, 64))

# Normalization
def normalize(X):
    """Rescale X feature-wise with a freshly fitted MinMaxScaler.

    Args:
        X: 2-D array-like of samples (rows) by features (columns).

    Returns:
        The scaled array produced by ``MinMaxScaler().fit_transform(X)``.
    """
    # NOTE(review): the scaler is fit on everything passed in; if this is
    # called before the train/test split, test samples influence the scaling.
    return MinMaxScaler().fit_transform(X)

## 1.3 Data Construction Functions

In [None]:
import numpy as np

# Class names. A sample's integer label is the index of its category in this
# list, and the same order is used as target names in the evaluation report.
categories = ['NoGas', 'Mixture', 'Perfume', 'Smoke']


def image_construction():
  """Build the scaled feature matrix and label vector from ``img_data``.

  Every file whose name contains one of the ``categories`` is read,
  preprocessed, flattened to a 1-D vector and labelled with the index of the
  first matching category. Files matching no category, and files OpenCV
  cannot decode, are skipped.

  Returns:
    A tuple ``(scaled_X, y)``: the normalized feature matrix (one row per
    image) and a NumPy array of integer labels.

  Raises:
    ValueError: if no usable image was found in ``img_data``.
  """
  X, y = [], []
  # os.listdir returns entries in arbitrary order; sorting keeps the sample
  # order (and therefore any seeded split downstream) reproducible.
  for file_name in sorted(os.listdir(img_data)):
    # The first category whose name occurs in the file name wins
    label = next(
        (j for j, category in enumerate(categories) if category in file_name),
        None)
    if label is None:
      continue
    image = cv2.imread(os.path.join(img_data, file_name))
    if image is None:
      # cv2.imread signals an unreadable file by returning None, not raising
      print('Skipping unreadable image:', file_name)
      continue
    X.append(preprocess_image(image).flatten())
    y.append(label)
  if not X:
    raise ValueError('No usable images found in ' + str(img_data))
  scaled_X = normalize(X)
  return scaled_X, np.array(y)


## 1.4 Data Split Functions

In [None]:
from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.2, random_state=42):
  """Split features and labels into training and testing sets.

  Args:
    X: Feature matrix, one row per sample.
    y: Labels aligned with the rows of ``X``.
    test_size: Fraction (or count) of samples held out for testing;
      defaults to the previously hard-coded 0.2.
    random_state: Seed for the shuffle; defaults to the previously
      hard-coded 42 so existing calls produce the same split.

  Returns:
    A tuple ``(X_train, X_test, y_train, y_test)``.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  return X_train, X_test, y_train, y_test

def print_data(X,y, X_train, X_test, y_train, y_test):
  """Print the full data and labels, then the shapes of the four splits.

  Args:
    X: Complete feature matrix.
    y: Complete label vector.
    X_train, X_test, y_train, y_test: The split arrays; only their shapes
      are printed.
  """
  # Full arrays: a header line with the shape, then the values themselves
  for heading, values in (('X', X), ('\ny', y)):
    print(heading, values.shape, ':')
    print('\n', values)

  # Shapes only for the splits; the first label carries the separating newline
  split_parts = (
      ('\nX_train:', X_train),
      ('X_test:', X_test),
      ('y_train:', y_train),
      ('y_test:', y_test),
  )
  for label, part in split_parts:
    print(label, part.shape)

# 2. Evaluation Function

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate(y_test, y_pred, model):
  """Print a classification report, confusion matrix and summary metrics.

  The "overall" recall, precision and F1 score are the support-weighted
  averages from the classification report, rounded to two decimals like the
  report itself.

  Args:
    y_test: True labels.
    y_pred: Predicted labels, aligned with ``y_test``.
    model: Name of the model, printed as a heading.
  """
  accuracy = accuracy_score(y_test, y_pred)
  report = classification_report(y_test, y_pred, target_names=categories)
  # Fetch the same report as a dict so the averages are looked up by name.
  # Reading them by position from the formatted text previously swapped
  # precision and F1 (the row order is precision, recall, f1-score, support).
  weighted_avg = classification_report(
      y_test, y_pred, target_names=categories, output_dict=True)['weighted avg']

  print("Model: ", model)
  print("Classification Report:")
  print(report)
  print("Confusion Matrix:")
  print('')
  print(confusion_matrix(y_test, y_pred))
  print('')
  print("Accuracy:", accuracy)

  # Overall (weighted-average) recall, precision and F1 score
  recall = round(weighted_avg['recall'], 2)
  precision = round(weighted_avg['precision'], 2)
  f1 = round(weighted_avg['f1-score'], 2)
  print("Overall Recall:", recall)
  print("Overall Precision:", precision)
  print("Overall F1 Score:", f1)
  return

# 3. Second Experiment: Image Data Analysis

## 3.1 Get Data

In [None]:
# Build the scaled feature matrix and integer labels from the image folder
image_X , image_y = image_construction()
# Hold out part of the data for testing (see split_data for the settings)
image_X_train, image_X_test, image_y_train, image_y_test = split_data(image_X,image_y)

# Show the arrays and the shapes of the resulting splits
print_data(image_X , image_y, image_X_train, image_X_test, image_y_train, image_y_test)

X (6407, 4096) :

 [[0.         0.         0.         ... 0.39423077 0.27777778 0.29896907]
 [0.         0.27272727 0.         ... 0.56730769 0.00925926 0.35051546]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [1.         0.         0.12195122 ... 0.03846154 0.01851852 0.03092784]
 [1.         0.27272727 0.14634146 ... 0.125      0.01851852 0.06185567]
 [0.         0.09090909 0.00813008 ... 0.         0.         0.        ]]

y (6407,) :

 [1 1 1 ... 2 2 2]

X_train: (5125, 4096)
X_test: (1282, 4096)
y_train: (5125,)
y_test: (1282,)


## 3.2 Image Classification Models

### 3.2.1 Random Forest (RF)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from joblib import parallel_backend

# Hyper-parameter grid explored by the search: tree depth and forest size
rf_param_grid = {'bootstrap': [True], 'max_depth': [5, 10, None],
               'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12,13,15,100,500]}

# 3-fold cross-validated grid search run on all cores. The forest is seeded so
# that the selected parameters and the reported scores are reproducible
# across runs (an unseeded forest gives different results on every run).
with parallel_backend('multiprocessing'):
    rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=3, n_jobs=-1)
    rf_model.fit(image_X_train, image_y_train)

# Predict on the held-out test set with the fitted search object
rf_pred = rf_model.predict(image_X_test)

print('Best Parameters : ', rf_model.best_params_, '\n')
evaluate(image_y_test, rf_pred, 'RF')

Best Parameters :  {'bootstrap': True, 'max_depth': None, 'n_estimators': 500} 

Model:  RF
Classification Report:
              precision    recall  f1-score   support

       NoGas       0.96      0.75      0.84       314
     Mixture       1.00      1.00      1.00       347
     Perfume       0.88      0.91      0.90       319
       Smoke       0.84      1.00      0.91       302

    accuracy                           0.92      1282
   macro avg       0.92      0.91      0.91      1282
weighted avg       0.92      0.92      0.91      1282

Confusion Matrix:

[[235   0  39  40]
 [  0 347   0   0]
 [ 10   0 290  19]
 [  0   0   0 302]]

Accuracy: 0.9157566302652106
Overall Recall: 0.92
Overall Precision: 0.91
Overall F1 Score: 0.92
