# Data Preprocessing

In [28]:
import pandas as pd

# Load the raw table from a CSV that sits next to the notebook (relative path).
data = pd.read_csv(filepath_or_buffer='TestPad_PCB_XYRGB_V2.csv')

In [29]:
# Predictors are limited to the three colour-channel columns for now;
# the prediction target is the 'Grey' column.
features = ['R', 'G', 'B']
X = data.loc[:, features]
y = data.loc[:, 'Grey']

# Splitting Data

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# stratify=y keeps the class ratio of the target the same in both splits,
# which matters because the classes are heavily imbalanced.
# random_state pins the shuffle so that Restart & Run All reproduces the
# exact same split (and therefore the same downstream metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
print("Class distribution in training set:\n", y_train.value_counts())
print("---")
print("Class distribution in testing set:\n", y_test.value_counts())

Class distribution in training set:
 Grey
0    569319
1      9522
Name: count, dtype: int64
---
Class distribution in testing set:
 Grey
0    142330
1      2381
Name: count, dtype: int64


In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Fit a decision tree on the training split and evaluate it on the held-out split.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split, so ties between equally good splits could otherwise be broken
# differently from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Hard class predictions drive accuracy, the confusion matrix and the report.
y_pred = dtree.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ROC AUC Score
# Uses the predicted probability of the positive class (column 1 of predict_proba).
y_pred_proba = dtree.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC AUC Score:", roc_auc)

# NOTE(review): a perfect score on every metric usually means the target is a
# deterministic function of the features (or leaks into them) — confirm how
# 'Grey' is derived from R/G/B before trusting these numbers.


Accuracy: 1.0
Confusion Matrix:
 [[142330      0]
 [     0   2381]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    142330
           1       1.00      1.00      1.00      2381

    accuracy                           1.00    144711
   macro avg       1.00      1.00      1.00    144711
weighted avg       1.00      1.00      1.00    144711

ROC AUC Score: 1.0
