# Data Preprocessing

In [24]:
import pandas as pd

data = pd.read_csv('TestPad_PCB_XYRGB_V2.csv')
data['Intensity'] = (data['R'] + data['G'] + data['B']) / 3

In [25]:
# For now we will only use RGB
features = ['R', 'G', 'B']
X = data[features]
y = data['Grey']

# Splitting Data

In [26]:
from sklearn.model_selection import train_test_split

#Using stratify=True as data will be imbalanced otherwise
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)
print("Class distribution in training set:\n", y_train.value_counts())
print("---")
print("Class distribution in testing set:\n", y_test.value_counts())

Class distribution in training set:
 Grey
0    569319
1      9522
Name: count, dtype: int64
---
Class distribution in testing set:
 Grey
0    142330
1      2381
Name: count, dtype: int64


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

logreg = LogisticRegression(max_iter=100)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print('---')
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('---')
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# ROC AUC Score
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print('---')
print("ROC AUC Score:", roc_auc)

Accuracy: 0.9835465168508268
---
Confusion Matrix:
 [[142330      0]
 [  2381      0]]
---
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99    142330
           1       0.00      0.00      0.00      2381

    accuracy                           0.98    144711
   macro avg       0.49      0.50      0.50    144711
weighted avg       0.97      0.98      0.98    144711

---
ROC AUC Score: 0.9553975855071531
