In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_auc_score, accuracy_score

# Load the training data. The path is relative to the working directory the
# notebook kernel was started from.
TRAIN_CSV_PATH = "./data_train.csv"
data_frame = pd.read_csv(TRAIN_CSV_PATH)

In [2]:
###########################################################
## Pipeline
###########################################################

from utils.categorical_encoder import CategoricalEncoder
# NOTE(review): CategoricalEncoder presumably replaces an earlier manual
# encoding of SEX ('M' -> 1, 'F' -> 0) -- confirm against
# utils/categorical_encoder.

# Dropped HAEMATOCRIT and MCH (correlation)
numerical_features = ['HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 'THROMBOCYTE', 'MCHC', 'MCV', 'AGE']
categorical_features = ['SEX']

# Binary target: 'out' -> 0, 'in' -> 1.
label_mapping = {'out': 0, 'in': 1}
y = data_frame['SOURCE'].map(label_mapping)

# Series.map turns every value that is not a key of the mapping (including
# missing values) into NaN. Fail fast with the offending labels instead of
# letting NaN targets reach the stratified split / the classifier.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(
        data_frame.loc[unmapped_rows, 'SOURCE'].astype(str).unique()
    )
    raise ValueError(
        "Unexpected or missing SOURCE labels (expected one of {!r}): {!r}".format(
            sorted(label_mapping), unexpected_labels
        )
    )

X = data_frame.drop(columns=['SOURCE'])

# Hold out 20% of the rows for evaluation; stratify on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the numerical columns, encode the categorical column, and drop every
# other column of X (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat',  CategoricalEncoder(), categorical_features)
    ],
    remainder='drop'
)

# Preprocessing and classifier live in one Pipeline so that the transformers
# are fitted only on the data passed to pipeline.fit.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=1))
])

In [3]:
###########################################################
## Train/ Test
###########################################################

# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

# Hard class predictions for the held-out split.
y_pred = pipeline.predict(X_test)

# predict_proba returns one column per class; keep column 1 as the score
# used for the AUC.
class_probabilities = pipeline.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

In [4]:
###########################################################
## Evaluate
###########################################################

# Compute every metric first, then report them in a fixed order.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

metric_report = (
    ("Accuracy:", accuracy),
    ("AUC:", auc),
    ("Confusion Matrix:\n", conf_matrix),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

# Layout of the printed matrix (rows = true class, columns = predicted class):
#
# [[TN FP]
#  [FN TP]]

Accuracy: 0.7115869017632241
AUC: 0.7496986821046807
Confusion Matrix:
 [[414  59]
 [170 151]]


In [5]:
###########################################################
## Save Pipeline/Classifier
###########################################################
import joblib

# Persist the whole pipeline (preprocessor + classifier) as a single artifact.
# NOTE(review): the pipeline contains CategoricalEncoder, so loading this file
# presumably requires utils.categorical_encoder to be importable -- confirm.
model_path = 'pipeline_classifier.pkl'
joblib.dump(pipeline, model_path)

['pipeline_classifier.pkl']