In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import joblib
import os

# Pipeline

In [4]:
df = pd.read_csv("D:\\Files from MSI 2020\\Natalya Sochilova\\Basic Tests\\all_diagnosis.csv", encoding='latin-1')

df[df['Vid_donacii'] == 'trombocity']

Unnamed: 0,Donory_n_633,POL,Vozrast,Vid_donacii,krov,plazma,trombocity,obsledovania,Tip_donacii,HGB,...,HB_nizkoj_plotnosti...45,Syvorotocnoe_zhelezo...46,OZHSS...47,Ferritin...48,Transferrin...49,Nasysenie_transferrina_zhelezomti...50,Neobhodimost_doobsledovania,Neobhodimost_reabilitacii,donatcia_krovi_i_komponentov_do_obsledovannya,Diagnosis
3,Klevcova O. G.,F,40,trombocity,0,1,9,0,kadrovyj,137,...,1,0,0,1,1,0,1,1,1,Iron deficiency anemia (IDA)
10,Klocev I. S.,M,39,trombocity,1,2,6,0,kadrovyj,164,...,0,0,0,0,0,0,1,0,1,Normal
26,Tolstokorova T. V.,F,58,trombocity,0,0,9,0,kadrovyj,120,...,1,1,0,0,0,1,1,1,1,Iron deficiency anemia (IDA)
27,Boriskina I. M.,F,40,trombocity,3,0,0,0,kadrovyj,120,...,0,0,0,0,0,0,1,0,1,Normal
46,Antonova N. I.,F,39,trombocity,0,0,8,0,kadrovyj,125,...,0,0,0,1,1,0,0,1,1,Iron deficiency anemia (IDA)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,Babushkin A. V.,M,50,trombocity,0,5,3,0,kadrovyj,141,...,0,1,0,1,1,0,1,1,1,Iron deficiency anemia (IDA)
595,Lulin V. V.,M,37,trombocity,0,0,8,0,kadrovyj,153,...,0,1,0,0,0,1,0,1,1,Iron deficiency anemia (IDA)
598,Boarshin V. A.,M,43,trombocity,1,2,0,0,kadrovyj,160,...,0,0,0,0,0,0,0,0,1,Normal
607,Vetrov V. V.,M,46,trombocity,0,1,3,0,kadrovyj,168,...,0,0,0,0,0,0,0,0,1,Normal


In [5]:
# Separate predictors (X) and target (y)
X = df[['POL', 'RET_HE', 'Mikrocity' ,'RBC' , 'RDW_SD' , 'MCV' , 'MCH' ,'RDW_CV' , 'Faktor_mikrocitarnoj_anemii' , 'Hb_nizkoj_plotnosti']] 
y = df['Diagnosis']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Define column transformer for preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = ['POL']  # Replace with your categorical feature column(s)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

In [7]:
# Define SVM classifier pipeline
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

In [8]:
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],  # Regularization parameter
    'classifier__kernel': ['linear', 'rbf'],  # Kernel type
    'classifier__gamma': ['scale', 'auto'],  # Kernel coefficient
    'classifier__class_weight': ['balanced', None],  # Adjusting for class imbalance if any
}

In [9]:
# Initialize GridSearchCV
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found:")
print(grid_search.best_params_)
print("Best cross-validation score:")
print(grid_search.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters found:
{'classifier__C': 1, 'classifier__class_weight': None, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
Best cross-validation score:
0.6602019025431954


In [10]:
# Predict on the test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.6377952755905512


In [11]:
print(classification_report(y_test, y_pred))

                              precision    recall  f1-score   support

Iron deficiency anemia (IDA)       0.67      0.85      0.75        82
                      Normal       0.48      0.24      0.32        45

                    accuracy                           0.64       127
                   macro avg       0.58      0.55      0.54       127
                weighted avg       0.60      0.64      0.60       127



In [12]:
#Save, if needed

import joblib

best_svm_estimator = grid_search.best_estimator_

filename = os.getcwd() + '\\tuned_svm_classifier_trombocity.sav'
joblib.dump(best_svm_estimator, filename)

['D:\\Files from MSI 2020\\Natalya Sochilova\\Trombocity\\tuned_svm_classifier_trombocity.sav']

# Precision
Precision is a measure of the accuracy of positive predictions made by the model. It answers the question: "Out of all the instances predicted as positive, how many are actually positive?"



Precision = TP/TP+FP 

True Positives (TP): The number of instances correctly predicted as positive (e.g., correctly predicted as "sick" when they are actually "sick").
False Positives (FP): The number of instances incorrectly predicted as positive (e.g., predicted as "sick" when they are actually "healthy").
High precision indicates that when the model predicts a positive class, it is usually correct. It is essential in applications where the cost of false positives is high.


Recall
=

Recall:
Recall (also known as sensitivity or true positive rate) measures the ability of the model to correctly identify positive instances. It answers the question: "Out of all the actual positive instances, how many did the model correctly identify?"

Mathematically, recall is defined as:

Recall = TP/TP +FN
​
 

True Negatives (TN): The number of instances correctly predicted as negative (e.g., correctly predicted as "healthy" when they are actually "healthy").
False Negatives (FN): The number of instances incorrectly predicted as negative (e.g., predicted as "healthy" when they are actually "sick").

High recall indicates that the model is able to identify a large proportion of positive instances correctly. It is crucial in applications where missing positive instances (false negatives) is costly.

Interpretation:
Precision and recall are inversely related; improving one typically reduces the other. This trade-off is common in machine learning models.

F1-score
=

F1-score: Sometimes, a combined metric like the F1-score (harmonic mean of precision and recall) is used to balance precision and recall:

𝐹1-score = Precision x Recall / Precision + Recall

The F1-score provides a single metric to evaluate both precision and recall, useful when you need to balance both metrics.

Support
=

Support is the number of actual occurrences of the class in the specified dataset.

# Load and make new predictions

In [13]:
# Load the saved model
loaded_model = joblib.load(os.getcwd() + "\\tuned_svm_classifier_trombocity.sav")

# Define the names of the variables 
variable_names = ['POL', 'RET_HE', 'Mikrocity' ,'RBC' , 'RDW_SD' , 'MCV' , 'MCH' ,'RDW_CV' , 'Faktor_mikrocitarnoj_anemii' , 'Hb_nizkoj_plotnosti']

# Define which variables are numeric and which are categorical
numeric_variables = ['RET_HE', 'Mikrocity' ,'RBC' , 'RDW_SD' , 'MCV' , 'MCH' ,'RDW_CV' , 'Faktor_mikrocitarnoj_anemii' , 'Hb_nizkoj_plotnosti']

categorical_variables = ['POL']

# Function to prompt user for input values for each variable
def get_user_input(variable_names, numeric_variables, categorical_variables):
    user_input = {}
    for var in variable_names:
        while True:
            value = input(f"Please enter value for '{var}': ")
            if var in numeric_variables:
                try:
                    user_input[var] = float(value)
                    break
                except ValueError:
                    print(f"Invalid input for '{var}'. Please enter a numeric value.")
            elif var in categorical_variables:
                if value in ['M', 'F']:
                    user_input[var] = value
                    break
                else:
                    print(f"Invalid input for '{var}'. Please enter 'M' or 'F'.")
            else:
                user_input[var] = value
                break
    return user_input

# Get user input
user_input = get_user_input(variable_names, numeric_variables, categorical_variables)

# Convert user input to a DataFrame
input_df = pd.DataFrame([user_input])

# Use the loaded model to make predictions
prediction = loaded_model.predict(input_df)

# Output the prediction
print("Predicted output:", prediction[0])

KeyboardInterrupt: Interrupted by user