logistic regression with 5 fold cross validation and smote

In [37]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']


# 70-30 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Alpha value for regularization
alpha_value = 0.001

# Threshold for binary classification
threshold_value = 0.3

# Random state for KFold and SMOTE
random_state_value = 123

# Apply SMOTE for class imbalance
smote = SMOTE(random_state=random_state_value)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Initialize Logistic Regression model with alpha
log_reg_model = LogisticRegression(C=1/alpha_value)

# Perform 5-fold cross-validation with SMOTE
kf = KFold(n_splits=5, shuffle=True, random_state=random_state_value)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X_resampled):
    X_train, X_test = X_resampled[train_index], X_resampled[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Train Logistic Regression model
    log_reg_model.fit(X_train, y_train)

    # Predict probabilities on test data
    y_pred_proba = log_reg_model.predict_proba(X_test)[:, 1]  # Probability of positive class
    
    # Convert probabilities to binary predictions based on threshold
    y_pred_binary = (y_pred_proba > threshold_value).astype(int)

    # Evaluate Logistic Regression model
    accuracy_scores.append(accuracy_score(y_test, y_pred_binary))
    precision_scores.append(precision_score(y_test, y_pred_binary))
    recall_scores.append(recall_score(y_test, y_pred_binary))
    f1_scores.append(f1_score(y_test, y_pred_binary))

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))


Average Accuracy: 0.7251618837825735
Average Precision: 0.6671415331502913
Average Recall: 0.8987979725830971
Average F1 Score: 0.7655728053551304


after improving

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE


# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']

# Split data into 70% training and 30% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SMOTE
smote = SMOTE(random_state=20)


# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=10)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []  # List to store confusion matrices for each fold

# Adjust the threshold for binary classification
threshold = 0.9 # Adjust this threshold as needed

for train_index, test_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Apply SMOTE to the training fold only
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_fold, y_train_fold)
    
    # Train Logistic Regression model
    log_reg_model = LogisticRegression()  # No need to specify regularization parameters for now
    log_reg_model.fit(X_train_resampled, y_train_resampled)

    # Predict probabilities on validation fold
    y_pred_proba = log_reg_model.predict_proba(X_val_fold)[:, 1]  # Probability of positive class
    
    # Convert probabilities to binary predictions based on threshold
    y_pred_binary = (y_pred_proba > threshold).astype(int)

    # Evaluate Logistic Regression model
    accuracy_scores.append(accuracy_score(y_val_fold, y_pred_binary))
    precision_scores.append(precision_score(y_val_fold, y_pred_binary))
    recall_scores.append(recall_score(y_val_fold, y_pred_binary))
    f1_scores.append(f1_score(y_val_fold, y_pred_binary))
    
    # Calculate the confusion matrix for the fold
    conf_matrix = confusion_matrix(y_val_fold, y_pred_binary)
    conf_matrices.append(conf_matrix)

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))

# Print confusion matrix for each fold
for i, conf_matrix in enumerate(conf_matrices):
    print(f"Confusion Matrix for Fold {i+1}:")
    print(conf_matrix)

 


Average Accuracy: 0.8919438285291944
Average Precision: 0.7216666666666666
Average Recall: 0.24741200828157348
Average F1 Score: 0.36600684261974586
Confusion Matrix for Fold 1:
[[139   3]
 [ 17   6]]
Confusion Matrix for Fold 2:
[[136   1]
 [ 21   7]]
Confusion Matrix for Fold 3:
[[138   6]
 [ 17   4]]
Confusion Matrix for Fold 4:
[[150   0]
 [ 10   4]]
Confusion Matrix for Fold 5:
[[146   2]
 [ 12   4]]


Lasso regression with 5 fold cross validation and smote

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import Lasso
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
import warnings

# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection with Lasso regularization
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']

# 70-30 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize Lasso Regression model
lasso_model = Lasso(alpha=0.00005)  # Adjust regularization strength (alpha)

# Suppress convergence warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Perform 5-fold cross-validation with SMOTE
kf = KFold(n_splits=5, shuffle=True, random_state=16)  # Change the random state value here
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Apply SMOTE only to training data
    smote = SMOTE()
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Lasso Regression model
    lasso_model.fit(X_train_resampled, y_train_resampled)

    # Predict on test data
    y_pred = lasso_model.predict(X_test)
    y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

    # Evaluate Lasso Regression model
    accuracy_scores.append(accuracy_score(y_test, y_pred_binary))
    precision_scores.append(precision_score(y_test, y_pred_binary))
    recall_scores.append(recall_score(y_test, y_pred_binary))
    f1_scores.append(f1_score(y_test, y_pred_binary))

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))


Average Accuracy: 0.7663324918860439
Average Precision: 0.325191796737295
Average Recall: 0.6538267364683572
Average F1 Score: 0.4307915519689646


after improving

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import Lasso
from imblearn.over_sampling import SMOTE


# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']

# Split data into 70% training and 30% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SMOTE
smote = SMOTE(random_state=20)


# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=10)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []  # List to store confusion matrices for each fold

# Adjust the threshold for binary classification
threshold = 0.9 # Adjust this threshold as needed

for train_index, test_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Apply SMOTE to the training fold only
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_fold, y_train_fold)
    
    # Train Lasso Regression model
    lasso_model = Lasso(alpha=0.0005)  # Adjust regularization strength (alpha)
    lasso_model.fit(X_train_resampled, y_train_resampled)

    # Predict on validation fold
    y_pred = lasso_model.predict(X_val_fold)
    y_pred_binary = [1 if pred > threshold else 0 for pred in y_pred]

    # Evaluate Lasso Regression model
    accuracy_scores.append(accuracy_score(y_val_fold, y_pred_binary))
    precision_scores.append(precision_score(y_val_fold, y_pred_binary))
    recall_scores.append(recall_score(y_val_fold, y_pred_binary))
    f1_scores.append(f1_score(y_val_fold, y_pred_binary))
    
    # Calculate the confusion matrix for the fold
    conf_matrix = confusion_matrix(y_val_fold, y_pred_binary)
    conf_matrices.append(conf_matrix)

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))


print("Confusion Matrix for each fold:")
for i, conf_matrix in enumerate(conf_matrices):
    print(f"Fold {i+1}:")
    print(conf_matrix)
    print()

 


Average Accuracy: 0.8955875831485587
Average Precision: 0.8228571428571427
Average Recall: 0.20872153209109728
Average F1 Score: 0.3313874429663904
Confusion Matrix for each fold:
Fold 1:
[[141   1]
 [ 17   6]]

Fold 2:
[[136   1]
 [ 22   6]]

Fold 3:
[[141   3]
 [ 19   2]]

Fold 4:
[[150   0]
 [ 10   4]]

Fold 5:
[[148   0]
 [ 13   3]]



XG boost with 5 fold cross validation and smote

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']


# 70-30 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize XGBoost model
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=3)  # Adjust hyperparameters as needed

# Perform 5-fold cross-validation with SMOTE
kf = KFold(n_splits=5, shuffle=True, random_state=16)  # Change the random state value here
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Apply SMOTE only to training data
    smote = SMOTE()
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Train XGBoost model
    xgb_model.fit(X_train_resampled, y_train_resampled)

    # Predict on test data
    y_pred = xgb_model.predict(X_test)

    # Evaluate XGBoost model
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))


Average Accuracy: 0.8368265416516408
Average Precision: 0.4144263704163274
Average Recall: 0.5059967671326837
Average F1 Score: 0.4550085534736558


after improving

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import numpy as np

# Load data
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Features and target
X = cleaned_data.drop('outcome', axis=1)
y = cleaned_data['outcome']

# Scale features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=94)

# XGBoost classifier
xgb_model = XGBClassifier()

# Hyperparameter grid
params = {'learning_rate': [0.05, 0.1, 0.2],
          'max_depth': [3, 4, 5],
          'n_estimators': [100, 150, 200],
          'min_child_weight': [1, 2, 3],
          'subsample': [0.8, 0.9, 1.0],
          'colsample_bytree': [0.8, 0.9, 1.0],
          'gamma': [0, 0.5, 1]}

# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(xgb_model, param_distributions=params, n_iter=100, scoring='accuracy', cv=5, verbose=2, random_state=94, n_jobs=-1)
random_search.fit(X_train, y_train)

# Best hyperparameters
best_params = random_search.best_params_

# Train XGBoost model with best hyperparameters
xgb_model_selected = XGBClassifier(**best_params)

# Get cross-validation predictions for each fold
cv_predictions = cross_val_predict(xgb_model_selected, X_resampled, y_resampled, cv=5)

# Collect confusion matrices for each fold
conf_matrices = []
for train_index, test_index in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X_resampled, y_resampled):
    X_train_fold, X_test_fold = X_resampled[train_index], X_resampled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    xgb_model_selected.fit(X_train_fold, y_train_fold)
    y_pred_fold = xgb_model_selected.predict(X_test_fold)
    conf_matrices.append(confusion_matrix(y_test_fold, y_pred_fold))

# Print average metrics
average_accuracy = np.mean(cross_val_score(xgb_model_selected, X_resampled, y_resampled, cv=5, scoring='accuracy'))
average_precision = np.mean(cross_val_score(xgb_model_selected, X_resampled, y_resampled, cv=5, scoring='precision'))
average_recall = np.mean(cross_val_score(xgb_model_selected, X_resampled, y_resampled, cv=5, scoring='recall'))
average_f1 = np.mean(cross_val_score(xgb_model_selected, X_resampled, y_resampled, cv=5, scoring='f1'))

print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average Recall:", average_recall)
print("Average F1 Score:", average_f1)

# Print confusion matrices for each fold
for i, conf_matrix in enumerate(conf_matrices):
    print(f"\nConfusion Matrix for Fold {i+1}:")
    print(conf_matrix)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Average Accuracy: 0.9154597499425086
Average Precision: 0.8841604202920079
Average Recall: 0.9625857239447504
Average F1 Score: 0.9193898941093561

Confusion Matrix for Fold 1:
[[193  11]
 [  9 194]]

Confusion Matrix for Fold 2:
[[186  18]
 [  6 197]]

Confusion Matrix for Fold 3:
[[188  15]
 [  7 197]]

Confusion Matrix for Fold 4:
[[177  26]
 [  8 196]]

Confusion Matrix for Fold 5:
[[185  18]
 [  2 201]]


SVM with 5 fold cross validation and smote

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']

# 70-30 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize SVM model
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')  # Adjust hyperparameters as needed

# Perform 5-fold cross-validation with SMOTE
kf = KFold(n_splits=5, shuffle=True, random_state=16)  # Change the random state value here
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Apply SMOTE only to training data
    smote = SMOTE()
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Train SVM model
    svm_model.fit(X_train_resampled, y_train_resampled)

    # Predict on test data
    y_pred = svm_model.predict(X_test)

    # Evaluate SVM model
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))


Average Accuracy: 0.8428092318788316
Average Precision: 0.4237308429118774
Average Recall: 0.4348107688272912
Average F1 Score: 0.4266194467146344


after improving

In [24]:
import pandas as pd 
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import numpy as np

# Load data
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Features and target
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD',
                  'heart rate', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2',
                  'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium',
                  'Urea nitrogen', 'Anion gap', 'Lactic acid','Blood sodium','MCH','RBC','Creatine kinase','PCO2','NT-proBNP']]
y = cleaned_data['outcome']

# Scale features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=94)

# SVM classifier
svm_model = SVC()

# Hyperparameter grid
params = {'C': [0.1, 1, 10, 100, 1000, 10000],  # Expanded range
          'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],  # Expanded range
          'kernel': ['linear', 'rbf', 'poly']}

# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(svm_model, param_distributions=params, n_iter=100, scoring='accuracy', cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get cross-validation results for accuracy, precision, recall, and F1 score
cv_accuracy = cross_val_score(random_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring='accuracy')
cv_precision = cross_val_score(random_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring='precision')
cv_recall = cross_val_score(random_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring='recall')
cv_f1 = cross_val_score(random_search.best_estimator_, X_resampled, y_resampled, cv=5, scoring='f1')

# Calculate average metrics
average_accuracy = np.mean(cv_accuracy)
average_precision = np.mean(cv_precision)
average_recall = np.mean(cv_recall)
average_f1 = np.mean(cv_f1)

print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average Recall:", average_recall)
print("Average F1 Score:", average_f1)

# Define 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store confusion matrices
conf_matrices = []

# Perform cross-validation
for train_index, test_index in kf.split(X_resampled, y_resampled):
    X_train_fold, X_test_fold = X_resampled[train_index], X_resampled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]

    # Fit SVM model
    svm_model.fit(X_train_fold, y_train_fold)

    # Predictions
    y_pred_fold = svm_model.predict(X_test_fold)

    # Compute confusion matrix
    conf_matrix_fold = confusion_matrix(y_test_fold, y_pred_fold)
    conf_matrices.append(conf_matrix_fold)

    # Print confusion matrix for this fold
    print(f"\nConfusion Matrix for Fold {len(conf_matrices)}:")
    print(conf_matrix_fold)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Average Accuracy: 0.9783699059561128
Average Precision: 0.9839988274595853
Average Recall: 0.9724282816574906
Average F1 Score: 0.9781148134141148

Confusion Matrix for Fold 1:
[[160  44]
 [ 26 177]]

Confusion Matrix for Fold 2:
[[161  43]
 [ 25 178]]

Confusion Matrix for Fold 3:
[[170  33]
 [ 28 176]]

Confusion Matrix for Fold 4:
[[160  43]
 [ 37 167]]

Confusion Matrix for Fold 5:
[[158  45]
 [ 23 180]]


KNN with 5 fold cross validation and smote

In [41]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID', 'gendera'], axis=1).dropna(subset=['heart rate']).fillna(data.mean())

# Separate features and target variable
X = cleaned_data.drop(['outcome'], axis=1)
y = cleaned_data['outcome']

# 70-30 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Select top k features
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X_scaled, y)

# Define cross-validation strategy
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform cross-validation with SMOTE
for train_index, val_index in kf.split(X_selected, y):
    X_train_fold, X_val = X_selected[train_index], X_selected[val_index]
    y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]

    # Apply SMOTE to balance the classes only on the training fold
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_fold, y_train_fold)

    # Initialize KNeighborsClassifier with adjusted hyperparameters
    knn_model = KNeighborsClassifier(n_neighbors=15, weights='distance', algorithm='auto')

    # Train KNeighborsClassifier on the resampled training data
    knn_model.fit(X_train_resampled, y_train_resampled)

    # Predict on validation data
    y_pred_val = knn_model.predict(X_val)

    # Evaluate KNeighborsClassifier on validation data
    accuracy_scores.append(accuracy_score(y_val, y_pred_val))
    precision_scores.append(precision_score(y_val, y_pred_val))
    recall_scores.append(recall_score(y_val, y_pred_val))
    f1_scores.append(f1_score(y_val, y_pred_val))

# Calculate average scores
avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)
avg_f1 = sum(f1_scores) / len(f1_scores)

# Print average evaluation scores
print("\nAverage Evaluation Scores (5-Fold Cross Validation with SMOTE and K-Nearest Neighbors):")
print("Average Accuracy:", avg_accuracy)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)


Average Evaluation Scores (5-Fold Cross Validation with SMOTE and K-Nearest Neighbors):
Average Accuracy: 0.6822574828705373
Average Precision: 0.2274026922409127
Average Recall: 0.54375
Average F1 Score: 0.3196947744905255


after improving

In [25]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Load data
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID', 'gendera'], axis=1).dropna(subset=['heart rate']).fillna(data.mean())

# Separate features and target variable
X = cleaned_data.drop(['outcome'], axis=1)
y = cleaned_data['outcome']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting data into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=22)

# Define cross-validation strategy
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=73)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Define threshold
threshold = 0.9  # Adjust the threshold as needed

# Perform cross-validation with SMOTE
for i, (train_index, val_index) in enumerate(kf.split(X_train, y_train), 1):
    X_train_fold, X_val = X_train[train_index], X_train[val_index]
    y_train_fold, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Apply SMOTE to balance the classes only on the training data of each fold
    smote = SMOTE(random_state=72)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_fold, y_train_fold)

    # Initialize KNeighborsClassifier with adjusted hyperparameters
    knn_model = KNeighborsClassifier(n_neighbors=9, weights='distance', algorithm='auto')

    # Train KNeighborsClassifier
    knn_model.fit(X_train_resampled, y_train_resampled)

    # Predict probabilities on validation data
    y_prob_knn = knn_model.predict_proba(X_val)[:, 1]  # Probability of positive class

    # Apply threshold
    y_pred_knn = (y_prob_knn > threshold).astype(int)

    # Evaluate KNeighborsClassifier on validation data
    accuracy_scores.append(accuracy_score(y_val, y_pred_knn))
    precision_scores.append(precision_score(y_val, y_pred_knn))
    recall_scores.append(recall_score(y_val, y_pred_knn))
    f1_scores.append(f1_score(y_val, y_pred_knn))
    
    # Calculate confusion matrix for this fold
    conf_matrix = confusion_matrix(y_val, y_pred_knn)
    print(f"Confusion Matrix - Fold {i}:")
    print(conf_matrix)
    print()

# Calculate average scores
avg_accuracy = np.mean(accuracy_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Print average evaluation scores
print("\nAverage Evaluation Scores (5-Fold Cross Validation with SMOTE and K-Nearest Neighbors with Threshold):")
print("Average Accuracy:", avg_accuracy)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)

Confusion Matrix - Fold 1:
[[139   4]
 [ 18   4]]

Confusion Matrix - Fold 2:
[[139   4]
 [ 18   4]]

Confusion Matrix - Fold 3:
[[140   3]
 [ 19   3]]

Confusion Matrix - Fold 4:
[[139   3]
 [ 19   3]]

Confusion Matrix - Fold 5:
[[141   1]
 [ 16   6]]


Average Evaluation Scores (5-Fold Cross Validation with SMOTE and K-Nearest Neighbors with Threshold):
Average Accuracy: 0.8724390243902439
Average Precision: 0.5714285714285714
Average Recall: 0.18181818181818182
Average F1 Score: 0.27513957307060755


Random forest with 5 fold cross validation and smote

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Load data (replace path with your actual data location)
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID'], axis=1)
cleaned_data['gendera'] = data['gendera'] - 1
cleaned_data = cleaned_data.dropna(subset=['heart rate'])
cleaned_data = cleaned_data.fillna(cleaned_data.mean())

# Feature selection
X = cleaned_data[['age', 'hypertensive', 'Hyperlipemia', 'diabetes', 'Renal failure', 'COPD', 
                'heart rate','Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'SP O2', 
                'Urine output', 'MCV', 'RDW', 'Platelets', 'Lymphocyte', 'Blood calcium', 
                'Urea nitrogen', 'Anion gap', 'Lactic acid']]
y = cleaned_data['outcome']

# 70-30 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize Random Forest model with adjusted hyperparameters
rf_model = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=10, min_samples_leaf=5, random_state=42)

# Perform 5-fold cross-validation with SMOTE
kf = KFold(n_splits=5, shuffle=True, random_state=16)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Threshold for classification
threshold = 0.3

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Apply SMOTE only to training data
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest model
    rf_model.fit(X_train_resampled, y_train_resampled)

    # Predict probabilities on test data
    y_pred_prob = rf_model.predict_proba(X_test)[:, 1]
    
    # Apply threshold for classification
    y_pred = (y_pred_prob > threshold).astype(int)

    # Evaluate Random Forest model
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

# Print average scores
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average F1 Score:", np.mean(f1_scores))


Average Accuracy: 0.6847926433465561
Average Precision: 0.2694809739764413
Average Recall: 0.7799705965421937
Average F1 Score: 0.39964875071258044


after improving

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load data
data = pd.read_csv(r'C:\Users\harsh\OneDrive\Desktop\machine learning\machine learning\hospital dataset\data01.csv')

# Data preprocessing
cleaned_data = data.drop(['group', 'ID', 'gendera'], axis=1).dropna(subset=['heart rate']).fillna(data.mean())

# Separate features and target variable
X = cleaned_data.drop(['outcome'], axis=1)
y = cleaned_data['outcome']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform 70:30 split for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Define cross-validation strategy
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Define threshold
threshold = 0.6 # Adjust the threshold as needed

# Perform cross-validation with SMOTE
for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train), 1):
    X_train_cv, X_val = X_train[train_index], X_train[test_index]
    y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # Apply SMOTE to balance the classes only on the training data
    smote = SMOTE(random_state=84)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_cv, y_train_cv)

    # Initialize RandomForestClassifier with specified hyperparameters
    rf_model = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=93)

    # Train RandomForestClassifier
    rf_model.fit(X_train_resampled, y_train_resampled)

    # Predict probabilities on validation data
    y_prob_rf = rf_model.predict_proba(X_val)[:, 1]  # Probability of positive class

    # Apply threshold
    y_pred_rf = (y_prob_rf > threshold).astype(int)

    # Evaluate RandomForestClassifier
    accuracy_scores.append(accuracy_score(y_val, y_pred_rf))
    precision_scores.append(precision_score(y_val, y_pred_rf))
    recall_scores.append(recall_score(y_val, y_pred_rf))
    f1_scores.append(f1_score(y_val, y_pred_rf))
    
    # Calculate confusion matrix for this fold
    conf_matrix = confusion_matrix(y_val, y_pred_rf)
    print(f"Confusion Matrix - Fold {i}:")
    print(conf_matrix)
    print()

# Calculate average scores
avg_accuracy = np.mean(accuracy_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Print average evaluation scores
print("\nAverage Evaluation Scores (5-Fold Cross Validation with SMOTE and Random Forest with Threshold):")
print("Average Accuracy:", avg_accuracy)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)

Confusion Matrix - Fold 1:
[[145   0]
 [ 16   4]]

Confusion Matrix - Fold 2:
[[143   1]
 [ 18   3]]

Confusion Matrix - Fold 3:
[[143   1]
 [ 16   5]]

Confusion Matrix - Fold 4:
[[141   3]
 [ 19   1]]

Confusion Matrix - Fold 5:
[[143   1]
 [ 16   4]]


Average Evaluation Scores (5-Fold Cross Validation with SMOTE and Random Forest with Threshold):
Average Accuracy: 0.8894087213599409
Average Precision: 0.7266666666666668
Average Recall: 0.16619047619047617
Average F1 Score: 0.26940740740740743
