# Load the Libraries

In [28]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
import warnings
warnings.filterwarnings("ignore")
np.random.seed(1)

# Data Load

In [29]:
X_train = pd.read_csv("C:/Users/mssur/Downloads/New folder/df_Caesarian_train_X.csv")
X_test = pd.read_csv("C:/Users/mssur/Downloads/New folder/df_Caesarian_test_X.csv")
y_train = pd.read_csv("C:/Users/mssur/Downloads/New folder/df_Caesarian_train_y.csv")
y_test = pd.read_csv("C:/Users/mssur/Downloads/New folder/df_Caesarian_test_y.csv")

In [30]:
X_train.head()

Unnamed: 0,Age,Delivey No,Delivery No_Premature,Delivery No_Timely,Blood of Pressure_Low,Blood of Pressure_Normal,Heart Problem_inept
0,35,2,0,1,0,1,0
1,30,3,0,0,0,0,0
2,28,3,0,1,0,0,0
3,19,1,0,1,0,1,0
4,17,1,0,1,1,0,0


# Model the data

In [31]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

# Logistic Regression model

In [32]:
Logistic_reg_model = LogisticRegression(max_iter=900)
_ = Logistic_reg_model.fit(X_train, np.ravel(y_train))

In [33]:
out_pred = Logistic_reg_model.predict(X_test)
c_matrix = confusion_matrix(y_test, out_pred)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"default logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
print(TN,TP,FN,FP)

6 11 0 11


# Testing with the Support Vector Classifier Linear Kernel

In [34]:
svc_linearkernel=SVC(kernel="linear").fit(X_train,np.ravel(y_train))
svc_linearkernel_out=svc_linearkernel.predict(X_test)

In [35]:
c_matrix = confusion_matrix(y_test, svc_linearkernel_out)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])]) 
print(TN,TP,FN,FP)

6 6 5 11


# RBF Kernal

In [36]:
rbf_svc_model = SVC(kernel="rbf", C=10, gamma=0.01).fit(X_train,np.ravel(y_train))
rbf_svc_model_out=rbf_svc_model.predict(X_test)

In [37]:
c_matrix = confusion_matrix(y_test, rbf_svc_model_out)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf kernal svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]}, index=[0])])
print(TN,TP,FN,FP)

9 7 4 8


# Poly Kernal

In [38]:
poly_kernal_svc = SVC(kernel="poly", degree=3, coef0=1, C=10,gamma=0.01).fit(X_train,np.ravel(y_train))
poly_kernal_svc_out=poly_kernal_svc.predict(X_test)

In [39]:
c_matrix = confusion_matrix(y_test, poly_kernal_svc_out)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly kernal svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
  }, index=[0])])
print(TN,TP,FN,FP)

10 7 4 7


# Fitting with the decision trees

In [40]:
decision_tree = DecisionTreeClassifier().fit(X_train, np.ravel(y_train))
decision_tree_out=decision_tree.predict(X_test)

In [41]:
c_matrix = confusion_matrix(y_test, decision_tree_out)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision Tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                 }, index=[0])])

print(TN,TP,FN,FP)   

9 9 2 8


In [42]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.607143,0.5,1.0,0.666667
0,linear svm,0.428571,0.352941,0.545455,0.428571
0,rbf kernal svm,0.571429,0.466667,0.636364,0.538462
0,poly kernal svm,0.607143,0.5,0.636364,0.56
0,Decision Tree,0.642857,0.529412,0.818182,0.642857


In [43]:
performance.sort_values(['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.428571,0.352941,0.545455,0.428571
0,rbf kernal svm,0.571429,0.466667,0.636364,0.538462
0,default logistic,0.607143,0.5,1.0,0.666667
0,poly kernal svm,0.607143,0.5,0.636364,0.56
0,Decision Tree,0.642857,0.529412,0.818182,0.642857


In [44]:
performance.sort_values(['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.428571,0.352941,0.545455,0.428571
0,rbf kernal svm,0.571429,0.466667,0.636364,0.538462
0,default logistic,0.607143,0.5,1.0,0.666667
0,poly kernal svm,0.607143,0.5,0.636364,0.56
0,Decision Tree,0.642857,0.529412,0.818182,0.642857


In [45]:
performance.sort_values(['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.428571,0.352941,0.545455,0.428571
0,rbf kernal svm,0.571429,0.466667,0.636364,0.538462
0,poly kernal svm,0.607143,0.5,0.636364,0.56
0,Decision Tree,0.642857,0.529412,0.818182,0.642857
0,default logistic,0.607143,0.5,1.0,0.666667


In [46]:
performance.sort_values(['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.428571,0.352941,0.545455,0.428571
0,rbf kernal svm,0.571429,0.466667,0.636364,0.538462
0,poly kernal svm,0.607143,0.5,0.636364,0.56
0,Decision Tree,0.642857,0.529412,0.818182,0.642857
0,default logistic,0.607143,0.5,1.0,0.666667


# Cost Benefit analysis and the best metric

## There are several metrics to evaluate the binary classification problem, however as per the class material, we will mainly concentrate on the following metrics they are Accuracy, Precision, Recall, and F1 score.

### Accuracy: Accuracy simply measures how often the classifier correctly predicts. 


### Precision: Precision measures the proportion of positive predictions that are correct.

### Recall: Recall measures the proportion of actual positives that are correctly identified by the model.

### F1 Score: F1 score is the harmonic mean of precision and recall, providing a balanced measure of the model's performance

## In order to consider the best metric for my model I need to analyze the FP and FN scenarios

### False Positive: The model indicates a positive prediction of Caesarean delivery, while the actual label is a normal delivery (negative). In this instance, the model made a falsely positive forecast, which could have an impact on how doctors make decisions as it might result in unneeded medical intervention.

### False Negative: The model indicates a normal delivery (negative), however, the actual label is a Caesarean birth (positive). In this instance, the model made an inaccurate negative prediction, which may have consequences for medical decision-making since it may result in a failure to recognize a potential risk factor that calls for medical intervention.

## So there are both pros and cons as mentioned above so I'm eliminating both precision and recall for considering them as the best metric now I'm left with the remaining Accuracy and F1 scores in these two let us analyze the scenarios

## Firstly, we have addressed the data imbalance so I can consider accuracy is also the best metric 

## From the above output we can clearly see that the F1 score is highest when compared to the accuracy since we are leveraging the cost for both precision and recall also having the F1 score is having the highest value so I'm considering the F1 as the best metric for my dataset

## Best Metric for my model is F1 Score: 0.666667

# Hyper parameter tuning 

## Random Search CV using Logistic regression by f1 score

In [84]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
    'penalty' : ['l1','l2','elasticnet'], 
    'C'       : np.logspace(-5,5,15),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

logistic_random = LogisticRegression()
randomsearch_logistic = RandomizedSearchCV(estimator = logistic_random, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = randomsearch_logistic.fit(X_train, y_train)

print(f"The best {score_measure} score is {randomsearch_logistic.best_score_}")
print(f"... with parameters: {randomsearch_logistic.best_params_}")

f1_out = randomsearch_logistic.best_estimator_

Fitting 5 folds for each of 135 candidates, totalling 675 fits
The best make_scorer(f1_score, average=macro) score is 0.6124100899100899
... with parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.19306977288832497}


##  Using Grid search CV

In [80]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-5,15,15),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

logistic_grid_Cv= LogisticRegression()
gridsearch_logistic = GridSearchCV(estimator = logistic_grid_Cv, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = gridsearch_logistic.fit(X_train, y_train)

print(f"The best {score_measure} score is {gridsearch_logistic.best_score_}")
print(f"... with parameters: {gridsearch_logistic.best_params_}")

f1_out_grid = gridsearch_logistic.best_estimator_

Fitting 5 folds for each of 90 candidates, totalling 450 fits
The best make_scorer(f1_score, average=macro) score is 0.6124100899100899
... with parameters: {'C': 0.19306977288832497, 'penalty': 'l2', 'solver': 'newton-cg'}


## Random Search CV using by f1 score for Linear Kernel SVM

In [91]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
     'C': [0.001, 0.01, 0.1,1, 10, 100, 1000, 10000],
      'gamma': [1,0.1,0.01,0.001],
      'kernel':['linear']
}

svm_linear = SVC(max_iter=9000)
svm_linear_randomsearch = RandomizedSearchCV(estimator = svm_linear, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = svm_linear_randomsearch.fit(X_train, y_train)

print(f"The best {score_measure} score is {svm_linear_randomsearch.best_score_}")
print(f"... with parameters: {svm_linear_randomsearch.best_params_}")

svm_linear_out = svm_linear_randomsearch.best_estimator_

Fitting 5 folds for each of 32 candidates, totalling 160 fits
The best make_scorer(f1_score, average=macro) score is 0.6014685314685313
... with parameters: {'kernel': 'linear', 'gamma': 1, 'C': 1}


## Random Search CV using by f1 score for Linear Kernel SVM

In [92]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
     #'C': [0.1,1, 10, 100], 
    'C': [0.001, 0.01, 0.1,1, 10, 100, 1000, 10000],
     'gamma': [1,0.1,0.01,0.001],
    'kernel': ['linear']
}

svm_linear = SVC(max_iter=9000)
svm_linear_gridsearch = GridSearchCV(estimator = svm_linear, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = svm_linear_gridsearch.fit(X_train, y_train)

print(f"The best {score_measure} score is {svm_linear_gridsearch.best_score_}")
print(f"... with parameters: {svm_linear_gridsearch.best_params_}")

svm_linear_out_randomsearch = svm_linear_gridsearch.best_estimator_

Fitting 5 folds for each of 32 candidates, totalling 160 fits
The best make_scorer(f1_score, average=macro) score is 0.6014685314685313
... with parameters: {'C': 1, 'gamma': 1, 'kernel': 'linear'}


# Random Search CV using by f1 score for SVC RBF Kernel

In [93]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
     'degree':[2,3,4],
    'kernel': ['rbf'],
}

rbf_svc_kernel = SVC()
random_rbf_svc_kernel = RandomizedSearchCV(estimator = rbf_svc_kernel, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_rbf_svc_kernel.fit(X_train, y_train)

print(f"The best {score_measure} score is {random_rbf_svc_kernel.best_score_}")
print(f"... with parameters: {random_rbf_svc_kernel.best_params_}")

random_rbf_svc_kernel_out = random_rbf_svc_kernel.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
The best make_scorer(f1_score, average=macro) score is 0.7330019980019981
... with parameters: {'kernel': 'rbf', 'gamma': 1, 'degree': 2, 'C': 10}


# Grid Search CV using by f1 score for SVC RBF Kernel

In [94]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
     'C': [1,10,20, 100], 
     'gamma': [1,0.1,0.01,0.001],
     'degree':[1,2,3,4,6],
    'kernel': ['rbf'],
}

rbf_svc_kernel = SVC()
grid_rbf_svc_kernel = GridSearchCV(estimator = rbf_svc_kernel, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_rbf_svc_kernel.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_rbf_svc_kernel.best_score_}")
print(f"... with parameters: {grid_rbf_svc_kernel.best_params_}")

grid_rbf_svc_kernel_out = grid_rbf_svc_kernel.best_estimator_

Fitting 5 folds for each of 80 candidates, totalling 400 fits
The best make_scorer(f1_score, average=macro) score is 0.7330019980019981
... with parameters: {'C': 10, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}


# Random Search CV using by f1 score for svc poly kernel

In [95]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'degree':[2,3,4],
    'kernel': ['poly']
}

random_svc_poly = SVC(max_iter=30000)
random_search_svc_poly = RandomizedSearchCV(estimator = random_svc_poly, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search_svc_poly.fit(X_train, y_train)

print(f"The best {score_measure} score is {random_search_svc_poly.best_score_}")
print(f"... with parameters: {random_search_svc_poly.best_params_}")

random_search_svc_poly_out = random_search_svc_poly.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
The best make_scorer(f1_score, average=macro) score is 0.6460594960594961
... with parameters: {'kernel': 'poly', 'gamma': 0.1, 'degree': 3, 'C': 0.1}


# Grid Search CV using by f1 score for svc poly kernel

In [96]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'degree':[2,3,4],
    'kernel': ['poly']
}

grid_svm_poly = SVC(max_iter=30000)
grid_search_svc_poly = GridSearchCV(estimator = grid_svm_poly, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search_svc_poly.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search_svc_poly.best_score_}")
print(f"... with parameters: {grid_search_svc_poly.best_params_}")

grid_search_svc_poly_out = grid_search_svc_poly.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
The best make_scorer(f1_score, average=macro) score is 0.6460594960594961
... with parameters: {'C': 0.1, 'degree': 3, 'gamma': 0.1, 'kernel': 'poly'}


# Random search cv for Decision Trees

In [97]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

decisiontree_random = DecisionTreeClassifier()
decisiontree_random_search = RandomizedSearchCV(estimator = decisiontree_random, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = decisiontree_random_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {decisiontree_random_search.best_score_}")
print(f"... with parameters: {decisiontree_random_search.best_params_}")

decisiontree_random_search_out = decisiontree_random_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best make_scorer(f1_score, average=macro) score is 0.6584242881301704
... with parameters: {'min_samples_split': 50, 'min_samples_leaf': 23, 'min_impurity_decrease': 0.0076, 'max_leaf_nodes': 61, 'max_depth': 21, 'criterion': 'entropy'}


# Grid search cv for Decision Trees

In [98]:
score_measure = make_scorer(f1_score , average='macro')
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,10),  
    'min_samples_leaf': np.arange(1,10),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 10), 
    'max_depth': np.arange(1,10), 
    'criterion': ['entropy', 'gini'],
}

grid_dt = DecisionTreeClassifier()
decisiontrees_grid_search = GridSearchCV(estimator = grid_dt, param_grid=param_grid, cv=kfolds,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = decisiontrees_grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {decisiontrees_grid_search.best_score_}")
print(f"... with parameters: {decisiontrees_grid_search.best_params_}")

decisiontrees_grid_search_out = decisiontrees_grid_search.best_estimator_

Fitting 5 folds for each of 145800 candidates, totalling 729000 fits
The best make_scorer(f1_score, average=macro) score is 0.6774281274281274
... with parameters: {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 7, 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 1, 'min_samples_split': 1}


## Among all the models which I have considered the best optimized metric that i found is SVM RBG kernel which gave the 73.33% f1 score
## So hence F1 score for the RBF kerenel is the best model and savig this into a pickel file for the future use as questioned.

# Saving this model for future use (In the question they mentioned)

In [99]:
import pickle

# save model
pickle.dump(random_rbf_svc_kernel_out, open('E:/Spring-23/DSP/Assignmnet1/assignment1_svm_best.pickle', "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))