In [85]:
#import packages and libraries that will be needed
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
#The package by itself comes with a single module and an estimator.To install the module execute:pip install category_encoders
import category_encoders as ce
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV as LRCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#from functools import partial

In [86]:
#read data
Dataset = pd.read_csv("caesarian.csv")
Pred = pd.read_csv('caesarianPred.csv')
print(Pred)

   age  delivery_no  delivery_time  blood_pressure  heart_problem
0   21            0              0               0              0


# Part 1 - Data Preprocessing

*To replace the missing data with the recommended 
inputs we need to know if variable type is compatible 
with this imputation, hence check variable type*

In [87]:
print(Dataset.dtypes)

Unnamed: 0         int64
age               object
delivery_no       object
delivery_time      int64
blood_pressure     int64
heart_problem      int64
caesarian          int64
dtype: object


*The columns with missing values are of data type "Object" and it is recommended to replace missing values with mean and mode respectively, to make it easy, it is advisable to replace missing values with zeros since there are no zeros in these columns and convert the columns to "ints" to able to use the mean and mode function in pandas*

In [88]:
#Replacing missing values (-) with zeros(0)
Dataset['delivery_no'] = Dataset['delivery_no'].str.replace('-', '0')
Dataset['age'] = Dataset['age'].str.replace('-', '0')

In [89]:
#converting data type objects to int64
Dataset['age'] = Dataset['age'].astype('int64')
Dataset['delivery_no'] = Dataset['delivery_no'].astype('int64')

*Now, since the column, age is of the data type int64 it is easy to replace all zeros, which in actual fact are missing values with the mean value of the column, likewise the delivery_no column, zeros with the mode of the column*

In [90]:
mean_age = round(Dataset['age'].mean())

In [91]:
Dataset['age']=Dataset.age.mask(Dataset.age == 0,mean_age)

In [92]:
mode_delivery_no = round(Dataset['delivery_no'].mode())
Dataset['delivery_no']=Dataset.delivery_no.mask(Dataset.delivery_no == 0, mode_delivery_no)

In [93]:
#For whatever reason the above imputation seems to replace zeros with NaN, hence apply SimpleImputer to replace the NaN that get generated 
imp = SimpleImputer(missing_values= np.NaN, strategy='most_frequent')

#the fit and transform are a numpy attribute so use pandas.DataFrame to take it back to DataFrame 
imp.fit(Dataset)
Dataset = pd.DataFrame(imp.transform(Dataset))

#Replace the original headings with indeces numpy creates with DataFrame.columns
Dataset.columns = ['unit', 'age', 'delivery_no', 'delivery_time', 'blood_pressure', 'heart_problem', 'caesarian']
Dataset.head(10)

Unnamed: 0,unit,age,delivery_no,delivery_time,blood_pressure,heart_problem,caesarian
0,0.0,22.0,1.0,0.0,2.0,0.0,0.0
1,1.0,26.0,2.0,0.0,1.0,0.0,1.0
2,2.0,26.0,2.0,1.0,1.0,0.0,0.0
3,3.0,28.0,1.0,0.0,2.0,0.0,0.0
4,4.0,22.0,2.0,0.0,1.0,0.0,1.0
5,5.0,26.0,1.0,1.0,0.0,0.0,0.0
6,6.0,27.0,2.0,0.0,1.0,0.0,0.0
7,7.0,26.0,1.0,0.0,1.0,0.0,1.0
8,8.0,26.0,2.0,0.0,1.0,0.0,0.0
9,9.0,27.0,1.0,1.0,1.0,0.0,1.0


**Dividing the DataFrame to Response and Explanatory Variables**

In [94]:
#Dependant variable selection
Dependant = Dataset.loc[:,['caesarian']]

#Independent variable selection
Independent = Dataset.loc[:,['age', 'delivery_no', 'delivery_time', 'blood_pressure', 'heart_problem']]
print(Dependant.head(5))
print(Independent.head(5))

   caesarian
0        0.0
1        1.0
2        0.0
3        0.0
4        1.0
    age  delivery_no  delivery_time  blood_pressure  heart_problem
0  22.0          1.0            0.0             2.0            0.0
1  26.0          2.0            0.0             1.0            0.0
2  26.0          2.0            1.0             1.0            0.0
3  28.0          1.0            0.0             2.0            0.0
4  22.0          2.0            0.0             1.0            0.0


**ENCODING CATEGORICAL FEATURES**

*Encoding the categorical variables using "category_encoders", it follows the same API as sklearn’s preprocessors. They have some added conveniences, such as the ability to easily add an encoder to a pipeline. Additionally, the encoder returns a pandas DataFrame if a DataFrame is passed in. I am using this because it seems to follow the usual practice of other statistical software* 
[category_encoders](https://pypi.org/project/category_encoders/)

In [95]:
# instantiate an encoder - here we use Binary()
encod_val = ce.BinaryEncoder(cols = ['delivery_time', 'blood_pressure', 'heart_problem'])

# fit and transform
Independent = encod_val.fit_transform(Independent)
Pred = encod_val.fit_transform(Pred)
Pred

Unnamed: 0,delivery_time_0,blood_pressure_0,heart_problem_0,age,delivery_no
0,1,1,1,21,0


In [96]:
#Training set and Predicted set shape and dimension manipulations to make them equalin size
Diff_shape_p = set(Independent) - set(Pred)
Diff_shape_p = pd.DataFrame(data = np.zeros((Pred.shape[0], len(Diff_shape_p))), columns = list(Diff_shape_p))
Pred = Pred.join(Diff_shape_p)
Pred

Unnamed: 0,delivery_time_0,blood_pressure_0,heart_problem_0,age,delivery_no,blood_pressure_1,delivery_time_2,heart_problem_1,blood_pressure_2,delivery_time_1
0,1,1,1,21,0,0.0,0.0,0.0,0.0,0.0


**Standardization**

*Standardize the new features by removing the mean and scaling to unit variance*

In [97]:
standardize = preprocessing.StandardScaler().fit(Independent)
Independent = pd.DataFrame(standardize.transform(Independent))
Independent.columns = ['delivery_time_0', 'delivery_time_1', 'delivery_time_2',
                       'blood_pressure_0', 'blood_pressure_1', 'blood_pressure_2',
                       'heart_problem_0', 'heart_problem_1','age', 'delivery_no']
print(Independent.head(1))

#standardization for predicted dataset
Pred_standardize = preprocessing.StandardScaler().fit(Pred)
Pred = Pred_standardize.transform(Pred)
#Pred = pd.DataFrame(standardize.transform(Pred))
#Pred.columns = ['delivery_time_0', 'delivery_time_1', 'delivery_time_2',
                       #'blood_pressure_0', 'blood_pressure_1', 'blood_pressure_2',
                       #'heart_problem_0', 'heart_problem_1','age', 'delivery_no']
print(Pred)

   delivery_time_0  delivery_time_1  delivery_time_2  blood_pressure_0  \
0              0.0        -0.859727         0.519462               0.0   

   blood_pressure_1  blood_pressure_2  heart_problem_0  heart_problem_1  \
0         -1.732051               1.0        -0.774597         0.774597   

        age  delivery_no  
0 -1.186046    -0.697097  
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  # Remove the CWD from sys.path while we load stuff.


**Creating Training and  Testing Set**

In [98]:
#Split the Data into Training and Testing Set 70/30 respectively 
Indep_train, Indep_test, Dep_train, Dep_test = train_test_split(Independent, Dependant, test_size=0.30, random_state=0)

## Logistic Regression - L2 with 100 regularization coefficients and 3-fold cross-validation 

In [100]:
# Train logistic regression and cross validation
Logit_RegCV_L2 = LRCV(Cs=10, cv=10, penalty='l2', random_state=0, multi_class='multinomial', solver='lbfgs', max_iter=1000, n_jobs=-1)
Logit_RegCV_L2.fit(Independent, Dependant)
print('1/alpha = ', Logit_RegCV_L2.C_) 
print('coef = ', Logit_RegCV_L2.coef_)
print('accuracy = ', Logit_RegCV_L2.score(Independent, Dependant))
print('Predicted outcome using Accuracy Scoring function:', Logit_RegCV_L2.predict(Pred))

1/alpha =  [0.00599484]
coef =  [[ 0.         -0.03491522  0.01640852  0.         -0.01972001  0.05454202
   0.05363667 -0.05363667  0.0112653   0.01791942]]
accuracy =  0.65
Predicted outcome using Accuracy Scoring function: [1.]


  y = column_or_1d(y, warn=True)


**Train a model where the regularization coefficient is determined using the
accuracy as the scoring function**

In [20]:
#generate the predicted Y based on the model
y_pred = Logit_RegCV_L2.predict(Independent)

#Creating Accuracy Scoring Function
accuracy = accuracy_score(Dependant, y_pred)
N_accuracy = accuracy_score(Dependant, y_pred, normalize = False)
print('accuracy without normalization: ', N_accuracy)
print('accuracy with normalization: ', accuracy)

#Traning Logistic Regression Model with Cross Validation using Accuracy Scoring Function
Logit_RegCV_L2_ac = LRCV(Cs=10, cv=10, penalty='l2', 
                                      scoring= 'accuracy', random_state=0, 
                                      multi_class='multinomial', solver='lbfgs', 
                                      max_iter=1000, n_jobs=-1)

#fitting the just trained model
Logit_RegCV_L2_ac.fit(Independent, Dependant)
Acc_Scores = Logit_RegCV_L2_ac.scores_
print('1/alpha = ', Logit_RegCV_L2_ac.C_) 
print('coef = ', Logit_RegCV_L2_ac.coef_)
print('accuracy = ', Logit_RegCV_L2_ac.score(Independent,Dependant))
print('average predicted accuracy', np.array(list(Acc_Scores.values())).mean())
print('Expected value for thie model', EV_score(Logit_RegCV_L2_ac, Independent,Dependant))

accuracy without normalization:  52
accuracy with normalization:  0.65


  y = column_or_1d(y, warn=True)


1/alpha =  [0.00599484]
coef =  [[ 0.         -0.03491522  0.01640852  0.         -0.01972001  0.05454202
   0.05363667 -0.05363667  0.0112653   0.01791942]]
accuracy =  0.65
average predicted accuracy 0.6118055555555554
Expected value for thie model -76.24999999999999




**Train another model where the regularization coefficient is determined using the expected value as the scoring function**

In [101]:
#Creating Expected value scoring function
def EV_score(estimator, X, Y):
    EV_y_pred = estimator.predict(X)
    EV_Confusion_Matrix = confusion_matrix(EV_y_pred, Y)
    Prob_TN, Prob_FP, Prob_FN, Prob_TP = EV_Confusion_Matrix.ravel()/np.sum(EV_Confusion_Matrix)
    Expected_Value = 200*Prob_TP - 400*Prob_TN - 450*Prob_FN
    return Expected_Value


#Traning Logistic Regression Model with Cross Validation Using Expected Value Scoring Function
Logit_RegCV_L2_EV = LRCV(Cs=10, cv=10, penalty='l2', 
                                      scoring= EV_score, random_state=0, 
                                      multi_class='multinomial', solver='lbfgs', 
                                      max_iter=1000, n_jobs=-1)

#fitting the just trained model
Logit_RegCV_L2_EV.fit(Independent, Dependant)
EV_scores = Logit_RegCV_L2_EV.scores_
print('average predicted Expected Value', np.array(list(EV_scores.values())).mean())
print('coef = ', Logit_RegCV_L2_EV.coef_)
#print('accuracy = ', Logit_RegCV_L2_EV.score(Independent,Dependant))
print('Expected value for thie model', EV_score(Logit_RegCV_L2_EV, Independent,Dependant))
print('Predicted outcome using Expected value Scoring function:', Logit_RegCV_L2.predict(Pred))

average predicted Expected Value -91.91865079365074
coef =  [[ 0.         -0.00071508  0.00043007  0.         -0.0005709   0.00119289
   0.00138122 -0.00138122  0.00034953  0.00046433]]
Expected value for thie model -76.25000000000001
Predicted outcome using Expected value Scoring function: [1.]


  y = column_or_1d(y, warn=True)
