In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.preprocessing import StandardScaler
import shap
pd.set_option('future.no_silent_downcasting', True)
np.seterr(over='ignore')







{'divide': 'warn', 'over': 'ignore', 'under': 'ignore', 'invalid': 'warn'}

In [49]:
class CustomLogisticRegression:
    def __init__(self,weights, featureMatrix, labels, learningRate, epochs):
        self.weights = weights
        self.featureMatrix = featureMatrix
        self.labels = labels
        self.learningRate = learningRate
        self.epochs = epochs
    

    def fit(self):
        for i in range(self.epochs):
            # if(i%(self.epochs//10)==0):
            #     print("Epoch: ",i)
            self.weights = self.weights - self.learningRate * self.gradient()
        


    def gradient(self):
        
        featureMatrix = np.asarray(self.featureMatrix, dtype=np.float32)
        weights = np.asarray(self.weights, dtype=np.float32)
        labels = np.asarray(self.labels, dtype=np.float32).reshape(-1, 1)


        dotP = np.dot(featureMatrix, weights)  # (m, n) @ (n, 1) -> (m, 1)


        dotP = np.asarray(dotP, dtype=np.float32)

        # Compute denominator safely
        denom = 1 + np.exp(labels * dotP)  


        gradient = -np.dot(featureMatrix.T, labels / denom) / len(labels)  
        return gradient

    def accuracy(self,xTest,yTest):

        correct = 0
        outArray=[]
        for i in range(xTest.shape[0]):
            testY = self.predict(xTest[i])
            outArray.append(1 if testY>=0 else -1)
            if (testY) > 0:
                if yTest[i] == 1:
                    correct+=1
            else:
                if yTest[i] == -1:
                    correct+=1
        
        
        return correct/len(yTest)

    def predict(self, x):
        fM = np.hstack((x, np.ones((x.shape[0],1))))
        return np.sign(fM @ self.weights)
    
    def predict_proba(self, X):
        logits = X @ self.weights
        probs = 1 / (1 + np.exp(-logits))  # Sigmoid function
        return np.column_stack([1 - probs, probs])  # SHAP expects both class probabilities
    
    def __call__(self, x):
        return self.predict_proba(x)





In [93]:
df1 = pd.read_csv("Heart_Disease_Prediction.csv")
#print(backUpData.loc[1])
df1['Heart Disease'] = df1['Heart Disease'].replace(['Presence','Absence'],[1,-1])
backUpData = df1.copy()

df = df1
ratio = 0.9

df = df.sample(frac=1)
Train,Test = df.iloc[:int(ratio*len(df))],df.iloc[int(ratio*len(df)):]

xData = np.matrix(Train.iloc[:,:-1])
yData = (np.matrix(Train.iloc[:,-1])).T
xTest = np.matrix(Test.iloc[:,:-1])
yTest = (np.matrix(Test.iloc[:,-1])).T
featureMatrix = np.hstack((xData, np.ones((xData.shape[0],1))))
weights = np.random.random((featureMatrix.shape[1],1))#(1,14)

learningRate = 10**-6
dataNum = featureMatrix.shape[0]


print(f"feature: {featureMatrix.shape}, weights: {weights.shape}, labels: {yData.shape}, dataNum: {dataNum}")
myModel = CustomLogisticRegression(weights, featureMatrix,yData,learningRate,10000)
myModel.fit()
print("Accuracy: ",myModel.accuracy(xTest,yTest))

#Shapely values







feature: (243, 14), weights: (14, 1), labels: (243, 1), dataNum: 243
Accuracy:  0.5555555555555556


In [92]:
bigDF = pd.read_excel("LargerData.xlsx")
print(bigDF.head())


   PatientID    State     Sex GeneralHealth   AgeCategory  HeightInMeters  \
0          1  Alabama  Female          Fair  Age 75 to 79            1.63   
1          2  Alabama  Female     Very good  Age 65 to 69            1.60   
2          3  Alabama    Male     Excellent  Age 60 to 64            1.78   
3          4  Alabama    Male     Very good  Age 70 to 74            1.78   
4          5  Alabama  Female          Good  Age 50 to 54            1.68   

   WeightInKilograms        BMI  HadHeartAttack  HadAngina  ...  \
0          84.820000  32.099998               0          1  ...   
1          71.669998  27.990000               0          0  ...   
2          71.209999  22.530001               0          0  ...   
3          95.250000  30.129999               0          0  ...   
4          78.019997  27.760000               0          0  ...   

                             ECigaretteUsage  ChestScan  \
0  Never used e-cigarettes in my entire life          1   
1  Never used e-

# Beginning feature elimination
### Need to clean up the following columns: AgeCategory, Sex, State, SmokerStatus

In [94]:
dataSet = bigDF[['State','Sex','AgeCategory','HeightInMeters','WeightInKilograms',
                 'BMI','HadAngina','HadStroke','HadAsthma','HadSkinCancer','HadCOPD',
                 'HadDiabetes','DifficultyWalking','SmokerStatus','AlcoholDrinkers',
                 'HighRiskLastYear','HadHeartAttack']]


states = ['Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware',
                                             'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
                                             'Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska',
                                             'Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota','Ohio','Oklahoma',
                                             'Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas','Utah','Vermont','Virginia',
                                             'Washington','West Virginia','Wisconsin','Wyoming', "Puerto Rico", "District of Columbia", 'Guam', 'Virgin Islands']


dataSet['State'] = dataSet['State'].replace(states,list(range(len(states))))

ageCats = [
    'Age 65 to 69',     
'Age 60 to 64',
'Age 70 to 74',
'Age 55 to 59',
'Age 50 to 54',
'Age 75 to 79',
'Age 80 or older',
'Age 40 to 44',
'Age 45 to 49',
'Age 35 to 39',
'Age 30 to 34',
'Age 18 to 24',
'Age 25 to 29',
]
dataSet['AgeCategory'] = dataSet['AgeCategory'].replace(sorted(ageCats),list(range(len(ageCats))))

diab = [
    'No',
'Yes',
'No, pre-diabetes or borderline diabetes',
'Yes, but only during pregnancy (female)',
]

dataSet['HadDiabetes'] = dataSet['HadDiabetes'].replace(diab,[0,1,2,3])

smoke=[
    'Never smoked',
'Former smoker',
'Current smoker - now smokes every day',
'Current smoker - now smokes some days'
]

dataSet['SmokerStatus'] = dataSet['SmokerStatus'].replace(smoke,[0,1,2,3])
dataSet['Sex'] = dataSet['Sex'].replace(['Male','Female'],[0,1])
dataSet['HadHeartAttack'] = dataSet['HadHeartAttack'].replace([0,1],[-1,1])

scaler = StandardScaler()
numericalCols=['State','AgeCategory','HadDiabetes','SmokerStatus','HeightInMeters','WeightInKilograms','BMI']
dataSet[numericalCols] = scaler.fit_transform(dataSet[numericalCols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataSet['State'] = dataSet['State'].replace(states,list(range(len(states))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataSet['AgeCategory'] = dataSet['AgeCategory'].replace(sorted(ageCats),list(range(len(ageCats))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataSet['HadDiabetes'] = data

In [95]:
bigRatio=0.8
dataSet = dataSet.sample(frac=1)
partition = int(bigRatio*len(dataSet))
Train, Test = dataSet.iloc[:partition],dataSet.iloc[partition:]

bigxData = np.matrix(Train.iloc[:,:-1])
bigyData = np.matrix(Train.iloc[:,-1]).T
bigxTest = np.matrix(Test.iloc[:,:-1])
bigyTest = np.matrix(Test.iloc[:,-1]).T
bigFeature = np.hstack((bigxData, np.ones((bigxData.shape[0],1))))

bigdataNum = bigFeature.shape[0]


# model = LogisticRegression()
# model.fit(bigFeature,YTrain)



weights = np.random.random((bigFeature.shape[1],1))#(1,14)
print(f"feature: {bigFeature.shape}, weights: {weights.shape}, labels: {bigyData.shape}, dataNum: {bigdataNum}")
bigModel = CustomLogisticRegression(weights, bigFeature,bigyData,10**-2,1000)
bigModel.fit()
print("Accuracy: ",bigModel.accuracy(bigxTest,bigyTest))

feature: (190104, 17), weights: (17, 1), labels: (190104, 1), dataNum: 190104
Accuracy:  0.9454193494087447


# Feature Selection - Recursive Feature Eliminiation

In [100]:
class RFE:
    def __init__(self,dataF,learningRate,epochs):
        self.dataFrame = dataF
        self.LRate = learningRate
        self.epochs = epochs
    
    def prepareData(self,df):
            df = df.sample(frac=1)
            Train,Test = df.iloc[:int(ratio*len(df))],df.iloc[int(ratio*len(df)):]

            xData = np.matrix(Train.iloc[:,:-1])
            yData = (np.matrix(Train.iloc[:,-1])).T
            xTest = np.matrix(Test.iloc[:,:-1])
            yTest = (np.matrix(Test.iloc[:,-1])).T
            featureMatrix = np.hstack((xData, np.ones((xData.shape[0],1))))
            weights = np.random.random((featureMatrix.shape[1],1))#(1,14)
            return weights,featureMatrix,yData,xTest,yTest

    def conductRFE(self,requiredFeaturesNum):
        
        RFEDataSet = self.dataFrame
        columnDict=[]

        ratio = 0.8
        df=RFEDataSet
        for i,col in enumerate(df.columns):
            columnDict.append(col)

        for i in range(len(df.columns)-requiredFeaturesNum+1):
            df = df.sample(frac=1)
            Train,Test = df.iloc[:int(ratio*len(df))],df.iloc[int(ratio*len(df)):]

            xData = np.matrix(Train.iloc[:,:-1])
            yData = (np.matrix(Train.iloc[:,-1])).T
            xTest = np.matrix(Test.iloc[:,:-1])
            yTest = (np.matrix(Test.iloc[:,-1])).T
            featureMatrix = np.hstack((xData, np.ones((xData.shape[0],1))))
            weights = np.random.random((featureMatrix.shape[1],1))#(1,14)

            weights,featureMatrix,yData,xTest,yTest = self.prepareData(df)

            myModel = CustomLogisticRegression(weights, featureMatrix,yData,self.LRate,self.epochs)
            myModel.fit()
            print(f"Accuracy with {len(df.columns)} features: ",myModel.accuracy(xTest,yTest))
            flatW = np.array(myModel.weights.T).flatten().tolist()
            minW = 100
            minIndex= 0
            #LastIndex is bias let it be
            for i in range(len(flatW)-1):
                if abs(flatW[i])<minW:
                    minW = abs(flatW[i])
                    minIndex = i
            

            df.drop(columnDict[minIndex],axis=1,inplace=True)
            columnDict.pop(minIndex)
            

        print("Remaining Columns: ",df.columns)

        print("*"*30)
        weights,featureMatrix,yData,xTest,yTest = self.prepareData(df)
        newModel = CustomLogisticRegression(weights, featureMatrix,yData,self.LRate,self.epochs)
        newModel.fit()
        print("Accuracy after feature Selection: ",newModel.accuracy(xTest,yTest))






In [101]:
featureSelection = RFE(dataSet,10**-2,1000)
featureSelection.conductRFE(7)

Accuracy with 17 features:  0.9433152379750032
Accuracy with 16 features:  0.9444514581492236
Accuracy with 15 features:  0.9436939780330766
Accuracy with 14 features:  0.9426419223162059
Accuracy with 13 features:  0.9439885536338004
Accuracy with 12 features:  0.9451668560366957
Accuracy with 11 features:  0.944114800319825
Accuracy with 10 features:  0.9448722804359719
Accuracy with 9 features:  0.9452510204940454
Accuracy with 8 features:  0.9445356226065732
Accuracy with 7 features:  0.9424315111728317
Remaining Columns:  Index(['Sex', 'HadAngina', 'HadSkinCancer', 'DifficultyWalking',
       'AlcoholDrinkers', 'HadHeartAttack'],
      dtype='object')
******************************
Accuracy after feature Selection:  0.9453772671800699
