## Importing required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

## Code for Logistic Regression

In [2]:
class LogisticRegression:
    # Intializing learning rate and number of iterations
    def __init__(self, learning_rate=0.001, iterations=1000):
        self.learn_rate = learning_rate
        self.iterations = iterations
        self.weights = None
        self.bias = None
    
    # Intializing weights and bias
    def fit(self, X, y):
        num_of_samples, num_of_features = X.shape
        self.weights = np.zeros(num_of_features)
        self.bias = 0
        print(self.weights)
    
    # Intiating sigmoid function for logistic regression
    # Gradient Descent algorithm
        for _ in range(self.iterations):
            # creating a mathematical function for logistic regression
            
            function = np.dot(X, self.weights) + self.bias
            
            #predicting y value using sigmoid function
            y_predicted = self._sigmoid(function)

            # Computing values for dw,db 
            dw = (1 / num_of_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / num_of_samples) * np.sum(y_predicted - y)
            
            # Updating weights and bias after every itertaion
            self.weights -= self.learn_rate * dw
            self.bias -= self.learn_rate * db
            
    # Predicting the labels for training set
    def predict(self, X):
        predicting = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(predicting)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        
        return np.array(y_predicted_cls)
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))    

### Reading the data set,cleaning and applying it to Logistic Regression class

In [3]:
if __name__ == "__main__":
    
    from sklearn.model_selection import train_test_split
    from sklearn import datasets

    
    # Reading the data
    df=pd.read_csv(r"/Users/shyam/Downloads/Credit_card.csv")
    
    # Changing column names as required
    df.rename(columns= {'Unnamed: 0':'ID','X1':'credit_given','X2':'Gender','X3':'EDUCATION','X4':'Marital_Status','X5':'AGE',
                    'X6':'september_payment_status','X7':'august_payment_status','X8':'july_payment_status',
                    'X9':'june_payment_status','X10':"may_payment_status",'X11':'april_payment_status',
                    'X12':'september_bill','X13':'august_bill','X14':'july_bill','X15':'june_bill',
                    'X16':'may_bill','X17':'april_bill','X18':'amount_paid_september','X19':'amount_paid_august',
                    'X20':'amount_paid_july','X21':'amount_paid_june','X22':'amount_paid_may',
                    'X23':'amount_paid_april'},inplace=True)
    
    # Dropping duplicate column names
    df.drop(index=df.index[0],axis=0,inplace=True)
    
    # Rearraging the index values for dataset
    df.index=df.index-1
    
    # There are extra values in some columns which are not mentioned in the question. So I am changing them to other category
    df['Marital_Status']=df['Marital_Status'].apply(lambda x: '3' if (x == '0')  else x)
    df['EDUCATION']=df['EDUCATION'].apply(lambda x: '4' if (x == '0' or x=='5' or x=='6')  else x)
    
    # Converting data frame dtype to integer type values
    df=df.astype(int)
    
    # Selecting features
    x1=df[['ID', 'credit_given', 'Gender', 'EDUCATION', 'Marital_Status', 'AGE',
       'september_payment_status', 'august_payment_status',
       'july_payment_status', 'june_payment_status', 'may_payment_status',
       'april_payment_status', 'september_bill', 'august_bill', 'july_bill',
       'june_bill', 'may_bill', 'april_bill', 'amount_paid_september',
       'amount_paid_august', 'amount_paid_july', 'amount_paid_june',
       'amount_paid_may', 'amount_paid_april']].to_numpy()
    
    # Selecting Label Column
    y=df['Y']
    X_train1, X_test1, y_train1, y_test1 = train_test_split(x1, y, test_size=0.2, random_state=42)
    
    # Calling the model
    regressor1 = LogisticRegression(learning_rate=0.001,iterations=1000)
    regressor1.fit(X_train1, y_train1)
    predictions1 = regressor1.predict(X_test1)
    
    # Using confusion matrix for true positives and false positives to find P value
    from sklearn.metrics import confusion_matrix
    matrix=confusion_matrix(predictions1,y_test1,labels=[0,1])
    cm=pd.DataFrame(matrix,index=['0','1'],columns=['0','1'])
    print("Output for all features is")
    print(" ")
    print("Confusion Matrix for all features is\n",cm)
    true_positives=matrix[0,0]
    true_negatives=matrix[1,1]
    print("Classification Report",classification_report(y_test1,predictions1,target_names=['0','1']))
    p=(((true_positives/4687)+(true_negatives/1313))/2)*100
    print("The value for P using all features is:",p)
    
    # Finding accuracy of our model
    def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy      
    print("Classification Accuracy of model using all features is:", accuracy(y_test1, predictions1)*100, "%")
    print(" ")
    print(" Output for selected features")
    print(" ")
    
    
    
    
    
    # Calling the model using selected features.
    x2=df[['september_payment_status', 'august_payment_status',
       'july_payment_status', 'june_payment_status', 'may_payment_status',
       'april_payment_status']].to_numpy()
    
    X_train2, X_test2, y_train2, y_test2 = train_test_split(x2, y, test_size=0.2, random_state=42)
    # Calling the model
    regressor2 = LogisticRegression(learning_rate=0.01,iterations=1000)
    regressor2.fit(X_train2, y_train2)
    predictions2 = regressor2.predict(X_test2)
    matrix2=confusion_matrix(predictions2,y_test2,labels=[0,1])
    cm2=pd.DataFrame(matrix2,index=['0','1'],columns=['0','1'])
    print("Confusion Matrix for Logistic Regression using selected features  is\n",cm2)
    #print("Classification Report",classification_report(y_test2,predictions2,target_names=['0','1']))
    true_positives=matrix2[0,0]
    true_negatives=matrix2[1,1]
    tp=(true_positives/4687)*100
    tn=(true_negatives/1313)*100
    p=(tp/2+tn/2)
    print("The value for P using all features is:",p) 
    print("Classification Accuracy of model using all features is:", accuracy(y_test2, predictions2)*100,"%")
    
    

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Output for all features is
 
Confusion Matrix for all features is
       0     1
0  4643  1276
1    44    37
Classification Report               precision    recall  f1-score   support

           0       0.78      0.99      0.88      4687
           1       0.46      0.03      0.05      1313

    accuracy                           0.78      6000
   macro avg       0.62      0.51      0.46      6000
weighted avg       0.71      0.78      0.70      6000

The value for P using all features is: 50.93960365165532
Classification Accuracy of model using all features is: 78.0 %
 
 Output for selected features
 
[0. 0. 0. 0. 0. 0.]
Confusion Matrix for Logistic Regression using selected features  is
       0     1
0  4543  1008
1   144   305
The value for P using all features is: 60.07845914328348
Classification Accuracy of model using all features is: 80.80000000000001 %
