In [99]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score

## The Dataset
[Credit Card Approval Dataset](http://archive.ics.uci.edu/ml/datasets/credit+approval) from the UCI Machine Learning Repository.
The features of this dataset have been anonymized to protect privacy, but the probable features are <code>Gender</code>, <code>Age</code>, <code>Debt</code>, <code>Married</code>, <code>BankCustomer</code>, <code>EducationLevel</code>, <code>Ethnicity</code>, <code>YearsEmployed</code>, <code>PriorDefault</code>, <code>Employed</code>, <code>CreditScore</code>, <code>DriversLicense</code>, <code>Citizen</code>, <code>ZipCode</code>, <code>Income</code> and finally the <code>ApprovalStatus</code>, according to [this blog](http://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html).

### EDA


In [123]:
# The file ships without a header row, so header=None makes pandas number the columns.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Explore-AI/Public-Data/'
    '89fee4463f428f55d31a254924e18501a3c468c3/Data/classification_sprint/'
    'cc_approvals.data',
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [113]:
# The tail surfaces '?' placeholders (e.g. row 673, column 0) that stand in for missing values.
df.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


In [102]:
# Check the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [122]:

# The raw file marks missing entries with '?', which isna() alone does not detect.
# Count those placeholders together with real NaNs so this cell reports the same
# numbers whether it runs before or after the cleaning step; df is not modified.
missing_per_column = (df.isna() | df.eq('?')).sum()
for col, count in missing_per_column.items():
    print(col, count)

0 12
1 12
2 0
3 6
4 6
5 9
6 9
7 0
8 0
9 0
10 0
11 0
12 0
13 13
14 0
15 0


In [124]:
def data_cleaning(df, column_name):
    """Clean ``df`` in place and return the value counts of one column.

    Every '?' placeholder is replaced with NaN. Missing values in numeric
    columns are then filled with that column's mean, and missing values in
    all other columns with that column's most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column_name : hashable
        Label of the column to summarise. If that column is numeric it is
        cast to float before imputation.

    Returns
    -------
    list of int
        Occurrence counts of the distinct values found in ``column_name``
        after cleaning, largest count first.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # Replace '?' with NaN
    df.replace('?', np.nan, inplace=True)

    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
    if column_name not in non_numeric_cols:
        df[column_name] = df[column_name].astype(float)

    # Impute numeric columns with their own mean. Going column by column keeps
    # the object columns out of the mean computation, which would otherwise
    # fail when the frame still contains strings.
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())

    # Impute every other column with its most frequent value. The result is
    # assigned back explicitly: an in-place fillna on the df[col] selection is
    # not guaranteed to reach the parent frame.
    for col in non_numeric_cols:
        modes = df[col].mode()
        if not modes.empty:  # an all-NaN column has no mode; leave it as is
            df[col] = df[col].fillna(modes.iloc[0])

    # Count unique values in the specified column after cleaning
    return df[column_name].value_counts().tolist()


In [131]:
# Cleans df in place; the returned list holds the occurrence count of each distinct value in column 13.
data_cleaning(df,13)

[145,
 35,
 35,
 34,
 30,
 30,
 22,
 18,
 16,
 14,
 14,
 13,
 11,
 9,
 9,
 9,
 7,
 7,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [132]:
def preprocess_data(df, drop_columns=(11, 13), test_size=0.2, random_state=42):
    """Encode, scale and split a cleaned frame into train and test arrays.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data whose last column (after dropping ``drop_columns``) is
        the target.
    drop_columns : iterable of column labels, default (11, 13)
        Columns removed before modelling.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))`` as NumPy arrays, with the
        features scaled by ``MinMaxScaler``.

    Raises
    ------
    KeyError
        If any label in ``drop_columns`` is not a column of ``df``.
    """
    # Work on a copy so the label codes are never written back into the
    # caller's frame (earlier cells have displayed it and may be re-run).
    encoded = df.drop(columns=list(drop_columns)).copy()

    # Convert non-numeric data to numeric using LabelEncoder.
    # NOTE(review): every object column is label-encoded, including ones that
    # may hold numeric text - confirm that is intended.
    for column in encoded.select_dtypes(include=['object']):
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    # Every column except the last is a feature; the last column is the target.
    X = encoded.iloc[:, :-1].values
    y = encoded.iloc[:, -1].values

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling of the training features. Kept as is so
    # the returned values do not change; consider fitting on the training
    # split only.
    X_scaled = MinMaxScaler().fit_transform(X)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=test_size, random_state=random_state
    )

    return (X_train, y_train), (X_test, y_test)

# Unpack the (features, labels) pairs for the training and test splits.
(X_train, y_train), (X_test, y_test) = preprocess_data(df)


In [133]:

# Peek at the first row of each split, one array per line.
print(X_train[:1], y_train[:1], X_test[:1], y_test[:1], sep='\n')

[[1.         0.25862069 0.48214286 1.         1.         0.38461538
  0.25       0.         0.         0.         0.         0.
  0.        ]]
[1]
[[0.         0.20402299 0.05357143 0.5        0.         0.38461538
  0.25       0.         0.         1.         0.02985075 0.
  0.00105   ]]
[1]


In [134]:
def train_model(X_train,y_train):
    """Fit a logistic-regression classifier (lbfgs solver) on the training data and return it."""
    classifier = LogisticRegression(solver='lbfgs')
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(X_train, y_train)

In [135]:
lm = train_model(X_train, y_train)
# Show the first fitted intercept, then the coefficient array, one per line.
print(lm.intercept_[0], lm.coef_, sep='\n')

3.0489548322720865
[[ 0.10540556 -0.6212872   0.01130681  0.76133957  0.30195581 -0.2834454
  -0.49387636 -0.76575712 -3.43528863 -1.06426785 -0.82406864  0.04956249
  -1.35582238]]


In [136]:
def calculate_roc(model, X_test, y_test):
    """Return the ROC AUC of ``model`` on the given data.

    The score is computed from the second column of ``model.predict_proba``.
    """
    probabilities = model.predict_proba(X_test)
    return roc_auc_score(y_test, probabilities[:, 1])

In [138]:
# ROC AUC of the fitted model on the test split.
print(calculate_roc(lm,X_test,y_test))

0.8821428571428571


In [139]:
def metrics(model,x_test,y_test):
    """Score ``model`` on the supplied test data.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    x_test : array-like
        Feature rows to predict on.
    y_test : array-like
        True labels for ``x_test``.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)``.
    """
    # Predict on the argument that was passed in, not on a notebook global of
    # a similar name; otherwise the function ignores its own input.
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test,y_pred)
    return (accuracy, precision, recall, f1)

In [140]:
accuracy, precision, recall, f1 = metrics(lm, X_test, y_test)

# One score per line, each formatted with %f.
print(
    'Accuracy: %f' % accuracy,
    'Precision: %f' % precision,
    'Recall: %f' % recall,
    'F1 score: %f' % f1,
    sep='\n',
)

Accuracy: 0.826087
Precision: 0.854839
Recall: 0.779412
F1 score: 0.815385
